diff --git a/CODEOWNERS b/CODEOWNERS index 5fc20409c276c87843fad370fe7ba726ef762970..2410b3ebc09b225c6af8f461cd55f0d6e7a0f529 100644 --- a/CODEOWNERS +++ b/CODEOWNERS @@ -7,7 +7,7 @@ # top-level repo folders -/.github/ @jeffra @mrwyattii +/.github/ @jeffra @mrwyattii @loadams /azure/ @jeffra @awan-10 /benchmarks/ @jeffra @awan-10 @mrwyattii @molly-smith /bin/ @jeffra diff --git a/DeepSpeedExamples b/DeepSpeedExamples deleted file mode 160000 index 20ea07a2a069696abec212e25476a9bf76aced70..0000000000000000000000000000000000000000 --- a/DeepSpeedExamples +++ /dev/null @@ -1 +0,0 @@ -Subproject commit 20ea07a2a069696abec212e25476a9bf76aced70 diff --git a/LICENSE b/LICENSE index 9e841e7a26e4eb057b24511e7b92d42b257a80e5..261eeb9e9f8b2b4b0d119366dda99c6fd7d35c64 100644 --- a/LICENSE +++ b/LICENSE @@ -1,21 +1,201 @@ - MIT License - - Copyright (c) Microsoft Corporation. - - Permission is hereby granted, free of charge, to any person obtaining a copy - of this software and associated documentation files (the "Software"), to deal - in the Software without restriction, including without limitation the rights - to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - copies of the Software, and to permit persons to whom the Software is - furnished to do so, subject to the following conditions: - - The above copyright notice and this permission notice shall be included in all - copies or substantial portions of the Software. - - THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - SOFTWARE + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. 
+ + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. 
You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. 
In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. diff --git a/README.md b/README.md index bfa03a6e8c9a8d2ead26e306d56d57cf42c67647..e801daba4dbf6fc8bf55fb4ca10ed918e35d251b 100755 --- a/README.md +++ b/README.md @@ -1,7 +1,8 @@ -[![License MIT](https://badgen.net/badge/license/MIT/blue)](https://github.com/Microsoft/DeepSpeed/blob/master/LICENSE) +[![License Apache 2.0](https://badgen.net/badge/license/apache2.0/blue)](https://github.com/Microsoft/DeepSpeed/blob/master/LICENSE) [![PyPI version](https://badge.fury.io/py/deepspeed.svg)](https://pypi.org/project/deepspeed/) [![Downloads](https://pepy.tech/badge/deepspeed)](https://pepy.tech/project/deepspeed) [![Build](https://badgen.net/badge/build/check-status/blue)](#build-pipeline-status) +[![Twitter](https://img.shields.io/twitter/follow/MSFTDeepSpeed)](https://twitter.com/intent/follow?screen_name=MSFTDeepSpeed)
@@ -10,9 +11,11 @@
## Latest News - DeepSpeed trained the world's most powerful language models ([MT-530B](https://www.microsoft.com/en-us/research/blog/using-deepspeed-and-megatron-to-train-megatron-turing-nlg-530b-the-worlds-largest-and-most-powerful-generative-language-model/), [BLOOM](https://huggingface.co/blog/bloom-megatron-deepspeed)); [learn how](https://www.deepspeed.ai/tutorials/large-models-w-deepspeed/). + DeepSpeed empowers ChatGPT-like model training with a single click, offering 15x speedup over SOTA RLHF systems with unprecedented cost reduction at all scales; [learn how](https://github.com/microsoft/DeepSpeed/tree/master/blogs/deepspeed-chat). -* [2023/02] [Automatic Tensor Parallelism: Enables tensor parallelism by default without providing an injection policy](https://www.deepspeed.ai/tutorials/automatic-tensor-parallelism/) +* ***[2023/04] 🚀 [DeepSpeed Chat: Easy, Fast and Affordable RLHF Training of ChatGPT-like Models at All Scales](https://github.com/microsoft/DeepSpeed/tree/master/blogs/deepspeed-chat)*** [[English](https://github.com/microsoft/DeepSpeed/tree/master/blogs/deepspeed-chat/README.md)] [[中文](https://github.com/microsoft/DeepSpeed/tree/master/blogs/deepspeed-chat/chinese/README.md)] [[日本語](https://github.com/microsoft/DeepSpeed/tree/master/blogs/deepspeed-chat/japanese/README.md)]🚀 +* [2023/03] [Scaling Large-Scale Generative Mixture-of-Expert Multimodal Model With VL-MoE](https://www.deepspeed.ai/2023/03/30/multi-modal.html) +* [2023/02] [Automatic Tensor Parallelism: Enables tensor parallelism by default without an injection policy](https://www.deepspeed.ai/tutorials/automatic-tensor-parallelism/) * [2022/12] [DeepSpeed Data Efficiency: A composable library that makes better use of data, increases training efficiency, and improves model quality](https://www.deepspeed.ai/2022/12/11/data-efficiency.html) * [2022/11] [Stable Diffusion Image Generation under 1 second w. DeepSpeed MII](https://github.com/microsoft/DeepSpeed-MII/tree/main/examples/benchmark/txt2img) * [2022/10] [DeepSpeed-MII: instant speedup on 24,000+ open-source DL models with up to 40x cheaper inference](https://www.deepspeed.ai/2022/10/10/mii.html) @@ -23,7 +26,7 @@ # Extreme Speed and Scale for DL Training and Inference -[DeepSpeed](https://www.deepspeed.ai/) is an easy-to-use deep learning optimization software suite that enables unprecedented scale and speed for Deep Learning Training and Inference. With DeepSpeed you can: +***[DeepSpeed](https://www.deepspeed.ai/) enables the world's most powerful language models like [MT-530B](https://www.microsoft.com/en-us/research/blog/using-deepspeed-and-megatron-to-train-megatron-turing-nlg-530b-the-worlds-largest-and-most-powerful-generative-language-model/) and [BLOOM](https://huggingface.co/blog/bloom-megatron-deepspeed)***. It is an easy-to-use deep learning optimization software suite that powers unprecedented scale and speed for both training and inference.
With DeepSpeed you can: * Train/Inference dense or sparse models with billions or trillions of parameters * Achieve excellent system throughput and efficiently scale to thousands of GPUs @@ -94,8 +97,8 @@ DeepSpeed has been integrated with several different popular open-source DL fram | ---------------------------------------------------------------------------------------------- | -------------------------------------------- | | [Transformers with DeepSpeed](https://huggingface.co/docs/transformers/main/main_classes/deepspeed) | | | [Accelerate with DeepSpeed](https://huggingface.co/docs/accelerate/usage_guides/deepspeed) | -| | [Lightning with DeepSpeed](https://pytorch-lightning.readthedocs.io/en/stable/api/pytorch_lightning.strategies.DeepSpeedStrategy.html) | -| | [MosaicML with DeepSpeed](https://docs.mosaicml.com/en/latest/trainer/using_the_trainer.html?highlight=deepspeed#deepspeed-integration) | +| | [Lightning with DeepSpeed](https://lightning.ai/docs/pytorch/stable/advanced/model_parallel.html#deepspeed) | +| | [MosaicML with DeepSpeed](https://docs.mosaicml.com/projects/composer/en/latest/trainer/using_the_trainer.html?highlight=deepspeed#deepspeed-integration) | | | [Determined with DeepSpeed](https://docs.determined.ai/latest/training/apis-howto/deepspeed/overview.html) | --- @@ -104,11 +107,12 @@ DeepSpeed has been integrated with several different popular open-source DL fram | Description | Status | | ----------- | ------ | -| NVIDIA | [![nv-torch12-p40](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-torch12-p40.yml/badge.svg)](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-torch12-p40.yml) [![nv-torch18-v100](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-torch18-v100.yml/badge.svg)](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-torch18-v100.yml) [![nv-torch-latest-v100](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-torch-latest-v100.yml/badge.svg)](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-torch-latest-v100.yml) [![nv-inference](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-inference.yml/badge.svg)](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-inference.yml) [![nv-nightly](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-nightly.yml/badge.svg)](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-nightly.yml) | -| AMD | [![amd](https://github.com/microsoft/DeepSpeed/actions/workflows/amd.yml/badge.svg)](https://github.com/microsoft/DeepSpeed/actions/workflows/amd.yml) | -| PyTorch Nightly | [![nv-torch-nightly-v100](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-torch-nightly-v100.yml/badge.svg)](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-torch-nightly-v100.yml) | -| Integrations | [![nv-transformers-v100](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-transformers-v100.yml/badge.svg)](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-transformers-v100.yml) [![nv-lightning-v100](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-lightning-v100.yml/badge.svg)](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-lightning-v100.yml) [![nv-accelerate-v100](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-accelerate-v100.yml/badge.svg)](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-accelerate-v100.yml) | -| Misc | 
[![Formatting](https://github.com/microsoft/DeepSpeed/actions/workflows/formatting.yml/badge.svg)](https://github.com/microsoft/DeepSpeed/actions/workflows/formatting.yml) [![pages-build-deployment](https://github.com/microsoft/DeepSpeed/actions/workflows/pages/pages-build-deployment/badge.svg)](https://github.com/microsoft/DeepSpeed/actions/workflows/pages/pages-build-deployment) [![Documentation Status](https://readthedocs.org/projects/deepspeed/badge/?version=latest)](https://deepspeed.readthedocs.io/en/latest/?badge=latest)| +| NVIDIA | [![nv-torch19-p40](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-torch19-p40.yml/badge.svg?branch=master)](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-torch19-p40.yml) [![nv-torch19-v100](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-torch19-v100.yml/badge.svg?branch=master)](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-torch19-v100.yml) [![nv-torch-latest-v100](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-torch-latest-v100.yml/badge.svg?branch=master)](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-torch-latest-v100.yml) [![nv-inference](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-inference.yml/badge.svg?branch=master)](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-inference.yml) [![nv-nightly](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-nightly.yml/badge.svg?branch=master)](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-nightly.yml) | +| AMD | [![amd-mi100](https://github.com/microsoft/DeepSpeed/actions/workflows/amd-mi100.yml/badge.svg?branch=master)](https://github.com/microsoft/DeepSpeed/actions/workflows/amd-mi100.yml) [![amd-mi200](https://github.com/microsoft/DeepSpeed/actions/workflows/amd-mi200.yml/badge.svg?branch=master)](https://github.com/microsoft/DeepSpeed/actions/workflows/amd-mi200.yml) | +| CPU | [![nv-torch-latest-cpu](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-torch-latest-cpu.yml/badge.svg?branch=master)](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-torch-latest-cpu.yml) | +| PyTorch Nightly | [![nv-torch-nightly-v100](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-torch-nightly-v100.yml/badge.svg?branch=master)](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-torch-nightly-v100.yml) | +| Integrations | [![nv-transformers-v100](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-transformers-v100.yml/badge.svg?branch=master)](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-transformers-v100.yml) [![nv-lightning-v100](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-lightning-v100.yml/badge.svg?branch=master)](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-lightning-v100.yml) [![nv-accelerate-v100](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-accelerate-v100.yml/badge.svg?branch=master)](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-accelerate-v100.yml)[![nv-megatron](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-megatron.yml/badge.svg?branch=master)](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-megatron.yml)[![nv-mii](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-mii.yml/badge.svg?branch=master)](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-mii.yml) | +| Misc | 
[![Formatting](https://github.com/microsoft/DeepSpeed/actions/workflows/formatting.yml/badge.svg?branch=master)](https://github.com/microsoft/DeepSpeed/actions/workflows/formatting.yml) [![pages-build-deployment](https://github.com/microsoft/DeepSpeed/actions/workflows/pages/pages-build-deployment/badge.svg)](https://github.com/microsoft/DeepSpeed/actions/workflows/pages/pages-build-deployment) [![Documentation Status](https://readthedocs.org/projects/deepspeed/badge/?version=latest)](https://deepspeed.readthedocs.io/en/latest/?badge=latest)[![python](https://github.com/microsoft/DeepSpeed/actions/workflows/python.yml/badge.svg?branch=master)](https://github.com/microsoft/DeepSpeed/actions/workflows/python.yml) | # Installation @@ -122,7 +126,7 @@ dynamically link them at runtime. ## Requirements * [PyTorch](https://pytorch.org/) must be installed _before_ installing DeepSpeed. -* For full feature support we recommend a version of PyTorch that is >= 1.8 and ideally the latest PyTorch stable release. +* For full feature support we recommend a version of PyTorch that is >= 1.9 and ideally the latest PyTorch stable release. * A CUDA or ROCm compiler such as [nvcc](https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/#introduction) or [hipcc](https://github.com/ROCm-Developer-Tools/HIPCC) used to compile C++/CUDA/HIP extensions. * Specific GPUs we develop and test against are listed below. This doesn't mean your GPU will not work if it doesn't fall into this category; it's just that DeepSpeed is most well tested on the following: * NVIDIA: Pascal, Volta, Ampere, and Hopper architectures @@ -174,7 +178,12 @@ All DeepSpeed documentation, tutorials, and blogs can be found on our website: [ # Contributing DeepSpeed welcomes your contributions! Please see our [contributing](CONTRIBUTING.md) guide for more details on formatting, testing, -etc.
+Thanks so much to all of our amazing contributors! + + + + ## Contributor License Agreement This project welcomes contributions and suggestions. Most contributions require you to @@ -210,6 +219,10 @@ Conduct](https://opensource.microsoft.com/codeofconduct/). For more information 14. Reza Yazdani Aminabadi, Samyam Rajbhandari, Minjia Zhang, Ammar Ahmad Awan, Cheng Li, Du Li, Elton Zheng, Jeff Rasley, Shaden Smith, Olatunji Ruwase, Yuxiong He. (2022) DeepSpeed Inference: Enabling Efficient Inference of Transformer Models at Unprecedented Scale. [arXiv:2207.00032](https://arxiv.org/abs/2207.00032) and [SC 2022](https://dl.acm.org/doi/abs/10.5555/3571885.3571946). 15. Zhewei Yao, Xiaoxia Wu, Conglong Li, Connor Holmes, Minjia Zhang, Cheng Li, Yuxiong He. (2022) Random-LTD: Random and Layerwise Token Dropping Brings Efficient Training for Large-scale Transformers. [arXiv:2211.11586](https://arxiv.org/abs/2211.11586). 16. Conglong Li, Zhewei Yao, Xiaoxia Wu, Minjia Zhang, Yuxiong He. (2022) DeepSpeed Data Efficiency: Improving Deep Learning Model Quality and Training Efficiency via Efficient Data Sampling and Routing. [arXiv:2212.03597](https://arxiv.org/abs/2212.03597). +17. Xiaoxia Wu, Cheng Li, Reza Yazdani Aminabadi, Zhewei Yao, Yuxiong He. (2023) Understanding INT4 Quantization for Transformer Models: Latency Speedup, Composability, and Failure Cases. [arXiv:2301.12017](https://arxiv.org/abs/2301.12017). +18. Syed Zawad, Cheng Li, Zhewei Yao, Elton Zheng, Yuxiong He, Feng Yan. (2023) DySR: Adaptive Super-Resolution via Algorithm and System Co-design. [ICLR:2023](https://openreview.net/forum?id=Pgtn4l6eKjv). +19. Sheng Shen, Zhewei Yao, Chunyuan Li, Trevor Darrell, Kurt Keutzer, Yuxiong He. (2023) Scaling Vision-Language Models with Sparse Mixture of Experts. [arXiv:2303.07226](https://arxiv.org/abs/2303.07226). +20. Quentin Anthony, Ammar Ahmad Awan, Jeff Rasley, Yuxiong He, Aamir Shafi, Mustafa Abduljabbar, Hari Subramoni, Dhabaleswar Panda. (2023) MCR-DL: Mix-and-Match Communication Runtime for Deep Learning [arXiv:2303.08374](https://arxiv.org/abs/2303.08374) and will appear at IPDPS 2023. # Videos diff --git a/README_HIP.md b/README_HIP.md deleted file mode 100755 index f1f4ac40e60bcd44706da5e085825e83f9c33d14..0000000000000000000000000000000000000000 --- a/README_HIP.md +++ /dev/null @@ -1,77 +0,0 @@ -# DeepSpeed - -## 安装 -DeepSpeed 支持 -+ Python 3.7. -+ Python 3.8. -+ Python 3.9. 
- -### Installing with pip -DeepSpeed whl package download directory: [https://cancon.hpccube.com:65024/4/main/deepspeed/dtk23.04](https://cancon.hpccube.com:65024/4/main/deepspeed/dtk23.04) -Download the DeepSpeed whl package that matches your PyTorch and Python versions. - -```shell -pip install deepspeed* (the downloaded DeepSpeed whl package) -``` - -### Installing from source -Before building, install the matching Python version, install the required third-party dependencies, and configure the DTK environment variables (CentOS 7.x is used as the example). - -PyTorch whl package download directory: [https://cancon.hpccube.com:65024/4/main/pytorch/dtk23.04](https://cancon.hpccube.com:65024/4/main/pytorch/dtk23.04) - -Download the PyTorch whl package that matches your Python version. If you are building against PyTorch 1.13, comment out the line at roughly L659 of op_builder/builder.py: -```bash -#sources[i] = str(src.relative_to(curr_file)) -``` -Install the dependencies: -```bash -# Add the repository for third-party packages -yum install epel-release -y - -# Install the related dependencies - -yum install libffi-devel -y -yum -y install openssl openssl-devel - -yum install -y libaio-devel -yum install -y libaio -ls -l - -# If Python is missing any of these modules, rebuild Python from source against the packages installed above, then reconfigure the Python environment -python3 -m pip install --upgrade pip setuptools -pip3 install wheel -i https://pypi.tuna.tsinghua.edu.cn/simple -pip3 install ninja -i https://pypi.tuna.tsinghua.edu.cn/simple -``` - -Download DTK and configure the environment variables: -```bash -# DTK tarball download directory: 光合社区/资源工具/DCU Toolkit/DTK23.04 (https://cancon.hpccube.com:65024/1/main/DTK-23.04); choose the DTK tarball for your system and extract it to the /opt directory. -# If you are using a release older than dtk23.04, you can modify the hipify files in torch following the approach shown in the image below -export ROCM_PATH=/opt/dtk-23.04 -source /opt/dtk-23.04/env.sh -``` -![logo](hipify_20230511113250.png) - - -Build DeepSpeed - -```bash -# Download the source code -git clone -b ds-v0.8.2-rocm http://developer.hpccube.com/codes/aicomponent/deepspeed.git -cd deepspeed -sh requirements/run_pip.sh -export DS_BUILD_VERSION=abix.dtkxxx -DS_BUILD_RANDOM_LTD=0 DS_BUILD_QUANTIZER=0 DS_BUILD_TRANSFORMER_INFERENCE=0 DS_BUILD_OPS=1 verbose=1 CXX=hipcc CC=hipcc python3 setup.py install bdist_wheel -``` - -Install DeepSpeed - -```bash -# The deepspeed whl package is generated in the dist folder -pip3 install ./dist/deepspeed* -``` - -## Note -+ If downloads via pip install are slow, add a mirror: -i https://pypi.tuna.tsinghua.edu.cn/simple/ -+ DS_BUILD_VERSION sets the build version number, which takes the form 0.8.2+gitxxx.abix.dtkxxx -gitxxx: filled in automatically from the source; abi0: compiled with the devtools gcc; abi1: compiled with a non-devtools gcc; dtkxxx is the DTK version, e.g. dtk2304 \ No newline at end of file diff --git a/accelerator/__init__.py b/accelerator/__init__.py index d29c3b15467b979c02068488a4ac849e082286f1..c0d9a7bf36efea0bce67652d6df0a8027a34eb63 100644 --- a/accelerator/__init__.py +++ b/accelerator/__init__.py @@ -1,4 +1,7 @@ -'''Copyright The Microsoft DeepSpeed Team''' +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team from .abstract_accelerator import DeepSpeedAccelerator from .real_accelerator import get_accelerator, set_accelerator diff --git a/accelerator/abstract_accelerator.py b/accelerator/abstract_accelerator.py index 03dfbe9df43dad69ce024c450f99ea2f6b794e6b..dc26edf26faf9232c0900fa5c214bea67043c8b9 100644 --- a/accelerator/abstract_accelerator.py +++ b/accelerator/abstract_accelerator.py @@ -1,10 +1,14 @@ -'''Copyright The Microsoft DeepSpeed Team''' +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team import abc from abc import ABC class DeepSpeedAccelerator(ABC): + def __init__(self): self._name = None self._communication_backend_name = None diff --git a/accelerator/cuda_accelerator.py b/accelerator/cuda_accelerator.py index cdf4cd447764fb4337643f181d9a1e9b12b7344f..36341a3c19b3b0a1dd4e55636cfc618fd62ae8a1 100644 --- a/accelerator/cuda_accelerator.py +++ b/accelerator/cuda_accelerator.py @@ -1,4 +1,7 @@ -'''Copyright The Microsoft DeepSpeed Team''' +# Copyright (c) Microsoft Corporation.
+# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team import os import pkgutil @@ -14,6 +17,7 @@ except ImportError: class CUDA_Accelerator(DeepSpeedAccelerator): + def __init__(self): self._name = 'cuda' self._communication_backend_name = 'nccl' @@ -22,13 +26,10 @@ class CUDA_Accelerator(DeepSpeedAccelerator): # put all valid class name <--> class type mapping into class_dict op_builder_dir = self.op_builder_dir() op_builder_module = importlib.import_module(op_builder_dir) - for _, module_name, _ in pkgutil.iter_modules([os.path.dirname(op_builder_module.__file__)]): # avoid self references if module_name != 'all_ops' and module_name != 'builder': - module = importlib.import_module("{}.{}".format( - op_builder_dir, - module_name)) + module = importlib.import_module("{}.{}".format(op_builder_dir, module_name)) for member_name in module.__dir__(): if member_name.endswith( 'Builder' @@ -224,8 +225,9 @@ class CUDA_Accelerator(DeepSpeedAccelerator): def op_builder_dir(self): try: - # during installation time op_builder is visible, otherwise return deepspeed.ops.op_builder - import op_builder # noqa: F401 + # is op_builder from deepspeed or a 3p version? this should only succeed if it's deepspeed + # if successful this also means we're doing a local install and not JIT compile path + from op_builder import __deepspeed__ # noqa: F401 return "op_builder" except ImportError: return "deepspeed.ops.op_builder" diff --git a/accelerator/real_accelerator.py b/accelerator/real_accelerator.py index 06cbb0b08e7a60075fde9500bbca7495799b5589..a31846f53aa92ccbfcdbe14aa41785d60b37374c 100644 --- a/accelerator/real_accelerator.py +++ b/accelerator/real_accelerator.py @@ -1,4 +1,7 @@ -'''Copyright The Microsoft DeepSpeed Team''' +# Copyright (c) Microsoft Corporation. 
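The accelerator interface refactored above is what the benchmark and runtime code later in this diff build on. A minimal usage sketch, assuming a CUDA-enabled DeepSpeed install (so `get_accelerator()` resolves to `CUDA_Accelerator`); only calls that appear elsewhere in this patch are used:

```python
# Illustrative sketch only: exercising the DeepSpeedAccelerator interface.
import torch
from deepspeed.accelerator import get_accelerator

accel = get_accelerator()              # CUDA_Accelerator on NVIDIA systems
assert accel.is_available()            # a usable device is visible
device = accel.device_name(0)          # e.g. 'cuda:0'
x = torch.ones(1 << 20, dtype=torch.float16).to(device)
del x
accel.empty_cache()                    # accelerator-agnostic cache release
```

Writing against `get_accelerator()` instead of `torch.cuda` directly is what lets the same calling code target other accelerator backends without modification.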
+# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team try: from accelerator.abstract_accelerator import DeepSpeedAccelerator as dsa1 @@ -23,13 +26,8 @@ def _validate_accelerator(accel_obj): # accelerator.abstractor_accelerator # or deepspeed.accelerator.abstract_accelerator, consider accel_obj # is a conforming object - if not ((dsa1 != None and isinstance(accel_obj, - dsa1)) or - (dsa2 != None and isinstance(accel_obj, - dsa2))): - raise AssertionError( - f'{accel_obj.__class__.__name__} accelerator is not subclass of DeepSpeedAccelerator' - ) + if not ((dsa1 != None and isinstance(accel_obj, dsa1)) or (dsa2 != None and isinstance(accel_obj, dsa2))): + raise AssertionError(f'{accel_obj.__class__.__name__} accelerator is not subclass of DeepSpeedAccelerator') # TODO: turn off is_available test since this breaks tests #assert accel_obj.is_available(), \ diff --git a/azure/attach.sh b/azure/attach.sh deleted file mode 100644 index c23127b0fb61f62188f41aa2677e97c8121b0131..0000000000000000000000000000000000000000 --- a/azure/attach.sh +++ /dev/null @@ -1,4 +0,0 @@ -#!/bin/bash - -name=${1-deepspeed} -docker exec -i -w /home/deepspeed -t $name /bin/bash diff --git a/azure/azure_config.json b/azure/azure_config.json deleted file mode 100644 index 9c61e4d3705c34442c7d53b2be24792b8cd61ca3..0000000000000000000000000000000000000000 --- a/azure/azure_config.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "num_vms": 2, - "location": "southcentralus", - "azure_sku": "Standard_NV6_Promo", - "ssh_private_key": "id_rsa", - "docker_ssh_port": 2222 -} diff --git a/azure/azure_ssh.sh b/azure/azure_ssh.sh deleted file mode 100644 index 3259a3c88341a4e35099cbbb34b105f0d737c6aa..0000000000000000000000000000000000000000 --- a/azure/azure_ssh.sh +++ /dev/null @@ -1,29 +0,0 @@ -#!/bin/bash - -config_file=azure_config.json -if [ ! -f ${config_file} ]; then - echo "Cannot find $config_file" - exit 1 -fi - -location=`cat ${config_file} | jq .location | sed 's/"//g'` -rg=deepspeed_rg_$location - -while getopts 'c:' flag; do - case "${flag}" in - c) config_file="${OPTARG}" ;; - *) error "Unexpected option ${flag}" ;; - esac -done -shift $(expr $OPTIND - 1) -echo "Using $config_file" - -nodeid=$1 -cmds=${@:2} -echo $nodeid $cmds -ip_addr=`az vm list-ip-addresses -g $rg | jq .[${nodeid}].virtualMachine.network.publicIpAddresses[0].ipAddress | sed 's/"//g'` - -ssh_private_key=`cat ${config_file} | jq .ssh_private_key | sed 's/"//g'` -if [ $ssh_private_key == "null" ]; then echo 'missing ssh_private_key in config'; exit 1; fi - -ssh -i ${ssh_private_key} -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null deepspeed@${ip_addr} ${cmds} diff --git a/azure/build_docker_image.sh b/azure/build_docker_image.sh deleted file mode 100644 index e8617f0844f5abe4696f5968c08bc8387d442699..0000000000000000000000000000000000000000 --- a/azure/build_docker_image.sh +++ /dev/null @@ -1,3 +0,0 @@ -#!/bin/bash - -docker build -t deepspeed:0.1 -f ../Dockerfile . diff --git a/azure/create_vms.sh b/azure/create_vms.sh deleted file mode 100644 index 257a011f035c73f81c62ab8084871f6df5a010fb..0000000000000000000000000000000000000000 --- a/azure/create_vms.sh +++ /dev/null @@ -1,55 +0,0 @@ -#!/bin/bash - -azure_config=azure_config.json - -# Make sure jq is installed -command -v jq -if [ $? != 0 ]; then - echo "Missing dependency of jq, please 'apt-get install jq'" - exit 1 -fi - -if [ ! 
-f ${azure_config} ]; then - echo "Cannot find $azure_config" - exit 1 -fi -cat $azure_config - -num_vms=`cat ${azure_config} | jq .num_vms` -if [ $num_vms == "null" ]; then echo 'missing num_vms in config'; exit 1; fi -location=`cat ${azure_config} | jq .location | sed 's/"//g'` -if [ $location == "null" ]; then echo 'missing location in config'; exit 1; fi -azure_sku=`cat ${azure_config} | jq .azure_sku | sed 's/"//g'` -if [ $azure_sku == "null" ]; then echo 'missing azure_sku in config'; exit 1; fi -ssh_private_key=`cat ${azure_config} | jq .ssh_private_key | sed 's/"//g'` -if [ $ssh_private_key == "null" ]; then echo 'missing ssh_private_key in config'; exit 1; fi -ssh_key=${ssh_private_key}.pub - -if [ ! -f ${ssh_private_key} ]; then - echo "Cannot find $ssh_private_key" - exit 1 -fi -if [ ! -f ${ssh_key} ]; then - echo "Cannot find $ssh_key" - exit 1 -fi - -resource_group=deepspeed_rg_$location -az group create --name ${resource_group} --location $location - -base_vm_name=deepspeed -vm_image="nvidia:ngc_azure_17_11:ngc_gpu_cloud_19_11_3:19.11.3" - -az vm image terms accept --urn ${vm_image} - -for i in `seq 0 $(( num_vms - 1))`; do - vm_name=${base_vm_name}_$i - echo "creating $vm_name" - az vm create \ - --resource-group ${resource_group} \ - --name ${vm_name} \ - --image ${vm_image} \ - --admin-username deepspeed \ - --size ${azure_sku} \ - --ssh-key-values ${ssh_key} -done diff --git a/azure/setup_docker.sh b/azure/setup_docker.sh deleted file mode 100644 index 7b8d5cfcdd51e88c562a82d344302d235f8bc865..0000000000000000000000000000000000000000 --- a/azure/setup_docker.sh +++ /dev/null @@ -1,50 +0,0 @@ -#!/bin/bash - -azure_config=azure_config.json -if [ ! -f ${azure_config} ]; then - echo "Cannot find $azure_config" - exit 1 -fi -location=`cat ${azure_config} | jq .location | sed 's/"//g'` -rg=deepspeed_rg_$location - -parallel=true -command -v pdsh -if [ $? != 0 ]; then - echo "Installing pdsh will allow for the docker pull to be done in parallel across the cluster. 
See: 'apt-get install pdsh'" - parallel=false -fi - -ssh_key=`cat ${azure_config} | jq .ssh_private_key | sed 's/"//g'` -if [ $ssh_key == "null" ]; then echo 'missing ssh_private_key in config'; exit 1; fi -num_vms=`cat ${azure_config} | jq .num_vms` -if [ $num_vms == "null" ]; then echo 'missing num_vms in config'; exit 1; fi - -args="-i ${ssh_key} -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null" -username=deepspeed - -update_script=" -docker pull deepspeed/deepspeed:latest; -ln -s workdir/DeepSpeed/azure/attach.sh attach.sh; -cd workdir/DeepSpeed; -git pull; -git submodule update --init --recursive; -bash azure/start_container.sh; -" - -if [ $parallel == true ]; then - echo "parallel docker pull" - hosts="" - for node_id in {0..1}; do - addr=`az vm list-ip-addresses -g $rg | jq .[${node_id}].virtualMachine.network.publicIpAddresses[0].ipAddress | sed 's/"//g'` - hosts="${addr},${hosts}" - done - PDSH_RCMD_TYPE=ssh PDSH_SSH_ARGS_APPEND=${args} pdsh -w $hosts -l ${username} $update_script -else - echo "sequential docker pull" - for node_id in `seq 0 $((num_vms - 1))`; do - ip_addr=`az vm list-ip-addresses -g $rg | jq .[${node_id}].virtualMachine.network.publicIpAddresses[0].ipAddress | sed 's/"//g'` - addr=${username}@${ip_addr} - ssh ${args} $addr $update_script - done -fi diff --git a/azure/setup_vms.sh b/azure/setup_vms.sh deleted file mode 100644 index 118bed2ce7279fc62086e913ca66f2fdba01b710..0000000000000000000000000000000000000000 --- a/azure/setup_vms.sh +++ /dev/null @@ -1,54 +0,0 @@ -#!/bin/bash - -azure_config=azure_config.json -if [ ! -f ${azure_config} ]; then - echo "Cannot find $azure_config" - exit 1 -fi -location=`cat ${azure_config} | jq .location | sed 's/"//g'` -rg=deepspeed_rg_$location - -ssh_key=`cat ${azure_config} | jq .ssh_private_key | sed 's/"//g'` -if [ $ssh_key == "null" ]; then echo 'missing ssh_private_key in config'; exit 1; fi -docker_ssh_port=`cat ${azure_config} | jq .docker_ssh_port` -if [ $docker_ssh_port == "null" ]; then echo 'missing docker_ssh_port in config'; exit 1; fi - -username=deepspeed -args="-i ${ssh_key} -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null" - -num_vms=`az vm list -g $rg | jq '. 
| length'` -first_ip_addr=`az vm list-ip-addresses -g $rg | jq .[0].virtualMachine.network.publicIpAddresses[0].ipAddress | sed 's/"//g'` -num_slots=`ssh $args ${username}@${first_ip_addr} 'nvidia-smi -L | wc -l'` -echo "number of slots per vm: $num_slots" - -hostfile=hostfile -ssh_config=config -echo -n "" > $hostfile -echo -n "" > $ssh_config -for node_id in `seq 0 $((num_vms - 1))`; do - private_ip_addr=`az vm list-ip-addresses -g $rg | jq .[${node_id}].virtualMachine.network.privateIpAddresses[0] | sed 's/"//g'` - echo "worker-${node_id} slots=${num_slots}" >> hostfile - echo "Host worker-${node_id} - HostName ${private_ip_addr} - Port ${docker_ssh_port} - StrictHostKeyChecking no - " >> ${ssh_config} -done - -update_script=" -sudo mkdir -p /job; -sudo chmod -R 777 /job; -mkdir -p workdir; -git clone https://github.com/microsoft/DeepSpeed.git workdir/DeepSpeed; -" - -for node_id in `seq 0 $((num_vms - 1))`; do - ip_addr=`az vm list-ip-addresses -g $rg | jq .[${node_id}].virtualMachine.network.publicIpAddresses[0].ipAddress | sed 's/"//g'` - addr=${username}@${ip_addr} - echo "copying ssh keys, ssh config, hostfile to worker-${node_id}" - ssh $args ${addr} $update_script - scp $args ${ssh_key}* ${addr}:.ssh/ - scp $args ${ssh_config} ${addr}:.ssh/ - scp $args ${hostfile} ${addr}:/job/ -done -rm $hostfile $ssh_config diff --git a/azure/shutdown_vms.sh b/azure/shutdown_vms.sh deleted file mode 100644 index 75317118be436ef487544f8edef2e26f4a4829d0..0000000000000000000000000000000000000000 --- a/azure/shutdown_vms.sh +++ /dev/null @@ -1,37 +0,0 @@ -#!/bin/bash - -azure_config=azure_config.json -if [ ! -f ${azure_config} ]; then - echo "Cannot find $azure_config" - exit 1 -fi - -delete=0 -while getopts 'd' flag; do - case "${flag}" in - d) delete=1 ;; - *) - echo "Unexpected option ${flag}" - exit 1 - ;; - esac -done - -num_vms=`cat ${azure_config} | jq .num_vms` -if [ $num_vms == "null" ]; then echo 'missing num_vms in config'; exit 1; fi -location=`cat ${azure_config} | jq .location | sed 's/"//g'` -if [ $location == "null" ]; then echo 'missing location in config'; exit 1; fi - -base_vm_name=deepspeed -resource_group=deepspeed_rg_$location - -for i in `seq 0 $(( num_vms - 1))`; do - vm_name=${base_vm_name}_$i - if [ $delete == 0 ]; then - echo "deallocating $vm_name" - az vm deallocate --resource-group $resource_group --name $vm_name --no-wait - else - echo "deleting $vm_name" - az vm delete -y --resource-group $resource_group --name $vm_name --no-wait - fi -done diff --git a/azure/start_container.sh b/azure/start_container.sh deleted file mode 100644 index 7e6aae5406b652700ec20bf3736aaeb70d89d0f0..0000000000000000000000000000000000000000 --- a/azure/start_container.sh +++ /dev/null @@ -1,11 +0,0 @@ -#!/bin/bash - -name=${1-deepspeed} -image=deepspeed/deepspeed:latest -echo "starting docker image named $name" -docker run -d -t --name $name \ - --network host \ - -v ${HOME}/workdir:/home/deepspeed/workdir \ - -v ${HOME}/.ssh:/home/deepspeed/.ssh \ - -v /job/hostfile:/job/hostfile \ - --gpus all $image bash -c 'sudo service ssh start && sleep infinity' diff --git a/benchmarks/README.md b/benchmarks/README.md new file mode 100644 index 0000000000000000000000000000000000000000..4c88b2dd091ce23f27c4712f5cd8269bffca1bc4 --- /dev/null +++ b/benchmarks/README.md @@ -0,0 +1,6 @@ +# DeepSpeed Benchmarks + +If you are looking for DeepSpeed benchmarks, please see the following resources: + +1. 
[Communication Benchmarking Suite](https://github.com/microsoft/DeepSpeedExamples/tree/master/benchmarks/communication) +2. [Inference Benchmarks](https://github.com/microsoft/DeepSpeedExamples/tree/master/benchmarks/inference) diff --git a/benchmarks/__init__.py b/benchmarks/__init__.py deleted file mode 100644 index fcb45ab2b68516814a4bfbffebf2e01cbfefd527..0000000000000000000000000000000000000000 --- a/benchmarks/__init__.py +++ /dev/null @@ -1 +0,0 @@ -'''Copyright The Microsoft DeepSpeed Team''' diff --git a/benchmarks/communication/README.md b/benchmarks/communication/README.md deleted file mode 100644 index f760465b5c972f9b6364e7bbcfb11871ac079cf1..0000000000000000000000000000000000000000 --- a/benchmarks/communication/README.md +++ /dev/null @@ -1,75 +0,0 @@ -# Running Communication Benchmarks - - -To run benchmarks, there are two options: - -1. Run a single communication operation: - -For example, run with a single large message size: -
-deepspeed all_reduce.py
-
- -Scan across message sizes: -
-deepspeed all_reduce.py --scan
-
- -2. Run all available communication benchmarks: - -
-deepspeed run_all.py
-
- -Like the individual benchmarks, `run_all.py` supports scanning arguments for the max message size, bw-unit, etc. Simply pass the desired arguments to `run_all.py` and they'll be propagated to each comm op. - -
-usage: ds_bench [-h] [--local_rank LOCAL_RANK] [--trials TRIALS] [--warmups WARMUPS] [--maxsize MAXSIZE] [--async-op] [--bw-unit {Gbps,GBps}] [--backend {nccl}] [--dist {deepspeed,torch}] [--scan] [--raw] [--all-reduce] [--all-gather] [--all-to-all]
-                [--pt2pt] [--broadcast] [--dtype DTYPE] [--mem-factor MEM_FACTOR] [--debug]
-
-optional arguments:
-  -h, --help            show this help message and exit
-  --local_rank LOCAL_RANK
-  --trials TRIALS       Number of timed iterations
-  --warmups WARMUPS     Number of warmup (non-timed) iterations
-  --maxsize MAXSIZE     Max message size as a power of 2
-  --async-op            Enables non-blocking communication
-  --bw-unit {Gbps,GBps}
-  --backend {nccl}      Communication library to use
-  --dist {deepspeed,torch}
-                        Distributed DL framework to use
-  --scan                Enables scanning all message sizes
-  --raw                 Print the message size and latency without units
-  --all-reduce          Run all_reduce
-  --all-gather          Run all_gather
-  --all-to-all          Run all_to_all
-  --pt2pt               Run pt2pt
-  --broadcast           Run broadcast
-  --dtype DTYPE         PyTorch tensor dtype
-  --mem-factor MEM_FACTOR
-                        Proportion of max available GPU memory to use for single-size evals
-  --debug               Enables all_to_all debug prints
-
- -Note that `ds_bench` is a pre-packaged wrapper around `run_all.py`. Users can pass the same arguments as well: - -
-/bin/ds_bench --scan --trials=10
-
- -Finally, users can choose specific communication operations to run in `run_all.py` or `ds_bench` by passing them as arguments (all operations are run by default). For example: - -
-deepspeed run_all.py --scan --all-reduce --all-to-all --broadcast
-
- - -# Adding Communication Benchmarks - -To add new communication benchmarks, follow this general procedure: - -1. Copy a similar benchmark file (e.g. to add `reduce_scatter`, copy `all_reduce.py` as a template) -2. Add a new bw formula in `utils.get_bw`, a new maximum tensor element formula in `utils.max_numel`, and a new arg in `utils.benchmark_parser` -3. Replace comm op calls in new file with find-replace -4. Find a good default `mem_factor` for use in `run__single()` function -5. Add new comm op to `run_all.py` diff --git a/benchmarks/communication/__init__.py b/benchmarks/communication/__init__.py deleted file mode 100644 index fcb45ab2b68516814a4bfbffebf2e01cbfefd527..0000000000000000000000000000000000000000 --- a/benchmarks/communication/__init__.py +++ /dev/null @@ -1 +0,0 @@ -'''Copyright The Microsoft DeepSpeed Team''' diff --git a/benchmarks/communication/all_gather.py b/benchmarks/communication/all_gather.py deleted file mode 100644 index dc97267b384020e408739017c3b9051434211b34..0000000000000000000000000000000000000000 --- a/benchmarks/communication/all_gather.py +++ /dev/null @@ -1,159 +0,0 @@ -'''Copyright The Microsoft DeepSpeed Team''' - -from benchmarks.communication.utils import * -from benchmarks.communication.constants import * -from deepspeed.accelerator import get_accelerator - -import time - - -# Run all_gather and print metrics -def timed_all_gather(input, output, args): - if args.dist == 'torch': - import torch.distributed as dist - elif args.dist == 'deepspeed': - import deepspeed.comm as dist - - sync_all() - # Warmups, establish connections, etc. - for i in range(args.warmups): - # use all_gather_base if available - if args.dist == 'torch': - if hasattr(torch.distributed, "_all_gather_base"): - dist._all_gather_base(output, input, group=None, async_op=args.async_op) - else: - output_tensors = list( - torch.chunk(output_tensor, - cdb.get_world_size(group))) - dist.all_gather(output_tensors, input_tensor, group=group, async_op=True) - elif args.dist == 'deepspeed': - dist.allgather_fn(output, input, group=None, async_op=args.async_op) - sync_all() - - # time the actual comm op trials times and average it - pre = time.perf_counter() - for i in range(args.trials): - # use all_gather_base if available - if args.dist == 'torch': - if hasattr(torch.distributed, "_all_gather_base"): - dist._all_gather_base(output, input, group=None, async_op=args.async_op) - else: - output_tensors = list( - torch.chunk(output_tensor, - cdb.get_world_size(group))) - dist.all_gather(output_tensors, input_tensor, group=group, async_op=True) - elif args.dist == 'deepspeed': - dist.allgather_fn(output, input, group=None, async_op=args.async_op) - sync_all() - duration = time.perf_counter() - pre - - # maintain and clean performance data - avg_duration = duration / args.trials - size = input.element_size() * input.nelement() - n = dist.get_world_size() - tput, busbw = get_bw('all_gather', size, avg_duration, args) - tput_str, busbw_str, duration_str = get_metric_strings(args, tput, busbw, avg_duration) - desc = f'{input.nelement()}x{input.element_size()}' - - if not args.raw: - size = convert_size(size) - - print_rank_0( - f"{size:<20} {desc:25s} {duration_str:20s} {tput_str:20s} {busbw_str:20s}") - - -def run_all_gather(local_rank, args): - if args.dist == 'torch': - import torch.distributed as dist - elif args.dist == 'deepspeed': - import deepspeed.comm as dist - - # Prepare benchmark header - print_header(args, 'all_gather') - global_rank = dist.get_rank() - world_size = 
dist.get_world_size() - - if args.scan: - # Create list of message sizes - M_LIST = [] - for x in (2**p for p in range(1, args.maxsize)): - M_LIST.append(x) - - sync_all() - # loop over various tensor sizes - for M in M_LIST: - global_rank = dist.get_rank() - try: - mat = torch.ones(world_size, - M, - dtype=getattr( - torch, - args.dtype)).to( - get_accelerator().device_name(local_rank)) - sync_all() - input = ((mat.mul_(float(global_rank))).view(-1)) - # Delete original mat to avoid OOM - del mat - get_accelerator().empty_cache() - output = torch.zeros(input.nelement() * world_size, - dtype=getattr( - torch, - args.dtype)).to( - get_accelerator().device_name(local_rank)) - except RuntimeError as e: - if 'out of memory' in str(e): - if dist.get_rank() == 0: - print('WARNING: Ran out of GPU memory. Exiting comm op.') - sync_all() - break - sync_all() - timed_all_gather(input, output, args) - else: - # all_gather_base saves memory - if (args.dist == 'torch' - and hasattr(torch.distributed, - "_all_gather_base")) or (args.dist == 'deepspeed' - and dist.has_allgather_base): - mem_factor = args.mem_factor + 0.2 - else: - mem_factor = args.mem_factor - # Send the biggest message size our GPUs can fit. If you're facing OOM errors, reduce the mem_factor - sync_all() - elements_per_gpu = max_numel(comm_op='all_gather', - dtype=getattr(torch, - args.dtype), - mem_factor=mem_factor, - local_rank=local_rank, - args=args) - try: - mat = torch.ones(elements_per_gpu, - dtype=getattr(torch, - args.dtype)).to( - get_accelerator().device_name(local_rank)) - # multiply each GPU's tensor by the rank to ease debugging - input = ((mat.mul_(float(global_rank))).view(-1)) - # Delete original mat to avoid OOM - del mat - get_accelerator().empty_cache() - output = torch.zeros( - elements_per_gpu * world_size, - dtype=getattr(torch, - args.dtype)).to(get_accelerator().device_name(local_rank)) - except RuntimeError as e: - if 'out of memory' in str(e): - if dist.get_rank() == 0: - print( - 'WARNING: Ran out of GPU memory. Try to reduce the --mem-factor argument!' - ) - sync_all() - return - - sync_all() - timed_all_gather(input, output, args) - - -if __name__ == "__main__": - args = benchmark_parser().parse_args() - rank = args.local_rank - init_processes(local_rank=rank, args=args) - run_all_gather(local_rank=rank, args=args) diff --git a/benchmarks/communication/all_reduce.py b/benchmarks/communication/all_reduce.py deleted file mode 100644 index edc1b99301c06e7c8c4b5807e35ad2afa39bf17b..0000000000000000000000000000000000000000 --- a/benchmarks/communication/all_reduce.py +++ /dev/null @@ -1,113 +0,0 @@ -'''Copyright The Microsoft DeepSpeed Team''' - -from benchmarks.communication.utils import * -from benchmarks.communication.constants import * -from deepspeed.accelerator import get_accelerator - -import time - - -def timed_all_reduce(input, args): - if args.dist == 'torch': - import torch.distributed as dist - elif args.dist == 'deepspeed': - import deepspeed.comm as dist - - sync_all() - # Warmups, establish connections, etc. 
- for i in range(args.warmups): - dist.all_reduce(input, async_op=args.async_op) - sync_all() - - # time the actual comm op trials times and average it - pre = time.perf_counter() - for i in range(args.trials): - dist.all_reduce(input, async_op=args.async_op) - sync_all() - duration = time.perf_counter() - pre - - # maintain and clean performance data - avg_duration = duration / args.trials - size = input.element_size() * input.nelement() - n = dist.get_world_size() - tput, busbw = get_bw('all_reduce', size, avg_duration, args) - tput_str, busbw_str, duration_str = get_metric_strings(args, tput, busbw, avg_duration) - desc = f'{input.nelement()}x{input.element_size()}' - - if not args.raw: - size = convert_size(size) - - print_rank_0( - f"{size:<20} {desc:25s} {duration_str:20s} {tput_str:20s} {busbw_str:20s}") - - -def run_all_reduce(local_rank, args): - if args.dist == 'torch': - import torch.distributed as dist - elif args.dist == 'deepspeed': - import deepspeed.comm as dist - - # Prepare benchmark header - print_header(args, 'all_reduce') - - world_size = dist.get_world_size() - global_rank = dist.get_rank() - - if args.scan: - M_LIST = [] - for x in (2**p for p in range(1, args.maxsize)): - M_LIST.append(x) - - sync_all() - # loop over various tensor sizes - for M in M_LIST: - global_rank = dist.get_rank() - try: - mat = torch.ones(world_size, - M, - dtype=getattr( - torch, - args.dtype)).to( - get_accelerator().device_name(local_rank)) - sync_all() - input = ((mat.mul_(float(global_rank))).view(-1)) - except RuntimeError as e: - if 'out of memory' in str(e): - if dist.get_rank() == 0: - print('WARNING: Ran out of GPU memory. Exiting comm op.') - sync_all() - break - sync_all() - timed_all_reduce(input, args) - else: - # Send the biggest message size our GPUs can fit. If you're facing OOM errors, reduce the mem_factor - # Don't need output tensor, so we double mem_factor - elements_per_gpu = max_numel(comm_op='all_reduce', - dtype=getattr(torch, - args.dtype), - mem_factor=args.mem_factor * 2, - local_rank=local_rank, - args=args) - try: - mat = torch.ones(elements_per_gpu, - dtype=getattr(torch, - args.dtype)).to( - get_accelerator().device_name(local_rank)) - input = ((mat.mul_(float(global_rank))).view(-1)) - except RuntimeError as e: - if 'out of memory' in str(e): - if dist.get_rank() == 0: - print( - 'WARNING: Ran out of GPU memory. Try to reduce the --mem-factor argument!' - ) - sync_all() - return - sync_all() - timed_all_reduce(input, args) - - -if __name__ == "__main__": - args = benchmark_parser().parse_args() - rank = args.local_rank - init_processes(local_rank=rank, args=args) - run_all_reduce(local_rank=rank, args=args) diff --git a/benchmarks/communication/all_to_all.py b/benchmarks/communication/all_to_all.py deleted file mode 100644 index bd35cf290e4c0f35b2b2ce4b2c4ea17f876e52ec..0000000000000000000000000000000000000000 --- a/benchmarks/communication/all_to_all.py +++ /dev/null @@ -1,134 +0,0 @@ -'''Copyright The Microsoft DeepSpeed Team''' - -from benchmarks.communication.utils import * -from benchmarks.communication.constants import * -from deepspeed.accelerator import get_accelerator - -import time - - -def timed_all_to_all(input, output, args): - if args.dist == 'torch': - import torch.distributed as dist - elif args.dist == 'deepspeed': - import deepspeed.comm as dist - - sync_all() - # Warmups, establish connections, etc. 
- for i in range(args.warmups): - dist.all_to_all_single(output, input, async_op=args.async_op) - sync_all() - - # time the actual comm op trials times and average it - pre = time.perf_counter() - for i in range(args.trials): - dist.all_to_all_single(output, input, async_op=args.async_op) - sync_all() - duration = time.perf_counter() - pre - - # maintain and clean performance data - avg_duration = duration / args.trials - size = input.element_size() * input.nelement() - n = dist.get_world_size() - tput, busbw = get_bw('all_to_all', size, avg_duration, args) - tput_str, busbw_str, duration_str = get_metric_strings(args, tput, busbw, avg_duration) - desc = f'{input.nelement()}x{input.element_size()}' - - if not args.raw: - size = convert_size(size) - - print_rank_0( - f"{size:<20} {desc:25s} {duration_str:20s} {tput_str:20s} {busbw_str:20s}") - - -def run_all_to_all(local_rank, args): - if args.dist == 'torch': - import torch.distributed as dist - elif args.dist == 'deepspeed': - import deepspeed.comm as dist - - world_size = dist.get_world_size() - global_rank = dist.get_rank() - # Prepare benchmark header - print_header(args, 'all_to_all') - - if args.scan: - M_LIST = [] - for x in (2**p for p in range(1, args.maxsize)): - M_LIST.append(x) - - sync_all() - # loop over various tensor sizes - for M in M_LIST: - global_rank = dist.get_rank() - try: - mat = torch.ones(world_size, - M, - dtype=getattr( - torch, - args.dtype)).to( - get_accelerator().device_name(local_rank)) - assert mat.numel() % world_size == 0, f"tensor cannot be divided in {world_size} chunks" - sync_all() - input = ((mat.mul_(float(global_rank))).view(-1)) - output = (mat.clone().view(-1)) - except RuntimeError as e: - if 'out of memory' in str(e): - if dist.get_rank() == 0: - print('WARNING: Ran out of GPU memory. Exiting comm op.') - sync_all() - break - sync_all() - timed_all_to_all(input, output, args) - else: - # Send the biggest message size our GPUs can fit. If you're facing OOM errors, reduce the mem_factor - elements_per_gpu = max_numel(comm_op='all_to_all', - dtype=getattr(torch, - args.dtype), - mem_factor=args.mem_factor, - local_rank=local_rank, - args=args) - try: - mat = torch.ones(elements_per_gpu, - dtype=getattr(torch, - args.dtype)).to( - get_accelerator().device_name(local_rank)) - assert mat.numel() % world_size == 0, f"tensor with {mat.numel()} elements cannot be divided in {world_size} chunks" - input = ((mat.mul_(float(global_rank))).view(-1)) - # Delete original mat to avoid OOM - del mat - get_accelerator().empty_cache() - output = torch.zeros( - elements_per_gpu, - dtype=getattr(torch, - args.dtype)).to(get_accelerator().device_name(local_rank)) - except RuntimeError as e: - if 'out of memory' in str(e): - if dist.get_rank() == 0: - print( - 'WARNING: Ran out of GPU memory. Try to reduce the --mem-factor argument!' 
- ) - sync_all() - return - sync_all() - - if args.debug: - for i in range(world_size): - if i == global_rank: - print(f"Before AllToAll Input List at rank {global_rank}: {input}") - dist.barrier() - - timed_all_to_all(input, output, args) - - if args.debug: - for i in range(world_size): - if i == global_rank: - print(f"AllToAll Results at rank {global_rank}: {output}") - dist.barrier() - - -if __name__ == "__main__": - args = benchmark_parser().parse_args() - rank = args.local_rank - init_processes(local_rank=rank, args=args) - run_all_to_all(local_rank=rank, args=args) diff --git a/benchmarks/communication/broadcast.py b/benchmarks/communication/broadcast.py deleted file mode 100644 index 633e46638fac775920b89b0fc2ab9b0f4401dc79..0000000000000000000000000000000000000000 --- a/benchmarks/communication/broadcast.py +++ /dev/null @@ -1,114 +0,0 @@ -'''Copyright The Microsoft DeepSpeed Team''' - -import torch -from benchmarks.communication.utils import * -from benchmarks.communication.constants import * -from deepspeed.accelerator import get_accelerator - -import time - - -def timed_broadcast(input, args): - if args.dist == 'torch': - import torch.distributed as dist - elif args.dist == 'deepspeed': - import deepspeed.comm as dist - - sync_all() - # Warmups, establish connections, etc. - for i in range(args.warmups): - dist.broadcast(input, 0, async_op=args.async_op) - sync_all() - - # time the actual comm op trials times and average it - pre = time.perf_counter() - for i in range(args.trials): - dist.broadcast(input, 0, async_op=args.async_op) - sync_all() - duration = time.perf_counter() - pre - - # maintain and clean performance data - avg_duration = duration / args.trials - size = input.element_size() * input.nelement() - n = dist.get_world_size() - tput, busbw = get_bw('broadcast', size, avg_duration, args) - tput_str, busbw_str, duration_str = get_metric_strings(args, tput, busbw, avg_duration) - desc = f'{input.nelement()}x{input.element_size()}' - - if not args.raw: - size = convert_size(size) - - print_rank_0( - f"{size:<20} {desc:25s} {duration_str:20s} {tput_str:20s} {busbw_str:20s}") - - -def run_broadcast(local_rank, args): - if args.dist == 'torch': - import torch.distributed as dist - elif args.dist == 'deepspeed': - import deepspeed.comm as dist - - # Prepare benchmark header - print_header(args, 'broadcast') - - world_size = dist.get_world_size() - global_rank = dist.get_rank() - - if args.scan: - M_LIST = [] - for x in (2**p for p in range(1, args.maxsize)): - M_LIST.append(x) - - sync_all() - # loop over various tensor sizes - for M in M_LIST: - global_rank = dist.get_rank() - try: - mat = torch.ones(world_size, - M, - dtype=getattr( - torch, - args.dtype)).to( - get_accelerator().device_name(local_rank)) - sync_all() - input = ((mat.mul_(float(global_rank))).view(-1)) - except RuntimeError as e: - if 'out of memory' in str(e): - if dist.get_rank() == 0: - print('WARNING: Ran out of GPU memory. Exiting comm op.') - sync_all() - break - sync_all() - timed_broadcast(input, args) - else: - # Send the biggest message size our GPUs can fit. 
If you're facing OOM errors, reduce the mem_factor - # Don't need output tensor, so we double mem_factor - elements_per_gpu = max_numel(comm_op='broadcast', - dtype=getattr(torch, - args.dtype), - mem_factor=args.mem_factor * 2, - local_rank=local_rank, - args=args) - try: - mat = torch.ones(elements_per_gpu, - dtype=getattr(torch, - args.dtype)).to( - get_accelerator().device_name(local_rank)) - input = ((mat.mul_(float(global_rank))).view(-1)) - except RuntimeError as e: - if 'out of memory' in str(e): - if dist.get_rank() == 0: - print( - 'WARNING: Ran out of GPU memory. Try to reduce the --mem-factor argument!' - ) - sync_all() - return - sync_all() - timed_broadcast(input, args) - - -if __name__ == "__main__": - args = benchmark_parser().parse_args() - rank = args.local_rank - init_processes(local_rank=rank, args=args) - run_broadcast(local_rank=rank, args=args) diff --git a/benchmarks/communication/constants.py b/benchmarks/communication/constants.py deleted file mode 100644 index 935927acd174256fe7cd552a3181977e2dfdb7d8..0000000000000000000000000000000000000000 --- a/benchmarks/communication/constants.py +++ /dev/null @@ -1,10 +0,0 @@ -'''Copyright The Microsoft DeepSpeed Team''' -from deepspeed.accelerator import get_accelerator - -DEFAULT_WARMUPS = 5 -DEFAULT_TRIALS = 50 -DEFAULT_TYPE = 'float' -DEFAULT_BACKEND = get_accelerator().communication_backend_name() -DEFAULT_UNIT = 'Gbps' -DEFAULT_DIST = 'deepspeed' -DEFAULT_MAXSIZE = 24 diff --git a/benchmarks/communication/pt2pt.py b/benchmarks/communication/pt2pt.py deleted file mode 100644 index 1c890fc42e93585a067ef815f163a96d069096ef..0000000000000000000000000000000000000000 --- a/benchmarks/communication/pt2pt.py +++ /dev/null @@ -1,132 +0,0 @@ -'''Copyright The Microsoft DeepSpeed Team''' - -from benchmarks.communication.utils import * -from benchmarks.communication.constants import * -from deepspeed.accelerator import get_accelerator - -import time - - -def timed_pt2pt(input, args): - if args.dist == 'torch': - import torch.distributed as dist - elif args.dist == 'deepspeed': - import deepspeed.comm as dist - - sync_all() - # Warmups, establish connections, etc. 
- for i in range(args.warmups): - if dist.get_rank() == 0: - if args.async_op: - dist.isend(input, 1) - else: - dist.send(input, 1) - if dist.get_rank() == 1: - if args.async_op: - dist.irecv(input, src=0) - else: - dist.recv(input, src=0) - sync_all() - - # time the actual comm op trials times and average it - pre = time.perf_counter() - for i in range(args.trials): - if dist.get_rank() == 0: - if args.async_op: - dist.isend(input, 1) - else: - dist.send(input, 1) - if dist.get_rank() == 1: - if args.async_op: - dist.irecv(input, src=0) - else: - dist.recv(input, src=0) - - sync_all() - duration = time.perf_counter() - pre - - # maintain and clean performance data - avg_duration = duration / args.trials - size = input.element_size() * input.nelement() - n = dist.get_world_size() - tput, busbw = get_bw('pt2pt', size, avg_duration, args) - tput_str, busbw_str, duration_str = get_metric_strings(args, tput, busbw, avg_duration) - desc = f'{input.nelement()}x{input.element_size()}' - - if not args.raw: - size = convert_size(size) - - print_rank_0( - f"{size:<20} {desc:25s} {duration_str:20s} {tput_str:20s} {busbw_str:20s}") - - -def run_pt2pt(local_rank, args): - if args.dist == 'torch': - import torch.distributed as dist - elif args.dist == 'deepspeed': - import deepspeed.comm as dist - - # Prepare benchmark header - print_header(args, 'pt2pt') - global_rank = dist.get_rank() - world_size = dist.get_world_size() - - if args.scan: - # Create list of message sizes - M_LIST = [] - for x in (2**p for p in range(1, args.maxsize)): - M_LIST.append(x) - - sync_all() - # loop over various tensor sizes - for M in M_LIST: - global_rank = dist.get_rank() - try: - mat = torch.ones(world_size, - M, - dtype=getattr( - torch, - args.dtype)).to( - get_accelerator().device_name(local_rank)) - sync_all() - input = ((mat.mul_(float(global_rank))).view(-1)) - except RuntimeError as e: - if 'out of memory' in str(e): - if dist.get_rank() == 0: - print('WARNING: Ran out of GPU memory. Exiting comm op.') - sync_all() - break - sync_all() - timed_pt2pt(input, args) - else: - # Send the biggest message size our GPUs can fit. If you're facing OOM errors, reduce the mem_factor - # Don't need output tensor, so double mem_factor - elements_per_gpu = max_numel(comm_op='pt2pt', - dtype=getattr(torch, - args.dtype), - mem_factor=args.mem_factor * 2, - local_rank=local_rank, - args=args) - try: - mat = torch.ones(elements_per_gpu, - dtype=getattr(torch, - args.dtype)).to( - get_accelerator().device_name(local_rank)) - input = ((mat.mul_(float(global_rank))).view(-1)) - except RuntimeError as e: - if 'out of memory' in str(e): - if dist.get_rank() == 0: - print( - 'WARNING: Ran out of GPU memory. Try to reduce the --mem-factor argument!' 
- ) - sync_all() - return - sync_all() - timed_pt2pt(input, args) - - -if __name__ == "__main__": - args = benchmark_parser().parse_args() - rank = args.local_rank - init_processes(local_rank=rank, args=args) - run_pt2pt(local_rank=rank, args=args) diff --git a/benchmarks/communication/run_all.py b/benchmarks/communication/run_all.py deleted file mode 100644 index 7ec562cc9ae0dcc101477117a6158b54a1f4272a..0000000000000000000000000000000000000000 --- a/benchmarks/communication/run_all.py +++ /dev/null @@ -1,49 +0,0 @@ -'''Copyright The Microsoft DeepSpeed Team''' - -from benchmarks.communication.utils import * -from benchmarks.communication.all_reduce import run_all_reduce -from benchmarks.communication.all_gather import run_all_gather -from benchmarks.communication.all_to_all import run_all_to_all -from benchmarks.communication.pt2pt import run_pt2pt -from benchmarks.communication.broadcast import run_broadcast -from benchmarks.communication.constants import * - - -# For importing -def main(args, rank): - - init_processes(local_rank=rank, args=args) - - ops_to_run = [] - if args.all_reduce: - ops_to_run.append('all_reduce') - if args.all_gather: - ops_to_run.append('all_gather') - if args.broadcast: - ops_to_run.append('broadcast') - if args.pt2pt: - ops_to_run.append('pt2pt') - if args.all_to_all: - ops_to_run.append('all_to_all') - - if len(ops_to_run) == 0: - ops_to_run = ['all_reduce', 'all_gather', 'all_to_all', 'broadcast', 'pt2pt'] - - for comm_op in ops_to_run: - if comm_op == 'all_reduce': - run_all_reduce(local_rank=rank, args=args) - if comm_op == 'all_gather': - run_all_gather(local_rank=rank, args=args) - if comm_op == 'all_to_all': - run_all_to_all(local_rank=rank, args=args) - if comm_op == 'pt2pt': - run_pt2pt(local_rank=rank, args=args) - if comm_op == 'broadcast': - run_broadcast(local_rank=rank, args=args) - - -# For directly calling benchmark -if __name__ == "__main__": - args = benchmark_parser().parse_args() - rank = args.local_rank - main(args, rank) diff --git a/benchmarks/communication/utils.py b/benchmarks/communication/utils.py deleted file mode 100644 index b913dda14fe552cd7dc2f9fb46d878f5366d4c2a..0000000000000000000000000000000000000000 --- a/benchmarks/communication/utils.py +++ /dev/null @@ -1,220 +0,0 @@ -'''Copyright The Microsoft DeepSpeed Team''' - -import torch -import os -import math -import argparse -from benchmarks.communication.constants import * -from deepspeed.accelerator import get_accelerator - -global dist - - -def init_torch_distributed(backend): - global dist - import torch.distributed as dist - torch.distributed.init_process_group(backend) - local_rank = int(os.environ['LOCAL_RANK']) - get_accelerator().set_device(local_rank) - - -def init_deepspeed_comm(backend): - global dist - import deepspeed - import deepspeed.comm as dist - deepspeed.init_distributed(dist_backend=backend) - local_rank = int(os.environ['LOCAL_RANK']) - get_accelerator().set_device(local_rank) - - -def init_processes(local_rank, args): - if args.dist == 'deepspeed': - init_deepspeed_comm(args.backend) - elif args.dist == 'torch': - init_torch_distributed(args.backend) - else: - print_rank_0(f"distributed framework {args.dist} not supported") - exit(0) - - -def print_rank_0(message): - if dist.get_rank() == 0: - print(message) - - -def print_header(args, comm_op): - if comm_op == 'pt2pt': - world_size = 2 - else: - world_size = dist.get_world_size() - tput = f'Throughput ({args.bw_unit})' - busbw = f'BusBW ({args.bw_unit})' - header = f"\n---- Performance of {comm_op} on 
{world_size} devices ---------------------------------------------------------\n" - duration_str = 'Duration' - if args.raw: - duration_str += ' (us)' - header += f"{'Size (Bytes)':20s} {'Description':25s} {duration_str:20s} {tput:20s} {busbw:20s}\n" - header += "----------------------------------------------------------------------------------------------------" - print_rank_0(header) - - -def get_bw(comm_op, size, duration, args): - n = dist.get_world_size() - tput = 0 - busbw = 0 - if comm_op == "all_to_all": - tput = (size / duration) - busbw = (size / duration) * ((n - 1) / n) - elif comm_op == "all_gather": - size *= n - tput = (size / duration) - busbw = (size / duration) * ((n - 1) / n) - elif comm_op == "all_reduce": - tput = (size * 2 / duration) - busbw = (size / duration) * (2 * (n - 1) / n) - elif comm_op == "pt2pt" or comm_op == "broadcast": - tput = (size / duration) - busbw = tput - else: - print_rank_0("wrong comm_op specified") - exit(0) - - if args.bw_unit == 'Gbps': - tput *= 8 - busbw *= 8 - - return tput, busbw - - -def get_metric_strings(args, tput, busbw, duration): - duration_ms = duration * 1e3 - duration_us = duration * 1e6 - tput = f'{tput / 1e9:.3f}' - busbw = f'{busbw /1e9:.3f}' - - if duration_us < 1e3 or args.raw: - duration = f'{duration_us:.3f}' - if not args.raw: - duration += ' us' - else: - duration = f'{duration_ms:.3f} ms' - return tput, busbw, duration - - -def sync_all(): - get_accelerator().synchronize() - dist.barrier() - - -def max_numel(comm_op, dtype, mem_factor, local_rank, args): - dtype_size = _element_size(dtype) - max_memory_per_gpu = get_accelerator().total_memory(local_rank) * mem_factor - if comm_op == 'all_reduce' or comm_op == 'pt2pt' or comm_op == 'broadcast': - elements_per_gpu = int(max_memory_per_gpu // dtype_size) - elif comm_op == 'all_gather': - # all_gather performance is lower for non-powers of two, and the output buffer size scales with world size - # Therefore, divide by world size and round down to nearest power of 2 - elements_per_gpu = int(max_memory_per_gpu // dtype_size // dist.get_world_size()) - elements_per_gpu = int(pow(2, int(math.log(elements_per_gpu, 2)))) - elif comm_op == 'all_to_all': - # Number of elements must be divisible by world_size - # all_to_all performance is lower for non-powers of two. Round down like all_gather. - elements_per_gpu = int(max_memory_per_gpu // dtype_size) - elements_per_gpu = int(dist.get_world_size() * - round(elements_per_gpu / dist.get_world_size())) - elements_per_gpu = int(pow(2, int(math.log(elements_per_gpu, 2)))) - else: - print(f"This communication operation: {comm_op} is not supported yet") - exit(0) - return elements_per_gpu - - -# Helper function to pretty-print message sizes -def convert_size(size_bytes): - if size_bytes == 0: - return "0B" - size_name = ("B", "KB", "MB", "GB", "TB", "PB", "EB", "ZB", "YB") - i = int(math.floor(math.log(size_bytes, 1024))) - p = math.pow(1024, i) - s = round(size_bytes / p, 2) - return "%s %s" % (s, size_name[i]) - - -# Copied from torch. Need to add the func here for old torch compatibility. 
-def _element_size(dtype): - """ - Returns the element size for a dtype, in bytes - """ - if not isinstance(dtype, torch.dtype): - raise RuntimeError(f'expected torch.dtype, but got {type(dtype)}') - - if dtype.is_complex: - return torch.finfo(dtype).bits >> 2 - elif dtype.is_floating_point: - return torch.finfo(dtype).bits >> 3 - elif dtype == torch.bool: - # NOTE: torch.bool is not supported in torch.iinfo() - return 1 - else: - return torch.iinfo(dtype).bits >> 3 - - -def benchmark_parser(): - parser = argparse.ArgumentParser() - parser.add_argument("--local_rank", type=int) - parser.add_argument("--trials", - type=int, - default=DEFAULT_TRIALS, - help='Number of timed iterations') - parser.add_argument("--warmups", - type=int, - default=DEFAULT_WARMUPS, - help='Number of warmup (non-timed) iterations') - parser.add_argument("--maxsize", - type=int, - default=24, - help='Max message size as a power of 2') - parser.add_argument("--async-op", - action="store_true", - help='Enables non-blocking communication') - parser.add_argument("--bw-unit", - type=str, - default=DEFAULT_UNIT, - choices=['Gbps', - 'GBps']) - parser.add_argument("--backend", - type=str, - default=DEFAULT_BACKEND, - choices=['nccl', - 'ccl'], - help='Communication library to use') - parser.add_argument("--dist", - type=str, - default=DEFAULT_DIST, - choices=['deepspeed', - 'torch'], - help='Distributed DL framework to use') - parser.add_argument("--scan", - action="store_true", - help='Enables scanning all message sizes') - parser.add_argument("--raw", - action="store_true", - help='Print the message size and latency without units') - parser.add_argument("--all-reduce", action="store_true", help='Run all_reduce') - parser.add_argument("--all-gather", action="store_true", help='Run all_gather') - parser.add_argument("--all-to-all", action="store_true", help='Run all_to_all') - parser.add_argument("--pt2pt", action="store_true", help='Run pt2pt') - parser.add_argument("--broadcast", action="store_true", help='Run broadcast') - parser.add_argument("--dtype", - type=str, - default=DEFAULT_TYPE, - help='PyTorch tensor dtype') - parser.add_argument( - "--mem-factor", - type=float, - default=.4, - help='Proportion of max available GPU memory to use for single-size evals') - parser.add_argument("--debug", - action="store_true", - help='Enables all_to_all debug prints') - return parser diff --git a/benchmarks/inference/bert-bench.py b/benchmarks/inference/bert-bench.py deleted file mode 100644 index 9d586d033cd7b375f5e82dfef53445199846630f..0000000000000000000000000000000000000000 --- a/benchmarks/inference/bert-bench.py +++ /dev/null @@ -1,92 +0,0 @@ -'''Copyright The Microsoft DeepSpeed Team''' - -import torch -import time -import deepspeed -import argparse -from transformers import pipeline -from deepspeed.accelerator import get_accelerator - -parser = argparse.ArgumentParser() -parser.add_argument("--model", "-m", type=str, help="hf model name") -parser.add_argument("--deepspeed", action="store_true", help="use deepspeed inference") -parser.add_argument("--dtype", type=str, default="fp16", help="fp16 or fp32") -parser.add_argument("--max-tokens", type=int, default=50, help="max new tokens") -parser.add_argument("--local_rank", type=int, default=0, help="local rank") -parser.add_argument("--trials", type=int, default=30, help="number of trials") -parser.add_argument("--kernel-inject", action="store_true", help="inject kernels on") -parser.add_argument("--graphs", action="store_true", help="CUDA Graphs on") -args = 
parser.parse_args() - - -def print_latency(latency_set, title, warmup=3): - # trim warmup queries - latency_set = latency_set[warmup:] - count = len(latency_set) - if count > 0: - latency_set.sort() - n50 = (count - 1) * 0.5 + 1 - n90 = (count - 1) * 0.9 + 1 - n95 = (count - 1) * 0.95 + 1 - n99 = (count - 1) * 0.99 + 1 - n999 = (count - 1) * 0.999 + 1 - - avg = sum(latency_set) / count - p50 = latency_set[int(n50) - 1] - p90 = latency_set[int(n90) - 1] - p95 = latency_set[int(n95) - 1] - p99 = latency_set[int(n99) - 1] - p999 = latency_set[int(n999) - 1] - - print(f"====== latency stats {title} ======") - print("\tAvg Latency: {0:8.2f} ms".format(avg * 1000)) - print("\tP50 Latency: {0:8.2f} ms".format(p50 * 1000)) - print("\tP90 Latency: {0:8.2f} ms".format(p90 * 1000)) - print("\tP95 Latency: {0:8.2f} ms".format(p95 * 1000)) - print("\tP99 Latency: {0:8.2f} ms".format(p99 * 1000)) - print("\t999 Latency: {0:8.2f} ms".format(p999 * 1000)) - - -deepspeed.init_distributed() - -print(args.model, args.max_tokens, args.dtype) - -if args.dtype.lower() == "fp16": - dtype = torch.float16 -else: - dtype = torch.float32 - -pipe = pipeline("fill-mask", model=args.model, framework="pt", device=args.local_rank) - -if dtype == torch.half: - pipe.model.half() - -mask = pipe.tokenizer.mask_token - -br = pipe(f"Hello I'm a {mask} model") -if args.deepspeed: - pipe.model = deepspeed.init_inference(pipe.model, - dtype=dtype, - mp_size=1, - replace_with_kernel_inject=args.kernel_inject, - enable_cuda_graph=args.graphs) - pipe.model.profile_model_time() - -responses = [] -times = [] -mtimes = [] -for i in range(args.trials): - get_accelerator().synchronize() - start = time.time() - r = pipe(f"Hello I'm a {mask} model") - get_accelerator().synchronize() - end = time.time() - responses.append(r) - times.append((end - start)) - mtimes += pipe.model.model_times() - #print(f"{pipe.model.model_times()=}") - -print_latency(times, "e2e latency") -print_latency(mtimes, "model latency") - -print(responses[0:3]) diff --git a/benchmarks/inference/collect_results.py b/benchmarks/inference/collect_results.py deleted file mode 100644 index 0e51033114db848d2d2ff14b2f33b009a2090672..0000000000000000000000000000000000000000 --- a/benchmarks/inference/collect_results.py +++ /dev/null @@ -1,147 +0,0 @@ -'''Copyright The Microsoft DeepSpeed Team''' - -import os -import re -import argparse -import pandas as pd - -parser = argparse.ArgumentParser() -parser.add_argument( - "--results-dir", - "-r", - type=str, - default="./results", - help="directory containing sweep results", -) -parser.add_argument("--version", - "-v", - type=int, - default=0, - help="version to be collected") -parser.add_argument("--gen-text-n", - "-n", - type=int, - default=1, - help="expected number of generated text") -parser.add_argument("--output", - "-o", - type=str, - default="./results.csv", - help="output file") -args = parser.parse_args() - - -def get_branch(file_path): - match = re.match(r".*\/(.*)\.log", file_path) - if match is None: - return False - else: - return match.groups()[0] - - -def get_benchmark_params(root_dir, file_path): - match = re.match( - rf"{root_dir}\/(.+?)_(fp\d+)_(true|false)_(true|false)_(\d+)gpus_v(\d+)\/", - file_path, - ) - if match is None: - return False - else: - model, dtype, graphs, kernel, gpus, version = match.groups() - bool_dict = {"true": True, "false": False} - return { - "model": model, - "dtype": dtype, - "graphs": bool_dict[graphs.lower()], - "kernel": bool_dict[kernel.lower()], - "gpus": int(gpus), - "version": 
int(version), - } - - -def get_perf_data(file_content): - matches = re.findall(r"\s+(.+?)\sLatency:\s+(\d+\.\d+)\sms", file_content) - if matches is []: - return False - else: - return {f"latency-{key}": float(val) for key, val in matches} - - -def get_generated_text(file_content, gen_text_n): - file_content = file_content.replace("\n", " ") - file_content = file_content.replace("\t", " ") - matches = re.findall(r"RESPONSE\s(\d+):\s+[-]{30}\s+(.+?)\s+[-]{30}", file_content) - if len(matches) != gen_text_n: - return False - else: - return {f"generated-text-{key}": val for key, val in matches} - - -def get_error(file_content): - matches = re.findall(r"Error:\s+(.+?)\n", file_content) - if matches is []: - return False - else: - return {f"error": val for val in matches} - - -if __name__ == "__main__": - # List to collect data from all benchmarks - benchmarks_data = [] - - # Walk through directory of results from sweep.sh - for root, dirs, files in os.walk(args.results_dir): - # Because of how some models are named, the dir structure for results can vary, e.g.: - # "EleutherAI/gpt-neo_*/baseline.log" versus "gpt2_*/baseline.log" - if dirs: - continue - - # Get data from baseline and each tested branch - for name in files: - file_path = os.path.join(root, name) - - branch = get_branch(file_path) - if not branch: - print(f"WARNING: Could not detect branch for file {file_path}, skipping") - continue - - params = get_benchmark_params(args.results_dir, file_path) - if not params: - print( - f"WARNING: Could not detect benchmark settings for file {file_path}, skipping" - ) - continue - - # Verify that the version matches that which we want to collect - if params["version"] != args.version: - continue - - with open(file_path, "r") as f: - file_content = f.read() - - perf_data = get_perf_data(file_content) - if not perf_data: - print( - f"WARNING: Could not detect benchmark performance data for file {file_path}" - ) - - generated_text = get_generated_text(file_content, args.gen_text_n) - if not generated_text: - print(f"WARNING: Could not detect generated text for file {file_path}") - - error = get_error(file_content) - if error: - print(f"Error found in {file_path}, collecting error info...") - benchmarks_data.append({"branch": branch, **params, **error}) - continue - - benchmarks_data.append({ - "branch": branch, - **params, - **perf_data, - **generated_text - }) - - # Convert to a DataFrame and save - benchmarks_df = pd.DataFrame(benchmarks_data) - benchmarks_df.to_csv(args.output) diff --git a/benchmarks/inference/gpt-bench.py b/benchmarks/inference/gpt-bench.py deleted file mode 100644 index 29578b30cf1faf1638bbd0a84865ee9c283e9443..0000000000000000000000000000000000000000 --- a/benchmarks/inference/gpt-bench.py +++ /dev/null @@ -1,124 +0,0 @@ -'''Copyright The Microsoft DeepSpeed Team''' - -import os -import torch -import time -import deepspeed -import argparse -from transformers import pipeline -from deepspeed.accelerator import get_accelerator - -parser = argparse.ArgumentParser() -parser.add_argument("--model", "-m", type=str, help="hf model name") -parser.add_argument("--deepspeed", action="store_true", help="use deepspeed inference") -parser.add_argument("--dtype", - type=str, - default="fp16", - choices=["fp16", - "fp32", - "int8"], - help="int8, fp16, or fp32") -parser.add_argument("--graphs", action="store_true", help="CUDA Graphs on") -parser.add_argument("--kernel-inject", action="store_true", help="inject kernels on") -parser.add_argument("--max-tokens", type=int, default=50, 
help="max new tokens") -parser.add_argument("--local_rank", - type=int, - default=int(os.getenv("LOCAL_RANK", - "0")), - help="local rank") -parser.add_argument("--world_size", - type=int, - default=int(os.getenv("WORLD_SIZE", - "1")), - help="world size") -parser.add_argument("--trials", type=int, default=30, help="number of trials") -args = parser.parse_args() - - -def print_latency(latency_set, title, warmup=3): - # trim warmup queries - latency_set = list(latency_set) - latency_set = latency_set[warmup:] - count = len(latency_set) - if count > 0: - latency_set.sort() - n50 = (count - 1) * 0.5 + 1 - n90 = (count - 1) * 0.9 + 1 - n95 = (count - 1) * 0.95 + 1 - n99 = (count - 1) * 0.99 + 1 - n999 = (count - 1) * 0.999 + 1 - - avg = sum(latency_set) / count - p50 = latency_set[int(n50) - 1] - p90 = latency_set[int(n90) - 1] - p95 = latency_set[int(n95) - 1] - p99 = latency_set[int(n99) - 1] - p999 = latency_set[int(n999) - 1] - - print(f"====== latency stats {title} ======") - print("\tAvg Latency: {0:8.2f} ms".format(avg * 1000)) - print("\tP50 Latency: {0:8.2f} ms".format(p50 * 1000)) - print("\tP90 Latency: {0:8.2f} ms".format(p90 * 1000)) - print("\tP95 Latency: {0:8.2f} ms".format(p95 * 1000)) - print("\tP99 Latency: {0:8.2f} ms".format(p99 * 1000)) - print("\t999 Latency: {0:8.2f} ms".format(p999 * 1000)) - - -deepspeed.init_distributed() - -if args.local_rank == 0: - print("BENCHMARK SETTINGS:") - print(f"\tMODEL: {args.model}") - print(f"\tMAX_TOKENS: {args.max_tokens}") - print(f"\tDTYPE: {args.dtype}") - print(f"\tCUDA_GRAPHS: {args.graphs}") - print(f"\tKERNEL_INJECT: {args.kernel_inject}") - -if args.dtype == "int8": - dtype = torch.int8 -elif args.dtype == "fp16": - dtype = torch.float16 -else: - dtype = torch.float32 - -pipe = pipeline("text-generation", - model=args.model, - framework="pt", - device=args.local_rank) - -if dtype == torch.float16: - pipe.model.half() - -if args.deepspeed: - pipe.model = deepspeed.init_inference( - pipe.model, - dtype=dtype, - mp_size=args.world_size, - replace_with_kernel_inject=args.kernel_inject, - enable_cuda_graph=args.graphs, - ) - pipe.model.profile_model_time() - -responses = [] -times = [] -mtimes = [] -for i in range(args.trials): - get_accelerator().synchronize() - start = time.time() - r = pipe("DeepSpeed is", do_sample=False, max_new_tokens=args.max_tokens) - get_accelerator().synchronize() - end = time.time() - responses.append(r) - times.append(end - start) # / (args.max_tokens - 3)) - mtimes.append(sum(pipe.model.model_times())) - -if args.local_rank == 0: - print_latency(times, "(e2e) latency") - print_latency(mtimes, "(model-only) latency") - print_latency(map(lambda t: t / (args.max_tokens - 3), - times), - "(e2e) per token latency") - print(f"RESPONSE 0:") - print("-" * 30) - print(responses[0][0]["generated_text"]) - print("-" * 30) diff --git a/benchmarks/inference/requirements.txt b/benchmarks/inference/requirements.txt deleted file mode 100644 index 00899dd5f4858229e4115fd2b80b7807636892bd..0000000000000000000000000000000000000000 --- a/benchmarks/inference/requirements.txt +++ /dev/null @@ -1 +0,0 @@ -transformers>=4.21.3 diff --git a/benchmarks/inference/run_model.sh b/benchmarks/inference/run_model.sh deleted file mode 100644 index 8e5fe3ac0133150a5de05f76da4951a2ead6be58..0000000000000000000000000000000000000000 --- a/benchmarks/inference/run_model.sh +++ /dev/null @@ -1,36 +0,0 @@ -set -x - -model=$1 -branch1=$2 -branch2=$3 -dtype=$4 -graphs=$5 -kernel=$6 -gpus=$7 - -version=0 
-log_path=results/${model}_${dtype}_${graphs}_${kernel}_${gpus}gpus_v${version} -mkdir -p ${log_path} - -params="--dtype $dtype " -if [[ "$graphs" == "true" ]]; then - params+="--graphs " -fi -if [[ "$kernel" == "true" ]]; then - params+="--kernel " -fi - -echo "baseline $log_path" -deepspeed --num_gpus 1 gpt-bench.py -m "${model}" $params &> ${log_path}/baseline.log - -cd ../../ -git checkout ${branch1} -cd - -echo "ds ${branch1} $log_path" -deepspeed --num_gpus $gpus gpt-bench.py --deepspeed -m "${model}" $params &> ${log_path}/ds-${branch1}.log - -cd ../../ -git checkout ${branch2} -cd - -echo "ds ${branch2} $log_path" -deepspeed --num_gpus $gpus gpt-bench.py --deepspeed -m "${model}" $params&> ${log_path}/ds-${branch2}.log diff --git a/benchmarks/inference/sweep.sh b/benchmarks/inference/sweep.sh deleted file mode 100644 index aabcb0bfdbd89e2eedd97d3e6afa74c0e50e7803..0000000000000000000000000000000000000000 --- a/benchmarks/inference/sweep.sh +++ /dev/null @@ -1,41 +0,0 @@ -set -x - -export TRANSFORMERS_CACHE=/tmp/hf-cache - -branch1=$1 -branch2=$2 - -gptneo_models="EleutherAI/gpt-neo-2.7B EleutherAI/gpt-neo-1.3B EleutherAI/gpt-neo-125M" -gpt2_models="gpt2 gpt2-large gpt2-xl" -gptj_models="EleutherAI/gpt-j-6B" -opt_models="facebook/opt-125m facebook/opt-1.3b facebook/opt-2.7b facebook/opt-6.7b facebook/opt-13b" -bloom_models="bigscience/bloom-560m bigscience/bloom-1b7 bigscience/bloom-3b bigscience/bloom-7b1" - -for gpus in `echo "1 2 4 8"`; do - for dtype in `echo "fp16 fp32"`; do - for graphs in `echo "true false"`; do - for kernel in `echo "true false"`; do - params="$dtype $graphs $kernel $gpus" - for m in `echo "$gptneo_models"`; do - bash run_model.sh $m $branch1 $branch2 $params - done - - for m in `echo "$gpt2_models"`; do - bash run_model.sh $m $branch1 $branch2 $params - done - - for m in `echo "$gptj_models"`; do - bash run_model.sh $m $branch1 $branch2 $params - done - - for m in `echo "$opt_models"`; do - bash run_model.sh $m $branch1 $branch2 $params - done - - for m in `echo "$bloom_models"`; do - bash run_model.sh $m $branch1 $branch2 $params - done - done - done - done -done diff --git a/bin/ds_elastic b/bin/ds_elastic index c9987d4565da3cb4c7e32b8342c201ba0165e030..1c78aea888946756d2f586f7be604c8140b5d09c 100755 --- a/bin/ds_elastic +++ b/bin/ds_elastic @@ -9,11 +9,7 @@ from deepspeed.elasticity import compute_elastic_config if __name__ == '__main__': parser = argparse.ArgumentParser() parser.add_argument('-c', '--config', type=str, help="DeepSpeed config json") - parser.add_argument('-w', - '--world-size', - type=int, - default=0, - help="Intended/current world size") + parser.add_argument('-w', '--world-size', type=int, default=0, help="Intended/current world size") args = parser.parse_args() ds_config = json.load(open(args.config, 'r')) @@ -26,7 +22,9 @@ if __name__ == '__main__': print(json.dumps(elastic_config, indent=4, sort_keys=True)) if args.world_size > 0: - final_batch_size, valid_gpus, micro_batch_size = compute_elastic_config(ds_config=ds_config, target_deepspeed_version=ds_version, world_size=args.world_size) + final_batch_size, valid_gpus, micro_batch_size = compute_elastic_config(ds_config=ds_config, + target_deepspeed_version=ds_version, + world_size=args.world_size) print('------------------------------------------') print(f"Calculated results for world size {args.world_size}:") print('------------------------------------------') diff --git a/blogs/README.md b/blogs/README.md new file mode 100644 index 
0000000000000000000000000000000000000000..c5c72fd4efb021390295788a3a19cf4710d9d68c --- /dev/null +++ b/blogs/README.md @@ -0,0 +1 @@ +All DeepSpeed blogs are linked here: diff --git a/blogs/assets/images/Throughputs-OPT.png b/blogs/assets/images/Throughputs-OPT.png new file mode 100755 index 0000000000000000000000000000000000000000..9b968319f759d161ca6bbee7cc3524f58fcce933 Binary files /dev/null and b/blogs/assets/images/Throughputs-OPT.png differ diff --git a/blogs/assets/images/ds-chat-overview.png b/blogs/assets/images/ds-chat-overview.png new file mode 100755 index 0000000000000000000000000000000000000000..479ca4bb2ad72319dc293340ca9dce9bf59085c8 Binary files /dev/null and b/blogs/assets/images/ds-chat-overview.png differ diff --git a/blogs/assets/images/figure3.png b/blogs/assets/images/figure3.png new file mode 100755 index 0000000000000000000000000000000000000000..44b8e67bd6b950ad072443b7969c82aed90267f6 Binary files /dev/null and b/blogs/assets/images/figure3.png differ diff --git a/blogs/assets/images/figure4.png b/blogs/assets/images/figure4.png new file mode 100755 index 0000000000000000000000000000000000000000..dca56637049cd80759bc39798bb8fa9cbf1d0921 Binary files /dev/null and b/blogs/assets/images/figure4.png differ diff --git a/blogs/assets/images/figure5.png b/blogs/assets/images/figure5.png new file mode 100755 index 0000000000000000000000000000000000000000..6282c0d19ed1841bcaf1b86e335db806bdcf484c Binary files /dev/null and b/blogs/assets/images/figure5.png differ diff --git a/blogs/assets/images/figure6.png b/blogs/assets/images/figure6.png new file mode 100755 index 0000000000000000000000000000000000000000..8e60773b5709bd39ac400e1a1c68f827207e48d8 Binary files /dev/null and b/blogs/assets/images/figure6.png differ diff --git a/blogs/assets/images/figure7.png b/blogs/assets/images/figure7.png new file mode 100755 index 0000000000000000000000000000000000000000..9b400b11efb0d38c3d3241ac9a8615934a9bb7eb Binary files /dev/null and b/blogs/assets/images/figure7.png differ diff --git a/blogs/assets/images/hero-figure-black-ja.png b/blogs/assets/images/hero-figure-black-ja.png new file mode 100644 index 0000000000000000000000000000000000000000..5c6cf05d3e7a2d47c2d0dfb7ac25653024bcf8b4 Binary files /dev/null and b/blogs/assets/images/hero-figure-black-ja.png differ diff --git a/blogs/assets/images/hero-figure-black.png b/blogs/assets/images/hero-figure-black.png new file mode 100755 index 0000000000000000000000000000000000000000..6a05f4b27bb9532aed6eae373488aa67825bc069 Binary files /dev/null and b/blogs/assets/images/hero-figure-black.png differ diff --git a/blogs/assets/images/hybrid-engine.png b/blogs/assets/images/hybrid-engine.png new file mode 100755 index 0000000000000000000000000000000000000000..5e501108a5b7a29b137ec231f5bf83c51b3f0480 Binary files /dev/null and b/blogs/assets/images/hybrid-engine.png differ diff --git a/blogs/assets/images/mascot.png b/blogs/assets/images/mascot.png new file mode 100755 index 0000000000000000000000000000000000000000..e9f7a354fc85ede378c7007692fc5ae6e0258636 Binary files /dev/null and b/blogs/assets/images/mascot.png differ diff --git a/blogs/deepspeed-chat/README.md b/blogs/deepspeed-chat/README.md new file mode 100644 index 0000000000000000000000000000000000000000..de3b886e88d7961d1ff95533da07b8777b3daf7b --- /dev/null +++ b/blogs/deepspeed-chat/README.md @@ -0,0 +1,321 @@ +
+ +# DeepSpeed Chat: Easy, Fast and Affordable RLHF Training of ChatGPT-like Models at All Scales + +
+ +
+ +DeepSpeed-Chat! + +
+
+# 1. Overview
+
+ChatGPT-like models have taken the AI world by storm, and it would not be an overstatement to say that their impact on the digital world has been revolutionary. These models are incredibly versatile, capable of performing tasks like summarization, coding, and translation with results that are on par with, or even exceed, the capabilities of human experts. Given the sheer power of these models, multiple efforts are underway in the AI open-source community to make ChatGPT-style models more accessible (e.g. ChatLLaMa, Alpaca, Vicuna, Databricks-Dolly, etc.).
+
+Despite these incredible efforts, there is still a lack of an end-to-end RLHF pipeline capable of training powerful ChatGPT-like models that is easily accessible to the AI community. For instance, training a modest 6.7B ChatGPT model with existing systems typically requires an expensive multi-GPU setup that is beyond the reach of many data scientists. Even with access to such computing resources, training efficiency is often less than 5% of what these machines are capable of (as illustrated [shortly](#effective-throughput-and-scalability-analysis)). And finally, existing solutions simply cannot support easy, fast and affordable training of state-of-the-art ChatGPT models with hundreds of billions of parameters, even given access to multi-GPU clusters.
+
+These limitations stem from the lack of a robust system design capable of effectively supporting InstructGPT’s complex RLHF training pipeline, which is quite different from the standard pre-training and fine-tuning pipelines that existing DL systems are designed for. Therefore, in the spirit of democratizing ChatGPT-like models, and making RLHF training truly accessible to the AI community, today we are releasing DeepSpeed-Chat with the following three capabilities:
+
+(i) ***Easy-to-use Training and Inference Experience for ChatGPT-Like Models***: A single script capable of taking a pre-trained Huggingface model, running it through all three steps of InstructGPT training using the DeepSpeed-RLHF system, and producing your very own ChatGPT-like model. In addition, we provide an inference API for testing conversation-style interactions after the model is trained.
+
+(ii) ***DeepSpeed-RLHF Pipeline***: The DeepSpeed-RLHF pipeline primarily replicates the training pipeline from the InstructGPT paper with careful attention to completeness and one-to-one correspondence with its three steps: a) Supervised Fine-tuning (SFT), b) Reward Model Fine-tuning and c) Reinforcement Learning with Human Feedback (RLHF). Additionally, we offer data abstraction and blending capabilities to enable training with multiple data sources.
+
+(iii) ***DeepSpeed-RLHF System***: A robust and sophisticated RLHF system that combines the training and inference prowess of DeepSpeed into a single unified Hybrid Engine (DeepSpeed-HE) for RLHF. The Hybrid Engine can seamlessly transition between inference and training modes within RLHF, allowing it to leverage various optimizations from DeepSpeed-Inference, such as tensor parallelism and high-performance transformer kernels for generation, while also benefiting from the multitude of ZeRO- and LoRA-based memory optimization strategies for RL training. DeepSpeed-HE is also aware of the full RLHF pipeline, allowing it to make optimal decisions in terms of memory management and data movement across the different phases of RLHF.
+
+
+The DeepSpeed-RLHF system delivers unparalleled efficiency at scale, making complex RLHF training fast, affordable, and easily accessible to the AI community:
+
+***Efficiency and Affordability***: In terms of efficiency, [DeepSpeed-HE is over 15x faster than existing systems](#throughput-and-model-size-scalability-comparisons-with-existing-rlhf-systems), making RLHF training both fast and affordable. For instance, DeepSpeed-HE can train an OPT-13B model in just 9 hours and an OPT-30B model in 18 hours on Azure Cloud for under $300 and $600, respectively.
+
+
+| GPUs         | OPT-6.7B | OPT-13B | OPT-30B | OPT-66B |
+|--------------|:--------:|:-------:|:-------:|:-------:|
+| 8x A100-40GB | 5.7 hours | 10.8 hours | 1.85 days | NA |
+| 8x A100-80GB | 4.1 hours ($132) | 9 hours ($290) | 18 hours ($580) | 2.1 days ($1620) |
+
+*Table 1. Single-Node 8x A100: Training Time and Corresponding Approximate Cost on Azure.*
+
+
+
+***Excellent Scalability***: DeepSpeed-HE supports models with hundreds of billions of parameters and can achieve excellent scalability on multi-node multi-GPU systems. As a result, even a 13B model can be trained in 1.25 hours and a massive 175B model can be trained with DeepSpeed-HE in under a day.
+
+
+| GPUs         | OPT-13B | OPT-30B | OPT-66B | OPT-175B |
+|--------------|:-------:|:-------:|:-------:|:--------:|
+| 64x A100-80G | 1.25 hours ($320) | 4 hours ($1024) | 7.5 hours ($1920) | 20 hours ($5120) |
+
+*Table 2. Multi-Node 64x A100-80GB: Training Time and Corresponding Approximate Cost on Azure.*
+
+
+> ***Very Important Details***: The numbers in both tables above are for Step 3 of the training and are based on actual measured training throughput on the DeepSpeed-RLHF curated dataset and training recipe, which trains for one epoch on a total of 135M tokens. We have in total 67.5M query tokens (131.9k queries with sequence length 256) and 67.5M generated tokens (131.9k answers with sequence length 256), and a maximum global batch size per step of 0.5M tokens (1024 query-answer pairs). We urge readers to pay attention to these specifications before making any cost and e2e time comparisons with DeepSpeed-RLHF. See our [benchmark settings](https://github.com/microsoft/DeepSpeedExamples/tree/master/applications/DeepSpeed-Chat/training/step3_rlhf_finetuning/BenckmarkSetting.md) page for more details.
+
+
+***Democratizing RLHF Training***: With just a single GPU, DeepSpeed-HE supports training models with over 13 billion parameters, enabling data scientists without access to multi-GPU systems to create not just toy RLHF models but large and powerful ones that can be used in real-world scenarios.
+
+
+|            | V100 32G | A6000 48G | A100 40G | A100 80G |
+|------------|:--------:|:---------:|:--------:|:--------:|
+| Model Size | OPT-2.7B | OPT-6.7B  | OPT-6.7B | OPT-13B  |
+
+*Table 3. Max Model Size Supported by DeepSpeed-HE on a Single GPU.*
+
+
+Next, we dive deeper into the three capabilities of DeepSpeed-Chat introduced above.
+
+# 2. Easy-to-use ChatGPT Training and Inference Experience
+
+We start with the easy-to-use experience by showing how you can train OPT-13B and then OPT-66B models with the DeepSpeed-RLHF system. If you are short on time, you can even train an OPT-1.3B model on a single consumer-grade GPU in just two hours. We also demonstrate how you can use our DeepSpeed-Chat RLHF API to develop your own custom pipelines.
+
+## Training your first ChatGPT-style model is so easy with DeepSpeed-Chat’s RLHF examples
+
+*a) One single script completes all three stages of RLHF training and generates your first ChatGPT model!*
+
+We use a pretrained OPT-13B as the actor model and OPT-350M as the reward model in the following single script to generate a final 13B ChatGPT-style model:
+
+```bash
+pip install "deepspeed>=0.9.0"
+
+git clone https://github.com/microsoft/DeepSpeedExamples.git
+cd DeepSpeedExamples/applications/DeepSpeed-Chat/
+pip install -r requirements.txt
+
+python train.py --actor-model facebook/opt-13b --reward-model facebook/opt-350m --deployment-type single_node
+```
+
+In about half a day, your 13B model will be fully trained, with its checkpoints ready. The following table shows a breakdown of the training time for each of the three steps:
+
+
+| Model Sizes                      | Step 1 | Step 2 | Step 3 | Total  |
+|----------------------------------|:------:|:------:|:------:|:------:|
+| Actor: OPT-13B, Reward: OPT-350M | 2.5hr  | 0.25hr | 10.8hr | 13.6hr |
+
+*Table 4. E2E time breakdown for training a 13 billion parameter ChatGPT model via DeepSpeed-Chat on a single DGX node with 8 NVIDIA A100-40G GPUs.*
+
+
+*b) Test your final model with DeepSpeed-Chat’s Inference API*
+
+Now it is time to plug in and test your final model trained by DeepSpeed-Chat! You can play with not only Q&A sessions but also conversation-style interactions.
+
+```
+Human:  Do you know Microsoft?
+Assistant:  Yes, I’m a big fan of Microsoft.  What about them?
+Human: Can you explain it to a 6-year old child? I wonder how I should describe it
+Assistant:  Sure, I can try.  Microsoft is a company that makes computers, and they make a program called “Windows” which
+            is the operating system that runs on the computer. It’s like the “brain” of the computer.  It’s where all the
+            programs and files are stored. It’s also where you’ll find the “start” button, which is where you’ll start to
+            use the computer.
+```
+
+## Want to try different model sizes and configurations? You got it!
+
+We understand users often like to play with different model sizes and configurations to meet their training time, resources, and quality requirements. With DeepSpeed-Chat, users can easily do that. For example, if you want to train a larger and higher-quality model on your GPU cluster for your research or business, you can simply use the same script with your desired model size, e.g., 66B, and GPU count, e.g., 64 GPUs:
+
+```bash
+python train.py --actor-model facebook/opt-66b --reward-model facebook/opt-350m --deployment-type multi_node
+```
+
+Within 9 hours, you can have your 66-billion-parameter ChatGPT model ready to be served in your favorite front-end GUI:
+
+
+| Model Sizes                      | Step 1  | Step 2 | Step 3 | Total |
+|----------------------------------|:-------:|:------:|:------:|:-----:|
+| Actor: OPT-66B, Reward: OPT-350M | 82 mins | 5 mins | 7.5hr  | 9hr   |
+
+*Table 5. E2E time breakdown for training a 66 billion parameter ChatGPT model via DeepSpeed-Chat on 8 DGX nodes with 8 NVIDIA A100-80G GPUs/node.*
+
+
+If you only have around 1-2 hours for a coffee or lunch break, you can also try to train a small/toy model with DeepSpeed-Chat. For example, we prepared a training example for a 1.3B model with a single dataset to test our framework on your consumer-grade GPUs. The best part is that you will have your model checkpoint ready to play with when you are back from your lunch break!
+
+```bash
+python train.py --actor-model facebook/opt-1.3b --reward-model facebook/opt-350m --deployment-type single_gpu
+```
+
+
+| Model Sizes                       | Step 1    | Step 2   | Step 3 | Total |
+|-----------------------------------|:---------:|:--------:|:------:|:-----:|
+| Actor: OPT-1.3B, Reward: OPT-350M | 2900 secs | 670 secs | 1.2hr  | 2.2hr |
+
+*Table 6. E2E time breakdown for training a 1.3 billion parameter ChatGPT model via DeepSpeed-Chat on a single commodity NVIDIA A6000 GPU with 48GB memory.*
+
+
+## Customizing your own RLHF training pipeline using DeepSpeed-Chat’s RLHF APIs
+
+DeepSpeed-Chat allows users to build their very own RLHF training pipeline using the flexible APIs shown below, which can be used to construct custom RLHF training strategies. This enables a general interface and backend for creating a wide range of RLHF algorithms for research exploration.
+
+```python
+
+engine = DeepSpeedRLHFEngine(
+    actor_model_name_or_path=args.actor_model_name_or_path,
+    critic_model_name_or_path=args.critic_model_name_or_path,
+    tokenizer=tokenizer,
+    num_total_iters=num_total_iters,
+    args=args)
+
+trainer = DeepSpeedPPOTrainer(engine=engine, args=args)
+
+for prompt_batch in prompt_train_dataloader:
+    out = trainer.generate_experience(prompt_batch)
+    actor_loss, critic_loss = trainer.train_rlhf(out)
+```
+
+# 3. Full-fledged RLHF Training Pipeline
+
+To provide a seamless training experience, we follow InstructGPT and include a full-fledged end-to-end training pipeline in DeepSpeed-Chat, as shown in Figure 1.
+
+
+DeepSpeed-Chat!
+
+*Figure 1: The illustration of DeepSpeed-Chat’s RLHF training pipeline with optional features.*
+
+
+Our pipeline includes three main steps:
+
+* **Step 1: Supervised finetuning (SFT)**, where human responses to various queries are carefully selected to finetune the pretrained language models.
+* **Step 2: Reward model finetuning**, where a separate (usually smaller than the SFT) reward model (RW) is trained with a dataset that has human-provided rankings of multiple answers to the same query.
+* **Step 3: RLHF training**, where the SFT model is further finetuned with the reward feedback from the RW model using the Proximal Policy Optimization (PPO) algorithm.
+
+We provide two additional features in Step 3 to help improve model quality:
+
+* **Exponential Moving Average (EMA) collection**, where an EMA-based checkpoint can be chosen for the final evaluation.
+* **Mixture Training**, which mixes the pretraining objective (i.e., next-word prediction) with the PPO objective to prevent performance regressions on public benchmarks like SQuAD2.0.
+
+These two training features, EMA and Mixture Training, are often omitted by other recent efforts since they can be optional. However, according to InstructGPT, EMA checkpoints generally provide better response quality than the conventional final trained model, and Mixture Training can help the model retain its pre-training benchmark-solving ability. As such, we provide them so users can fully reproduce the training experience described in InstructGPT and strive for higher model quality.
+
+In addition to being highly consistent with the InstructGPT paper, we also provide convenient features to help researchers and practitioners train their own RLHF models with multiple data sources:
+
+* ***Data Abstraction and Blending Capabilities:*** DeepSpeed-Chat is able to train the model with multiple datasets for better model quality. It is equipped with (1) an abstract dataset layer to unify the format of different datasets; and (2) data splitting/blending capabilities so that the multiple datasets are properly blended and then split across the 3 training stages.
+
+To illustrate the effectiveness of our training pipeline, we demonstrate the model quality with multi-round conversations in the experience section.
+
+
+# 4. DeepSpeed Hybrid Engine – Unified Infrastructure to Power and Optimize RLHF Training
+
+Step 1 and Step 2 of the instruct-guided RLHF pipeline resemble regular fine-tuning of large models, and they are powered by ZeRO-based optimizations and a flexible combination of parallelism strategies in DeepSpeed training to achieve scale and speed. Step 3 of the pipeline, on the other hand, is the most complex part to handle in terms of performance implications. Each iteration requires efficient processing of two phases: a) an inference phase for token/experience generation, producing inputs for the training, and b) a training phase to update the weights of the actor and reward models, as well as the interaction and scheduling between them. This introduces two major costs: (1) the memory cost, as several copies of the SFT and RW models need to be served throughout stage 3; and (2) the predominant generation phase, which, if not accelerated properly, will significantly slow down the entire stage 3. Additionally, the two important features we added in Stage 3, Exponential Moving Average (EMA) collection and Mixture Training, incur additional memory and training costs.
+
+To tackle these challenges, we composed the full system capability of DeepSpeed Training and Inference into a unified infrastructure that we call the **Hybrid Engine**. It leverages the original DeepSpeed engines for the fast training mode while effortlessly applying the DeepSpeed inference engine for the generation/evaluation mode, providing a significantly faster training system for RLHF training at Stage 3. As Figure 2 shows, the transition between the DeepSpeed training and inference engines is seamless: because the typical eval and train modes are enabled for the actor model, DeepSpeed applies different optimizations depending on whether it is running the inference or the training portion of the pipeline, making the model run faster and improving overall system throughput.
+
+
+DeepSpeed-Chat!
+
+*Figure 2. DeepSpeed Hybrid Engine design for accelerating the most time-consuming portion of an RLHF pipeline.*
+
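+To make the train/inference mode switching shown in Figure 2 concrete, below is a minimal, framework-agnostic sketch of one Step-3 iteration in plain PyTorch-style pseudocode. It is not the actual DeepSpeed Hybrid Engine API; the `ppo_step` callable, the EMA decay value, and the generation length are illustrative assumptions only.
+
+```python
+import torch
+
+@torch.no_grad()
+def update_ema(ema_actor, actor, decay=0.992):
+    # Optional Step-3 EMA collection: keep a slowly-moving copy of the
+    # actor weights that can be used as the final checkpoint.
+    for ema_p, p in zip(ema_actor.parameters(), actor.parameters()):
+        ema_p.mul_(decay).add_(p, alpha=1.0 - decay)
+
+def rlhf_iteration(actor, critic, ema_actor, prompts, ppo_step):
+    # 1) Experience generation: run the actor in inference mode so that
+    #    inference-style optimizations (KV-cache reuse, fused kernels,
+    #    tensor parallelism) can be applied to the generation pass.
+    actor.eval()
+    with torch.no_grad():
+        sequences = actor.generate(prompts, max_new_tokens=256)
+
+    # 2) RL training: switch back to training mode so that ZeRO/LoRA-style
+    #    memory optimizations and gradient computation take over.
+    actor.train()
+    critic.train()
+    actor_loss, critic_loss = ppo_step(actor, critic, prompts, sequences)
+
+    # 3) Optionally track an EMA copy of the actor for final evaluation.
+    update_ema(ema_actor, actor)
+    return actor_loss, critic_loss
+```
+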
+
+During its inference execution for the experience-generation phase of RLHF training, the DeepSpeed Hybrid Engine uses a light-weight memory management system to handle the KV-cache and intermediate results, together with highly optimized inference-adapted kernels and a tensor parallelism implementation, to achieve a significant boost in throughput (tokens-per-second) compared to existing solutions.
+
+During the training execution, the Hybrid Engine enables memory optimization techniques such as DeepSpeed’s ZeRO family of technologies and Low-Rank Adaptation (LoRA). We designed and implemented these system optimizations so that they are compatible with each other and can be composed together to deliver the highest training efficiency under the unified Hybrid Engine.
+
+The Hybrid Engine can seamlessly change model partitioning across training and inference to support tensor-parallelism-based inference and a ZeRO-based sharding mechanism for training. It can also reconfigure the memory system to maximize memory availability during each of these modes. This allows for improved performance by avoiding memory allocation bottlenecks and supporting large batch sizes. Packed with a spectrum of system technologies from DeepSpeed training and inference, the Hybrid Engine pushes the boundary of modern RLHF training and delivers unparalleled scale and system efficiency for RLHF workloads.
+
+
+# 5. DeepSpeed RLHF: Unparalleled Scale and Efficiency via Hybrid Engine
+
+## Capability Recap
+
+As discussed, DeepSpeed-HE is an amalgamation of powerful system technologies for inference and training, architected to achieve excellent scale and efficiency for the DeepSpeed-RLHF pipeline across a wide range of hardware, making RLHF training fast, affordable, and easily accessible to the AI community.
+
+In terms of efficiency and affordability, as shown in Table 1, DeepSpeed-HE can train OPT-13B in just 9 hours and OPT-30B in 18 hours on Azure Cloud for under $300 and $600, respectively. In terms of speed and scalability, as shown in Table 2, even a 13B model can be trained in 1.25 hours and a massive 175B model can be trained in under a day using a 64 GPU cluster. And in terms of accessibility and democratization of RLHF, DeepSpeed-HE supports training models with over 13 billion parameters on a single GPU, as shown in Table 3.
+
+## Throughput and Model Size Scalability Comparisons with Existing RLHF Systems
+
+Compared to other RLHF systems like Colossal-AI or HuggingFace powered by native PyTorch, DeepSpeed-RLHF excels in system performance and model scalability:
+
+* With respect to throughput, DeepSpeed enables over 10x improvement for RLHF training on a single GPU (Figure 3). On a multi-GPU setup, it enables 6 – 19x speedup over Colossal-AI and 1.4 – 10.5x over HuggingFace DDP (Figure 4).
+* With respect to model scalability, Colossal-AI can run a max model size of 1.3B on a single GPU and 6.7B on a single A100 40G node, while DeepSpeed-HE can run 6.5B and 50B models, respectively, on the same hardware, up to 7.5x larger.
+
+Therefore, with over an order of magnitude higher throughput, DeepSpeed-HE unlocks the ability to train significantly larger actor models under the same latency budget or train models of similar size at over 10x lower cost, compared to existing RLHF systems like Colossal-AI or HuggingFace DDP.
+
+ + + +*Figure 3. Step 3 throughput comparison against two other system frameworks for accelerating RLHF \ +training on a single NVIDIA A100-40G commodity GPU. No icons represent OOM scenarios.* + +
+ +
+ + + +*Figure 4. End-to-end training throughput comparison for step 3 of the training pipeline (the most time \ +consuming portion) with different model sizes on a single DGX node equipped with 8 NVIDIA A100-40G GPUs.\ +No icons represent OOM scenarios.* + +
+ +This improvement in efficiency stems from DeepSpeed-HE's ability to accelerate the generation phase of RLHF by leveraging DeepSpeed inference optimizations. Figure 5 shows the time breakdown for a 1.3B-parameter model over an RLHF training iteration: the majority of the time goes to the generation phase. By leveraging high-performance inference kernels from DeepSpeed, DeepSpeed-HE can achieve up to 9x throughput improvement during this phase over HuggingFace and 15x over Colossal-AI, allowing it to achieve unparalleled end-to-end efficiency. + +
+ + + +*Figure 5. Superior generation phase acceleration from DeepSpeed Chat’s Hybrid Engine: A time/sequence breakdown for training OPT-1.3B actor model + OPT-350M reward model on a single DGX node with 8 A100-40G GPUs.* + +
+ +## Effective Throughput and Scalability Analysis + +***(I) Effective Throughput Analysis.*** The effective throughput of DeepSpeed-HE during Stage 3 of RLHF training depends on the throughput that it achieves during the generation and RL training phases. In our RLHF pipeline, the generation phase comprises approximately 20% of the total computation while the RL training phase comprises the remaining 80% (see the [benchmark settings](https://github.com/microsoft/DeepSpeedExamples/tree/master/applications/DeepSpeed-Chat/training/step3_rlhf_finetuning/BenckmarkSetting.md) page for details). However, despite its small share of the computation, the former can take a large portion of the end-to-end time, because it requires running the actor model once for each of the 256 generated tokens on top of an initial prompt of 256 tokens; this makes it memory-bandwidth bound and difficult to drive to high throughput. In contrast, the RL training phase is compute bound: it runs the reference actor model with just a couple of forward and backward passes over the full 512 tokens from both prompt and generation per sample, and can therefore achieve good throughput. + +
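To see why a phase with only ~20% of the FLOPs can dominate the end-to-end time, consider the back-of-the-envelope calculation below. The per-phase efficiency numbers are illustrative assumptions for exposition, not measurements.

```python
# Back-of-the-envelope illustration (all numbers are assumptions, not measurements).
# Generation holds ~20% of the FLOPs but, being memory-bandwidth bound, runs at a
# much lower fraction of peak than the compute-bound RL training phase.
gen_flops, train_flops = 0.2, 0.8    # ~20% / 80% split of the step's total work
gen_efficiency = 0.05                # assumed fraction of peak during generation
train_efficiency = 0.50              # assumed fraction of peak during RL training

gen_time = gen_flops / gen_efficiency        # 4.0 (arbitrary time units)
train_time = train_flops / train_efficiency  # 1.6

effective = (gen_flops + train_flops) / (gen_time + train_time)
print(f"generation share of e2e time: {gen_time / (gen_time + train_time):.0%}")  # ~71%
print(f"effective throughput as a fraction of peak: {effective:.2f}")             # ~0.18
# Under these assumptions, generation takes roughly 71% of each iteration, which is
# why accelerating it (Figure 5) is decisive for end-to-end efficiency.
```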
+ + + +*Figure 6. RLHF Generation, training, and effective throughput with DeepSpeed-HE for different model sizes, at the GPU count that maximizes efficiency.* + +
+ +To maximize the effective throughput, DeepSpeed-HE optimizes both phases. First, it uses the largest batch size possible to get higher efficiency in both phases. Second, during the generation phase, it leverages high-performance transformer kernels to maximize GPU memory-bandwidth utilization when the model fits in a single GPU's memory, and leverages tensor parallelism (TP) when it does not. Using TP in the generation phase instead of ZeRO to fit the model reduces the inter-GPU communication and maintains high GPU memory-bandwidth utilization; a simple heuristic capturing this choice is sketched below. + +Figure 6 shows the best achievable effective throughput for DeepSpeed-HE in terms of TFlops/GPU for model sizes ranging from 1.3B to 175B. It also shows the throughput achieved by each of the generation and training phases. DeepSpeed-HE is most efficient for models in the 6.7B-66B range. Going beyond this range to 175B, the throughput drops due to the limited memory available to support larger batch sizes, while still achieving 1.2x better efficiency than the small 1.3B model. The per-GPU throughput of these gigantic models could improve further when we scale them to more GPUs with more memory available for larger batch sizes. + +Furthermore, we would like to point out that our effective performance is 19x higher than that of existing systems, as shown in Figure 4, which suggests that they are operating at less than 5% of peak. This demonstrates the challenge of optimizing RLHF workloads as well as the effectiveness of our system despite that challenge. + +
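As a rough illustration of the TP-versus-single-GPU decision described above, the helper below turns on tensor parallelism only when the actor's weights plus KV-cache no longer fit on one device. The function, its inputs, and its power-of-two policy are illustrative assumptions, not DeepSpeed's actual selection logic.

```python
def choose_generation_parallelism(model_bytes, kv_cache_bytes, gpu_mem_bytes, num_gpus):
    """Illustrative heuristic (an assumption, not DeepSpeed's real policy):
    keep generation on a single GPU when the model and KV-cache fit there,
    otherwise shard with tensor parallelism (TP) across just enough GPUs.
    TP avoids the per-token parameter gathers that ZeRO would require and
    keeps GPU memory-bandwidth utilization high during token-by-token decoding."""
    needed = model_bytes + kv_cache_bytes
    if needed <= gpu_mem_bytes:
        return {"strategy": "single-gpu", "tp_size": 1}
    tp_size = 2
    while tp_size < num_gpus and needed / tp_size > gpu_mem_bytes:
        tp_size *= 2
    return {"strategy": "tensor-parallel", "tp_size": tp_size}


# Example: a hypothetical 66B-parameter actor in fp16 (~132 GB of weights) plus
# ~16 GB of KV-cache does not fit on one 80 GB GPU, so the sketch picks TP=2.
print(choose_generation_parallelism(132e9, 16e9, 80e9, num_gpus=8))
# -> {'strategy': 'tensor-parallel', 'tp_size': 2}
```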
+ + + +*Figure 7. Scalability for training 13B (left) and 66B (right) actor model+350M reward model on an increasing number of DGX nodes with 8 A100-40/80G GPUs* + +
+ +***(II) Scalability Analysis.*** The best effective throughput for different model sizes is achieved at different GPU counts. This is in part because some of the larger model sizes require more memory to run. However, a large part of this behavior stems from DeepSpeed-HE's scalability properties, which we discuss next. + +Figure 7 shows that DeepSpeed-RLHF achieves good scaling overall on up to 64 GPUs. However, a closer look shows that DeepSpeed-RLHF training achieves super-linear scaling at small scale, followed by near-linear or sub-linear scaling at larger scales. This is due to the interaction between memory availability and the maximum global batch size. + +As DeepSpeed-HE is powered by ZeRO-based technology for training, it allows model states to be partitioned across the available GPUs. As a result, the memory consumption per GPU decreases as the number of GPUs increases, allowing DeepSpeed-HE to support a larger batch per GPU and resulting in super-linear scaling. However, at large scale, while the available memory continues to increase, the maximum global batch size (1024, in our case, with a sequence length of 512) limits the batch size per GPU, resulting in near-linear or sub-linear scaling. +As a result, for a given maximum global batch size, DeepSpeed-HE achieves the best throughput and cost efficiency at the boundary of super-linear and sub-linear scalability, and the exact point is mostly determined by the largest batch size that can be run per GPU as a function of the available memory and the global batch size. + +# 6. Release: Try DeepSpeed Chat Today! + +We are very excited to share that DeepSpeed-Chat is now open-sourced and available to the AI community. + +* To get started, please visit our GitHub page for DeepSpeed-Chat: [GitHub Landing Page](https://github.com/microsoft/DeepSpeedExamples/tree/master/applications/DeepSpeed-Chat) + +* We will continue to improve DeepSpeed-Chat with your feedback and support. Our [roadmap](https://github.com/microsoft/DeepSpeedExamples/tree/master/applications/DeepSpeed-Chat/README.md#-deepspeed-chats-roadmap-) shows currently supported features as well as ones planned for the future. + +DeepSpeed-Chat is part of the bigger DeepSpeed ecosystem, comprising a multitude of deep learning systems and modeling technologies. To learn more, + +* Please visit our [website](https://www.deepspeed.ai/) for detailed blog posts, tutorials, and helpful documentation. +* You can also follow us on our [English Twitter](https://twitter.com/MSFTDeepSpeed) and [Japanese Twitter](https://twitter.com/MSFTDeepSpeedJP) for the latest news on DeepSpeed. + +DeepSpeed welcomes your contributions! We encourage you to report issues, contribute PRs, and join discussions on the [DeepSpeed GitHub](https://github.com/microsoft/DeepSpeed/) page. Please see our [contributing guide](https://github.com/microsoft/DeepSpeed/blob/master/CONTRIBUTING.md) for more details. We are open to collaborations with universities, research labs, and companies, for example on deep learning research, on applying DeepSpeed to empower real-world AI models and applications, and so on. For such requests (and other requests unsuitable for GitHub), please email us directly at deepspeed-info@microsoft.com. diff --git a/blogs/deepspeed-chat/chinese/README.md b/blogs/deepspeed-chat/chinese/README.md new file mode 100644 index 0000000000000000000000000000000000000000..c2e648ed3628bdccbafe6a2c18d7ed75fd89e7d9 --- /dev/null +++ b/blogs/deepspeed-chat/chinese/README.md @@ -0,0 +1,314 @@ +
+ +# DeepSpeed Chat: 一键式RLHF训练,让你的类ChatGPT千亿大模型提速省钱15倍 + +
+ +
+ +DeepSpeed-Chat! + +
+ +# 1. 概述 + +近日来,ChatGPT及类似模型引发了人工智能(AI)领域的一场风潮。 这场风潮对数字世界产生了革命性影响。ChatGPT类模型具有惊人的泛用性,能够执行归纳、编程、翻译等任务,其结果与人类专家相当甚至更优。为了使ChatGPT等模型的训练和部署更轻松,AI 开源社区进行了各种尝试(例如 ChatLLaMa、Alpaca、Vicuna、Databricks-Dolly等)。 + +然而,尽管开源社区付出了巨大的努力,目前仍缺乏一个支持端到端的基于人工反馈机制的强化学习(RLHF)的规模化系统,这使得训练强大的类ChatGPT模型十分困难。例如,使用现有的开源系统训练一个具有 67 亿参数的类ChatGPT模型通常需要昂贵的多卡至多节点的 GPU 集群,但这些资源对大多数数据科学家或研究者而言难以获取。同时,即使有了这样的计算资源,[现有的开源系统的训练效率通常还不到这些机器所能达到的最大效率的5%](#有效吞吐量和可扩展性分析)。简而言之,即使有了昂贵的多GPU集群,现有解决方案也无法轻松、快速、经济的训练具有数千亿参数的最先进的类ChatGPT模型。 + +ChatGPT模型的训练是基于InstructGPT论文中的RLHF方式。这与常见的大语言模型的预训练和微调截然不同。这使得现有深度学习系统在训练类ChatGPT模型时存在种种局限。因此,为了让ChatGPT类型的模型更容易被普通数据科学家和研究者使用,并使RLHF训练真正普及到AI社区,我们发布了 DeepSpeed-Chat。DeepSpeed-Chat具有以下三大核心功能: + + +(i)***简化 ChatGPT 类型模型的训练和强化推理体验***:只需一个脚本即可实现多个训练步骤,包括使用 Huggingface 预训练的模型、使用 DeepSpeed-RLHF 系统运行 InstructGPT 训练的所有三个步骤、甚至生成你自己的类ChatGPT模型。此外,我们还提供了一个易于使用的推理API,用于用户在模型训练后测试对话式交互。 + +(ii)***DeepSpeed-RLHF 模块***:DeepSpeed-RLHF 复刻了 InstructGPT 论文中的训练模式,并确保包括a) 监督微调(SFT),b) 奖励模型微调和 c) 基于人类反馈的强化学习(RLHF)在内的三个步骤与其一一对应。此外,我们还提供了数据抽象和混合功能,以支持用户使用多个不同来源的数据源进行训练。 + +(iii)***DeepSpeed-RLHF 系统***:我们将 DeepSpeed 的训练(training engine)和推理能力(inference engine) 整合到一个统一的混合引擎(DeepSpeed Hybrid Engine or DeepSpeed-HE)中用于 RLHF 训练。DeepSpeed-HE 能够在 RLHF 中无缝地在推理和训练模式之间切换,使其能够利用来自 DeepSpeed-Inference 的各种优化,如张量并行计算和高性能CUDA算子进行语言生成,同时对训练部分还能从 ZeRO- 和 LoRA-based 内存优化策略中受益。DeepSpeed-HE 还能够自动在 RLHF 的不同阶段进行智能的内存管理和数据缓存。 + + +DeepSpeed-RLHF 系统在大规模训练中具有无与伦比的效率,使复杂的 RLHF 训练变得快速、经济并且易于大规模推广: + +**高效性和经济性**:[DeepSpeed-HE 比现有系统快 15 倍以上](#与现有-RLHF-系统的吞吐量和模型大小可扩展性比较),使 RLHF 训练快速且经济实惠。例如,DeepSpeed-HE 在 Azure 云上只需 9 小时即可训练一个 OPT-13B模型,只需 18 小时即可训练一个 OPT-30B模型。这两种训练分别花费不到 300 美元和 600 美元。 + +
+ +| GPUs | OPT-6.7B | OPT-13B | OPT-30B | OPT-66B | +|-------------|:--------:|:--------------:|:-------------:|:-----------:| +| 8x A100-40GB | 5.7 hours | 10.8 hours | 1.85 days | NA | +| 8x A100-80GB | 4.1 hours ($132) | 9 hours ($290) | 18 hours ($580) | 2.1 days ($1620) | + +*表 1. 单节点 8x A100:训练时长及预估的 Azure 费用。* + +
+ +***卓越的扩展性***:DeepSpeed-HE 能够支持训练拥有数千亿参数的模型,并在多节点多 GPU 系统上展现出卓越的扩展性。因此,即使是一个拥有 130 亿参数的模型,也只需 1.25 小时就能完成训练。而对于庞大的 拥有1750 亿参数的模型,使用 DeepSpeed-HE 进行训练也只需不到一天的时间。 + +
+ +| GPUs | OPT-13B | OPT-30B | OPT-66B | OPT-175B | +|---------------|:-----------------:|:---------------:|:-------------:|:-------------:| +| 64x A100-80G | 1.25 hours ($320) | 4 hours ($1024) | 7.5 hours ($1920) | 20 hours ($5120)| + +*表 2. 多节点 64x A100-80GB:训练时长及预估的 Azure 费用。* +
+ +> ***非常重要的细节***: 上述两个表格(即表一和表二)中的数据均针对 RLHF 训练的第 3 步,基于实际数据集和 DeepSpeed-RLHF 训练吞吐量的测试。该训练在总共 1.35 亿(135M)个字符(token)上进行一个时期(epoch)的训练。我们总共有 6750 万个查询(query)字符(131.9k 个 query,每个序列长度为 256)和 6750 万个生成/回答字符(131.9k 个答案,每个序列长度为 256),每步的最大全局字符批量大小约为 50 万个字符(1024 个查询-答案对)。在与 DeepSpeed-RLHF 进行任何成本和端到端时间比较之前,我们建议读者注意这些设定。想要了解更多详细信息,请参阅我们的页面 [benchmark setting](https://github.com/microsoft/DeepSpeedExamples-internal/blob/staging-deepspeed-chat-v2/applications/DeepSpeed-Chat/training/step3_rlhf_finetuning/BenckmarkSetting.md)。 + +***实现 RLHF 训练的普及化***:仅凭单个 GPU,DeepSpeed-HE 就能支持训练超过 130 亿参数的模型。这使得那些无法使用多 GPU 系统的数据科学家和研究者不仅能够轻松创建轻量级的 RLHF 模型,还能创建大型且功能强大的模型,以应对不同的使用场景。 + +
+ +| | V100 32G | A6000 48G | A100 40G | A100 80G | +|------------|:---------:|:----------:|:--------:|:---------:| +| Model Size | OPT-2.7B | OPT-6.7B | OPT-6.7B | OPT-13B | + +*表 3. DeepSpeed-HE 在不同的 GPU 单卡上支持的最大模型。* + +
+ +接下来,我们将深入了解上面介绍的 DeepSpeed-Chat 的三个功能。 + +# 2. 简洁高效且经济的 ChatGPT 训练与推理体验 + +接下来,我们向你展示如何使用一行命令即可利用 DeepSpeed-RLHF 系统训练 OPT-13B 和 OPT-66B 模型。即使时间有限,你也可以在大约两小时内在单个消费级 GPU 上训练一个 OPT-1.3B 模型。此外,我们还演示了如何利用我们的 DeepSpeed-chat RLHF API 来开发你自己的模型: + +## 使用 DeepSpeed-Chat 的 RLHF 示例轻松训练你的第一个 类ChatGPT 模型 + +*a) 仅需一个脚本,完成 RLHF 训练的全部三个阶段,生成你的第一个 ChatGPT 模型!* + +以下示例展示了如何使用一个脚本,以预训练的 OPT-13B 作为 actor 模型,OPT-350M 作为 reward 模型,生成一个最终的 130 亿参数的 ChatGPT 类型的模型: +``` +pip install deepspeed>=0.9.0 + +git clone https://github.com/microsoft/DeepSpeedExamples.git +cd DeepSpeedExamples/applications/DeepSpeed-Chat/ +pip install -r requirements.txt + +python train.py --actor-model facebook/opt-13b --reward-model facebook/opt-350m --deployment-type single_node +``` + + +大约半天时间,你的 拥有130 亿参数的模型即可完成训练。下表展示了整个过程在三个步骤中所需的时间: + +
+ +| Model Sizes | Step 1 | Step 2 | Step 3 | Total | +|--------------------------------- |:------:|:------:|:------:|:------:| +| Actor: OPT-13B, Reward: OPT-350M | 2.5hr | 0.25hr | 10.8hr | 13.6hr | + +*表 4. 在单个DGX节点上(8x A100-40G),针对不同的RLHF步骤, 使用DeepSpeed-Chat训练OPT-13b所需的时间。* + +
+ +*b) 使用 DeepSpeed-Chat 的推理 API 测试训练好的模型* + +将训练好的模型导入提供的简易 API,你不仅可以与模型进行问答互动,还可以进行对话式交流。 + +``` +Human: Do you know Microsoft? +Assistant: Yes, I’m a big fan of Microsoft. What about them? +Human: Can you explian it to a 6-year old child? I wonder how I should describe it +Assistant: Sure, I can try. Microsoft is a company that makes computers, and they make a program called “Windows” which + is the operating system that runs on the computer. It’s like the “brain” of the computer. It’s where all the + programs and files are stored. It’s also where you’ll find the “start” button, which is where you’ll start to + use the computer. +``` + +## 想尝试不同的模型大小和配置?轻松实现! +我们了解到用户通常喜欢尝试不同的模型大小和配置,以满足他们不同的训练时间、资源和质量的需求。借助 DeepSpeed-Chat,你可以轻松实现这些目标。例如,如果你想在 GPU 集群上训练一个更大、更高质量的模型,用于你的研究或业务,你可以使用相同的脚本,只需输入你期望的模型大小(例如 660 亿参数)和 GPU 数量(例如 64 个 GPU): + +```python +python train.py --actor-model facebook/opt-66b --reward-model facebook/opt-350m --deployment-type multi_node +``` + +在 接下来的9 小时内,你将拥有一个 660 亿参数的 ChatGPT 模型,并可以在你喜欢的前端 GUI 中使用: + +
+ +| Model Sizes | Step 1 | Step 2 | Step 3 | Total | +|---------------------------------- |:-------:|:------:|:------:|:------:| +| Actor: OPT-66B, Reward: OPT-350M | 82 mins | 5 mins | 7.5hr | 9hr | + +*表 5. 在8个DGX节点上(64x A100-80G),针对不同的RLHF步骤, 使用DeepSpeed-Chat训练OPT-66b所需的时间。* + +
+ +如果你只有大约 1-2 小时的咖啡或午餐休息时间,你也可以尝试使用 DeepSpeed-Chat 训练一个小型模型。例如,我们为单个数据集准备了一个 13 亿参数模型的训练示例,以便在你的消费级 GPU 上测试我们的框架。当你从午餐休息回来时,你的模型已经准备好供你使用! + +```python +python train.py --actor-model facebook/opt-1.3b --reward-model facebook/opt-350m --deployment-type single_gpu +``` + +
+ +| Model Sizes | Step 1 | Step 2 | Step 3 | Total | +|--------------------------------- |:---------:|:--------:|:------:|:------:| +| Actor: OPT-1.3B, Reward: OPT-350M | 2900 secs | 670 secs | 1.2hr | 2.2hr | + +*表 6. 在单个消费级A6000-48G上,针对不同的RLHF步骤, 使用DeepSpeed-Chat训练OPT-1.3b所需的时间。* + +
+ +## 利用 DeepSpeed-Chat 的 RLHF API 自定义你自己的 RLHF 训练流程 +DeepSpeed-Chat 允许用户使用我们灵活的 API(如下所示)构建自己的 RLHF 训练流程,用户可以使用这些 API 重建自己的 RLHF 训练策略。我们希望这些功能可以为研究探索中创建各种 RLHF 算法提供通用接口和后端。 +```python + +engine = DeepSpeedRLHFEngine( + actor_model_name_or_path=args.actor_model_name_or_path, + critic_model_name_or_path=args.critic_model_name_or_path, + tokenizer=tokenizer, + num_total_iters=num_total_iters, + args=args) + +trainer = DeepSpeedPPOTrainer(engine=engine, args=args) + +for prompt_batch in prompt_train_dataloader: + out = trainer.generate_experience(prompt_batch) + actor_loss, critic_loss = trainer.train_rlhf(out) +``` + +# 3. 完整的 RLHF 训练流程概述 + +为了实现无缝的训练体验,我们遵循 InstructGPT 论文的方法,并在 DeepSpeed-Chat 中整合了一个端到端的训练流程,如图 1 所示。 + +
+ +DeepSpeed-Chat! + +*图 1: DeepSpeed-Chat 的 RLHF 训练流程图示,包含了一些可选择的功能。* + +
+ +我们的流程包括三个主要步骤: + +* **步骤1:监督微调(SFT)** —— 使用精选的人类回答来微调预训练的语言模型以应对各种查询; +* **步骤2:奖励模型微调** —— 使用一个包含人类对同一查询的多个答案打分的数据集来训练一个独立的(通常比 SFT 小的)奖励模型(RW); +* **步骤3:RLHF 训练** —— 利用 Proximal Policy Optimization(PPO)算法,根据 RW 模型的奖励反馈进一步微调 SFT 模型。 + +在步骤3中,我们提供了两个额外的功能,以帮助提高模型质量: +* **指数移动平均(EMA)** —— 可以选择基于 EMA 的检查点进行最终评估 +* **混合训练** —— 将预训练目标(即下一个单词预测)与 PPO 目标混合,以防止在像 SQuAD2.0 这样的公开基准测试中的性能损失 + +这两个训练功能,EMA 和混合训练,常常被其他的开源框架所忽略,因为它们并不会妨碍训练的进行。然而,根据 InstructGPT,EMA 通常比传统的最终训练模型提供更好的响应质量,而混合训练可以帮助模型保持预训练基准解决能力。因此,我们为用户提供这些功能,以便充分获得 InstructGPT 中描述的训练体验,并争取更高的模型质量。 + +除了与 InstructGPT 论文高度一致外,我们还提供了一项方便的功能,以支持研究人员和从业者使用多个数据资源训练他们自己的 RLHF 模型: + +* **数据抽象和混合能力**: DeepSpeed-Chat 能够使用多个不同来源的数据集训练模型以获得更好的模型质量。它配备了(1)一个抽象数据集层,以统一不同数据集的格式;以及(2)数据拆分/混合功能,以便多个数据集在 3 个训练阶段中被适当地混合然后拆分。 + +在我们之前的章节中,你可以看到使用整个 DeepSpeed-Chat 训练模型在多轮对话中的表现。 + +# 4. DeepSpeed Hybrid Engine —— 统一的高效混合引擎,为 RLHF 训练提供动力并进行优化 + +DeepSpeed-Chat流程的前两步与大型模型的常规微调相似,得益于基于ZeRO的内存管理优化和DeepSpeed训练中的并行策略灵活组合,实现了规模和速度的提升。然而,流程的第三步在性能方面是最具挑战性的部分。每次迭代都需要高效处理两个阶段:a) 生成回答的推理阶段,为训练提供输入;b) 更新 actor 和 reward 模型权重的训练阶段,以及它们之间的交互和调度。这引入了两个主要困难:(1)内存成本,因为在第三阶段的整个过程中需要运行多个SFT和RW模型;(2)生成回答阶段的速度较慢,如果没有正确加速,将显著拖慢整个第三阶段。此外,我们在第三阶段中添加的两个重要可选功能,包括指数移动平均(EMA)收集和混合训练,将产生额外的内存和训练成本。 + +为了应对这些挑战,我们将DeepSpeed训练和推理的系统功能整合为一个统一的基础设施,称为混合引擎(Hybrid Engine)。它利用原始DeepSpeed引擎进行高速训练模式,同时轻松应用DeepSpeed推理引擎进行生成/评估模式,为第三阶段的RLHF训练提供了一个明显更快的训练系统。如图2所示,DeepSpeed训练和推理引擎之间的过渡是无缝的:通过为actor模型启用典型的eval和train模式,当运行推理和训练流程时,DeepSpeed选择其不同的优化来运行模型更快并提高整个系统吞吐量。 + +
+ +DeepSpeed-Chat! + +*Figure 2. 设计图解:DeepSpeed Hybrid Engine,用于加速 RLHF 流程中最耗时的部分。* + +
+ +在RLHF训练的经验生成阶段的推理执行过程中,DeepSpeed混合引擎使用轻量级内存管理系统来处理KV缓存和中间结果,同时使用高度优化的推理CUDA核和张量并行计算。与现有解决方案相比,DeepSpeed-HE显著提高了吞吐量(每秒token数)。 + +在训练执行过程中,混合引擎使用了多种内存优化技术,如DeepSpeed的ZeRO系列技术和现在流行的LoRA方法。这些技术在混合引擎中可以彼此兼容,并可以组合在一起以提供最高训练效率。 + +DeepSpeed-HE可以在训练和推理之间无缝更改模型分区,以支持基于张量并行计算的推理和基于ZeRO的分片机制进行训练。它还会重新配置内存系统以在此期间最大化内存可用性。DeepSpeed-HE还通过规避内存分配瓶颈和支持大批量大小来进一步提高性能。混合引擎集成了DeepSpeed训练和推理的一系列系统技术,突破了现有RLHF训练的极限,并为RLHF工作负载提供了无与伦比的规模和系统效率。 + +# 5. DeepSpeed RLHF: 通过 Hybrid Engine 实现无与伦比的规模和效率 + +## 回顾 + +如前所述,DeepSpeed-HE 是一个将强大的用于推理和训练的结合系统,旨在使 DeepSpeed-RLHF 在各种硬件上实现卓越的规模和效率,使 RLHF 训练快速、经济并且易于 AI 社区使用。 + +在效率和经济性方面,如表 1 所示,DeepSpeed-HE 在 Azure 云上只需 9 小时即可训练一个OPT-13B模型,只需 18 小时既可训练 OPT-30B模型,分别花费不到 300 美元和 600 美元。在速度和可扩展性方面,如表 2 所示,即使是 13B 的模型也可以在 1.25 小时内训练,而庞大的 175B 模型可以在不到一天的时间内使用 64 个 GPU 集群进行训练。在 RLHF 的可访问性和普及化方面,DeepSpeed-HE 可以在单个 GPU 上训练超过 130 亿参数的模型,如表 3 所示。 + +## 与现有 RLHF 系统的吞吐量和模型大小可扩展性比较 + +与其他 RLHF 系统(如 Colossal-AI 或由原生 PyTorch 提供支持的 HuggingFace)相比,DeepSpeed-RLHF 在系统性能和模型可扩展性方面表现出色: + +* 就吞吐量而言,DeepSpeed 在单个 GPU 上的 RLHF 训练中实现了 10 倍以上的改进(图 3)。在多 GPU 设置中,它比 Colossal-AI 快 6 - 19 倍,比 HuggingFace DDP 快 1.4 - 10.5 倍(图 4)。 +* 就模型可扩展性而言,Colossal-AI 可以在单个 GPU 上运行最大 1.3B 的模型,在单个 A100 40G 节点上运行 6.7B 的模型,而 DeepSpeed-HE 可以在相同的硬件上分别运行 6.5B 和 50B 的模型,实现高达 7.5 倍的提升。 + +因此,凭借超过一个数量级的更高吞吐量,与现有的 RLHF 系统(如 Colossal-AI 或 HuggingFace DDP)相比,DeepSpeed-HE 拥有在相同时间预算下训练更大的 actor 模型的能力,或者以十分之一的成本训练类似大小的模型的能力。 + + +
+ + + +*图 3. 在单个 NVIDIA A100-40G GPU 上,将 RLHF 训练的吞吐量与另外两个系统框架在步骤 3 进行比较。没有图标表示 OOM(内存不足)的情况* + +
+ +
+ + + +*图 4. 在单个 DGX 节点上,使用 8 个 NVIDIA A100-40G GPU,对训练流程第 3 步(耗时最长的部分)的不同模型大小进行端到端训练吞吐量比较。没有图标表示 OOM(内存不足)的情况。* + +
+ +这种效率的提高是 DeepSpeed-HE 利用 DeepSpeed 推理优化在 RLHF 处理过程中加速 RLHF 生成的结果。图 5 显示了 RLHF 训练迭代中 1.3B 参数模型的时间消耗细节:大部分时间用于生成阶段。通过利用 DeepSpeed 的高性能推理内核,DeepSpeed-HE 在这个阶段可以实现比 HuggingFace 高达 9 倍的吞吐量改进,比 Colossal-AI 高 15 倍,从而实现无与伦比的端到端效率。 + +
+ + + +*图 5. DeepSpeed Chat 的混合引擎在生成阶段的优越加速:在单个 DGX 节点上使用 8 个 A100-40G GPU 训练 OPT-1.3B actor 模型 + OPT-350M reward 模型的时间/序列分解。* + +
+ +## 有效吞吐量和可扩展性分析 + +***(I) 有效吞吐量分析。*** 在 RLHF 训练的第 3 阶段,DeepSpeed-HE 的有效吞吐量取决于它在生成和 RL 训练阶段所实现的吞吐量。在我们的 RLHF (详见 [benchmarking setting](https://github.com/microsoft/DeepSpeedExamples/blob/master/applications/DeepSpeed-Chat/training/step3_rlhf_finetuning/BenckmarkSetting.md))中,生成阶段占总计算的约 20%,而 RL 训练阶段占剩余的 80%。然而,尽管比例较小,前者可能会占用大部分的端到端时间,因为它需要为每个生成的字符运行一次 actor 模型,使其受到内存带宽限制,难以实现高吞吐量。相比之下,RL 训练阶段是计算密集型的,仅需运行参考 actor 模型进行几次前向和后向传递,每个样本都有来自提示和生成的全部 512 个字符,可以实现良好的吞吐量。 + +
+ + + +*图 6. 在最大效率的情况下,DeepSpeed-HE 针对不同模型大小的RLHF生成、训练和有效吞吐量。* + +
+ +为了最大化有效吞吐量,DeepSpeed-HE 对两个阶段进行了优化。首先,它使用尽可能大的批量大小以在两个阶段上获得更高的效率。其次,在生成阶段,当模型可以放入单个 GPU 内存时,它利用高性能 CUDA 内核最大化 GPU 内存带宽利用率;当模型无法放入单个 GPU 时,则利用张量并行(Tensor Parallelism,简写作 TP)进行计算。DeepSpeed-HE 进一步在生成阶段使用 TP 而不是 ZeRO 以减少 GPU 之间的通信并保持高 GPU 内存带宽利用率。 + +图 6 显示了 DeepSpeed-HE 在 1.3B 到 175B 的模型大小范围内可以实现的最佳有效吞吐量(以 TFlops/GPU 表示)。它还分别显示了在生成和训练阶段实现的吞吐量。DeepSpeed-HE 对 6.7B-66B 范围内的模型最为高效。超出这个范围到 175B 时,由于内存有限,无法支持更大的批量大小,吞吐量下降,但仍比小型 1.3B 模型的效率高 1.2 倍。当我们将这些巨大的模型扩展到更多具有更多内存的 GPU 时,这些模型的每个 GPU 吞吐量可能会进一步提高。 + +此外,我们想指出,如图 4 所示,我们系统的有效性能比现有系统高 19 倍,这表明现有系统的运行速度低于峰值的 5%。这说明了优化 RLHF 工作负载的挑战以及我们的系统在面对挑战时的有效性。 + +
+ + + +*图 7. 在不同数量的DGX (A100-40/80G GPU) 节点上,进行13B(左)和66B(右)actor 模型 和 350M reward 模型的可扩展性训练。* + +
+ +***(II) 可扩展性分析。*** 不同模型大小的最佳有效吞吐量取决于不同的 GPU 数量。部分原因是因为一些较大的模型大小需要更多的内存来运行。基于此,我们接下来讨论 DeepSpeed-HE 的可扩展性特性。 + +图 7 显示 DeepSeed-RLHF 在多达 64 个 GPU的集群 上实现了良好的整体扩展。然而,如果我们仔细观察,可以发现 DeepSpeed-RLHF 训练在小规模时实现了超线性扩展,随后在较大规模时实现了接近线性或次线性扩展。这是由于内存可用性和最大全局批量大小之间的相互作用。 + +DeepSpeed-HE 的核心技术基于 ZeRO,用于训练过程中将模型状态分割到每个GPU上。这意味着随着 GPU 数量的增加,每个 GPU 的内存消耗会减少,使得 DeepSpeed-HE 能够在每个 GPU 上支持更大的批量,从而实现超线性扩展。然而,在大规模情况下,尽管可用内存持续增加,但最大全局批量大小仍然限制了每个 GPU 的批量大小,导致接近线性或次线性扩展。因此,在给定的最大全局批量大小(例如,我们设置为 1024 个句子,每个句子长度为 512)下,DeepSpeed-HE 在超线性和次线性可扩展性之间实现了最佳的吞吐量和成本效益。具体的平衡点主要取决于每个 GPU 上可运行的最大批量大小,而这又受到可用内存和全局批量大小的函数所决定。 + +# 6. 发布:现在就尝试DeepSpeed Chat吧! + +我们非常高兴地宣布,DeepSpeed-Chat现已开源并向 AI 社区开放。 + +* 如果你发现我们的成果对你有用或者喜欢我们的开源成果,请在 [DeepSpeed](https://github.com/microsoft/DeepSpeed) 和 [DeepSpeedExamples](https://github.com/microsoft/DeepSpeedExamples)上点⭐。 + + +* 请访问我们的DeepSpeed-Chat GitHub页面以开始使用:[GitHub 登陆页面](https://github.com/microsoft/DeepSpeedExamples/tree/master/applications/DeepSpeed-Chat) + + +* 我们将继续根据你的反馈和支持改进 DeepSpeed-Chat。我们的[计划图](https://github.com/microsoft/DeepSpeedExamples/tree/master/applications/DeepSpeed-Chat/README.md#-deepspeed-chats-roadmap-)显示了当前支持的功能以及计划在未来支持的功能。 + +DeepSpeed-Chat 是更大的DeepSpeed生态系统的一部分,包括众多深度学习系统和建模技术。要了解更多信息, + +* 请访问我们的[网站](https://www.deepspeed.ai/),了解详细的博客文章、教程和有用的文档。 +* 你还可以关注我们的[英文 Twitter](https://twitter.com/MSFTDeepSpeed) 和[日文 Twitter](https://twitter.com/MSFTDeepSpeedJP),了解 DeepSpeed 的最新动态。我们还将授权开源社KAIYUANSHE微信公众号第一时间发布我们的中文博客。 + + +DeepSpeed 欢迎你的贡献!我们鼓励你在 [DeepSpeed GitHub](https://github.com/microsoft/DeepSpeed/) 页面报告问题、贡献 PR 并参与讨论。请参阅我们的[贡献指南](https://github.com/microsoft/DeepSpeed/blob/master/CONTRIBUTING.md)了解更多详情。我们愿意与大学、研究实验室、公司等进行合作,共同开展深度学习研究,将 DeepSpeed 应用于赋能现实世界的 AI 模型和应用等。对于此类需求(以及其他不适合在 GitHub 上提出的需求),请直接发送电子邮件至 deepspeed-info@microsoft.com。 diff --git a/blogs/deepspeed-chat/japanese/README.md b/blogs/deepspeed-chat/japanese/README.md new file mode 100644 index 0000000000000000000000000000000000000000..68b7a0046298af06e9b8f36f6961b46a55dbcdc2 --- /dev/null +++ b/blogs/deepspeed-chat/japanese/README.md @@ -0,0 +1,334 @@ +
+ +# DeepSpeed Chat: ChatGPTライクなモデルを簡単・高速・低コストに、あらゆるスケールで学習 + +
+ +
+ +DeepSpeed-Chat! + +
+ +# 1. 概要 + +ChatGPT(チャットGPT)やその類似モデルは、AIの世界に旋風を巻き起こし、デジタル業界に革命的な影響を与えています。これらのモデルは非常に汎用性が高く、要約、コーディング、翻訳などの多様なタスクを、人間の専門家と同等か、それ以上の結果で実施できます。その圧倒的な性能を受けて、AI関連のオープンソースコミュニティでは、ChatGPTスタイルのモデルをより利用しやすくするための複数の取り組みが始まっています(ChatLLaMa、Alpaca、Vicuna、Databricks-Dollyなど)。 + +しかし、様々なプロジェクトで多大な努力が投じられたにも関わらず、ChatGPTライクなモデルの訓練で必要となるRLHF(Reinforcement Learning from Human Feedback)を、十分に簡単かつ高い効率で実行できるend-to-endなパイプラインは、これまで存在していませんでした。例えば、6.7BのChatGPTライクなモデルを訓練するには、高価なGPUが多数必要になり、多くのデータサイエンティストにとっては実施が困難でした。また仮にそうした計算資源があったとしても、従来のソフトウェアでは、ハードウェアの5%未満の性能しか引き出せませんでした([概要](#実効スループットとスケーラビリティ))。さらには、従来のソフトウェアを用いて、簡単かつ高速に、かつ低コストで、数千億のパラメータを持つ最先端のChatGPTライクなモデルの訓練する方法はありませんでした。 + +ChatGPTの訓練に用いられるInstructGPTにおいて提案されたRLHFでは、これまでの標準的な事前学習やファインチューニングと全く異なり、はるかに複雑なパイプラインが必要となります。従来のソフトウェアでは、そうしたパイプラインが効果的にサポートする仕組みがありませんでした。そこで、RLHFの訓練を広くAIコミュニティで利用可能とし、ChatGPTのようなモデルを誰もが作成できるにするため、以下の機能を備えたDeepSpeed-Chatをリリースすることになりました。 + +(i) ***容易に実施可能なChatGPTライクなモデルの訓練と推論***: Huggingfaceレポジトリで提供されている学習済みモデルから開始して、InstructGPT学習の全3ステップを実行し、独自のChatGPTライクなモデルを生成できるスクリプトを提供します。また、学習後の会話形式のインタラクションをテストするための推論APIを提供します。 + +(ii) ***DeepSpeed-RLHF パイプライン***: DeepSpeed-RLHFパイプラインは、InstructGPTの学習パイプラインの3つのステップ a) 教師付きファインチューニング (Supervised fine-tuning, SFT), b) 報酬モデルのファインチューニング, c) RLHF (Reinforcement Learning with Human Feedback) を、包括的に、かつ1対1の対応を保って再現するものです。また、複数のデータソースからの同時学習を可能にするために、学習データの抽象化・ブレンド機能を提供します。 + +(iii) ***DeepSpeed-RLHF システム***: DeepSpeedの学習・推論機能を統合した、RLHF用のハイブリッドエンジン DeepSpeed-HE を提供します。DeepSpeed-HE は、RLHFのパイプライン内で推論モードと訓練モードをシームレスに切り替えでき、テンソル並列や高性能なTransformerカーネルなど、DeepSpeed-Inferenceのさまざまな最適化技術を推論に活用できる一方、強化学習の訓練では、ZeROやLoRAベースの多数のメモリ最適化技術を利用します。また、DeepSpeed-HEはRLHFパイプラインに完全に適合した設計となっており、RLHFのさまざまなフェーズでメモリ管理やデータ移動の面で最適な技術を適用できます。 + +DeepSpeed-RLHFシステムは、大規模モデルの学習において類を見ない効率性を実現し、AIコミュニティが、複雑なRLHFの訓練を高速かつ安価に、そして容易に利用できるようにします: + +***実行効率とコスト***: 実行効率において、[DeepSpeed-HEは既存システムよりも15倍以上速く](#実効スループットとスケーラビリティ)、RLHFの訓練を高速かつ低コストに行うことができます。例えば、DeepSpeed-HEは、Azure Cloud上でOPT-13Bモデルをわずか9時間で、OPT-30Bを18時間で訓練でき、それぞれのコストは300ドル以下、600ドル以下です。 + +
+ +| GPUs | OPT-6.7B | OPT-13B | OPT-30B | OPT-66B | +| ------- | :----------------------------------------------------------: | :------------------------------: | :-----: | :-----: | +| 8x A100-40GB | 5.7 時間 | 10.8 時間 | 1.85 日 | NA | +| 8x A100-80GB | 4.1 時間 ($132)   | 9 時間 ($290) | 18 時間 ($580) | 2.1 日($1620) | + +*表1. ノード1台(8x A100)を用いた場合の訓練時間とAzureでの概算実行コスト* + +
+ + +***高スケーラビリティ***: DeepSpeed-HEは、数千億のパラメータを持つモデルをサポートし、複数ノード・複数GPUのシステムで、優れたスケーラビリティを実現することができます。その結果、13Bのモデルであれば1.25時間で学習でき、175Bの巨大モデルでも、1日以内に学習できます。 + +
+ +| GPUs | OPT-13B | OPT-30B | OPT-66B | OPT-like-175B | +| ------------ | :-------------------------------: | :---------------------------------: | :-----: | :-----------: | +| 64x A100-80G | 1.25 時間 ($320) | 4 時間 ($1024) | 7.5 時間 ($1920) | 20 時間 ($5120) | + +*表2. 複数ノード(64x A100-80GB)を用いた場合の訓練時間とAzureでの概算実行コスト* +
+ +> ***注意事項***: 上記の2つの表の数値は、訓練のステージ3のものです。DeepSpeed-RLHFが用いるデータセットと訓練の設定において、合計1.35億トークンを1エポックで訓練した際のスループットの実測値に基づいています。合計6750万のクエリートークン(配列長256の13万件のクエリー)と6750万の生成トークン(配列長256の13万件の回答)があり、ステップごとの最大グローバルバッチサイズは 50万 トークン(クエリーと回答それぞれ1024件)です。DeepSpeedRLHFを用いた場合のコストおよび実行時間の比較にあたっては、これらの詳細をよくご確認ください。さらに詳細な情報は[ベンチマーク設定](https://github.com/microsoft/DeepSpeedExamples/blob/staging-deepspeed-chat-v2/applications/DeepSpeed-Chat/training/step3_rlhf_finetuning/BenckmarkSetting.md)を参照ください。 + +***RLHFを誰もが利用できるように***: DeepSpeed-HEは、1台のGPUのみで130億以上のパラメーターを持つモデルの訓練を実行できます。複数のGPUを備えた高価な計算設備を持たないデータサイエンティストも、小規模なトイモデルではなく、実際のシナリオで使用できる大規模で強力なRLHFモデルを作成できます。 + +
+ +| | V100 32G | A6000 48G | A100 40G | A100 80G | +| ---------- | :------: | :-------: | :------: | :------: | +| モデルサイズ| OPT-2.7B | OPT-6.7B | OPT-6.7B | OPT-13B | + +表3. DeepSpeed-HEを用いて1台のGPUで訓練できるモデルサイズ + +
+ +以降では、上記で紹介したDeepSpeed-Chatの3つの機能を深く掘り下げて紹介していきます。 + +# 2. ChatGPTの訓練と推論を手軽に実行 + +初めに、DeepSpeed-RLHFを用いた訓練の容易さを示す例として、OPT-13BとOPT-66Bのモデルを訓練できることを示します。訓練を実行できる時間が短い場合でも、コンシューマーグレードのGPU1つで、OPT-1.3Bモデルをわずか2時間で訓練することも可能です。DeepSpeed-ChatのAPIを用いて、カスタムされたパイプラインをどのように作るかも紹介していきます。 + +## DeepSpeed-Chatの例を用いたChatGPTスタイルのモデルの訓練 + +*a) スクリプトを一つ実行するだけで、RLHF訓練の全3ステージが実行され、ChatGPTライクなモデルを生成されるのを見てみましょう!* + +ここではアクターモデルとして事前学習済みOPT-13B、報酬モデルとしてOPT-350MをRLHFで訓練する例を示します。 +以下のスクリプトを実行すると、最終的に130億パラメータのChatGPTライクなモデルが生成されます。 + +```python +pip install deepspeed>=0.9.0 + +git clone https://github.com/microsoft/DeepSpeedExamples.git +cd DeepSpeedExamples/applications/DeepSpeed-Chat/ +pip install -r requirements.txt + +python train.py --actor-model facebook/opt-13b --reward-model facebook/opt-350m --deployment-type single_node +``` + +8台のA100を備えたDGXサーバを使う場合、130億パラメータのモデルを半日で訓練できます。以下は各ステップに要する時間の内訳です。 + +
+ +| モデルサイズ| Step 1 | Step 2 | Step 3 | Total | +| -------------------------------- | :----: | :----: | :----: | :----: | +| アクターモデル: OPT-13B, 報酬モデル: OPT-350M | 2.5時間 | 0.25時間 | 10.8時間 | 13.6時間 | + +表4. 8台のA100を備えたDGXサーバ1台でのend-to-end処理時間の内訳 + +
+ +*b) 学習したモデルをDeepSpeed Chatの推論APIで試してみましょう!* + +学習させたモデルを組み込んで、試してみましょう。質問応答だけでなく、会話形式のやりとりも楽しめます。 + +``` +Human:  Do you know Microsoft? +Assistant:  Yes, I’m a big fan of Microsoft.  What about them? +Human: Can you explain it to a 6-year old child? I wonder how I should describe it +Assistant:  Sure, I can try.  Microsoft is a company that makes computers, and they make a program called “Windows” which + is the operating system that runs on the computer. It’s like the “brain” of the computer.  It’s where all the + programs and files are stored. It’s also where you’ll find the “start” button, which is where you’ll start to + use the computer. +``` + +## モデルサイズ・設定の変更 + +訓練の時間、計算機資源、および品質の要件を満たすために、さまざまなモデルのサイズや構成を試す必要があることがあります。DeepSpeed-Chatを使用すれば、簡単にそれが可能です。例えば、研究やビジネスのために、GPUクラスタでより大規模で高品質なモデルを訓練したい場合、希望するモデルサイズとGPU数を指定するだけです。以下は、アクターモデルのパラメータ数を66Bに、GPU数を64に指定する例です。 + +```python +python train.py --actor-model facebook/opt-66b --reward-model facebook/opt-350m --deployment-type multi_node +``` + +64台のA100 (80GBメモリ) GPUを使用する場合、9時間で660億パラメータのChatGPTモデルを訓練できます。 + +
+ +| モデルサイズ | Step 1 | Step 2 | Step 3 | Total | +| -------------------------------- | :-----: | :----: | :----: | :---: | +|アクターモデル: OPT-66B, 報酬モデル: OPT-350M | 82分 | 5分 | 7.5時間 | 9時間 | + +表5. 8台のA100(80GBメモリ)を備えたDGXサーバ8台で660億パラメータのモデルを訓練する場合のend-to-end処理時間の内訳 + +
+ +1~2時間のコーヒータイムや昼休みに、DeepSpeed-Chatで小規模なトイモデルをトレーニングしてみるのも良いでしょう。例えば、コンシューマグレードのGPUでの訓練を動かしてみるため、1つのデータセットで1.3Bのモデルを訓練する例を用意しました。これなら、昼休みから戻ったときに、できあがったモデルのチェックポイントを試してみることができます。 + +```python +python train.py --actor-model facebook/opt-1.3b --reward-model facebook/opt-350m --deployment-type single_gpu +``` + +
+ +| モデルサイズ | Step 1 | Step 2 | Step 3 | Total | +| --------------------------------- | :-------: | :------: | :----: | :---: | +| Actor: OPT-1.3B, Reward: OPT-350M | 2900 秒| 670 秒 | 1.2時間 | 2.2時間 | + +*表6. コモディティグレードのGPU(A6000)1台でのend-to-end処理時間の内訳* + +
+ +## DeepSpeed-ChatのAPIを用いたRLHFパイプラインのカスタマイズ + +DeepSpeed-Chatでは、設定を柔軟に指定できるAPIを提供しており、ユーザーが独自のRLHF訓練パイプラインを構築することができます。このAPIを通じて、探索的な研究のための幅広いRLHFアルゴリズムを作成するための、汎用的なインターフェースとバックエンドを利用できます。 + +```python + +engine = DeepSpeedRLHFEngine( + actor_model_name_or_path=args.actor_model_name_or_path, + critic_model_name_or_path=args.critic_model_name_or_path, + tokenizer=tokenizer, + num_total_iters=num_total_iters, + args=args) + +trainer = DeepSpeedPPOTrainer(engine=engine, args=args) + +for prompt_batch in prompt_train_dataloader: + out = trainer.generate_experience(prompt_batch) + actor_loss, critic_loss = trainer.train_rlhf(out) +``` + +# 3. 完全なRLHF訓練パイプライン + +シームレスに訓練を実行するため、DeepSpeed-Chatには、InstructGPTと同様の完全なend-to-endの訓練パイプラインが組み込まれています(図1)。 + +
+ +DeepSpeed-Chat! + +*図1: DeepSpeed-Chatの訓練パイプラインの概要* + +
+ +このパイプラインは、次の3つのステップに分かれています。 + +* Step 1 教師付きファインチューニング(Supervised finetuning, SFT): 様々なクエリに対する人間の回答を慎重に選択し、事前学習された言語モデルをファインチューニングします。 +* Step 2 報酬モデルのファインチューニング:同じクエリに対する複数の回答のランキングを、人間が提供したデータセットを用いて、別のモデル(報酬モデルと呼ばれ、通常はSFTより小さい)を学習します。 +* Step 3 RLHF訓練: Proximal Policy Optimization(PPO)アルゴリズムを用いて、報酬モデルからのフィードバックによりSFTモデルをさらにファインチューニングします。 + +ステップ3では、さらにモデルの品質を向上させるため、以下の2つの機能を追加で使用することができます。 + +* 指数移動平均 (EMA) 収集: EMAベースのモデルチェックポイントを最終評価に使用できます。 +* 混合学習: SQuAD2.0のような公開ベンチマークでのモデル品質低下を防ぐために、事前学習の指標(次の単語予測)とPPOの指標を混合して使用します。 + +これらの2つの機能は、最近のオープンソースプロジェクトではしばしば省かれることがあります。しかし、InstructGPTによれば、EMAチェックポイントは一般に、従来の最終学習済みモデルよりも優れた応答品質を実現できます。また混合学習によって、学習前のベンチマーク解答能力を保持できます。DeepSpeed-Chatでは、InstructGPTで示されたのと同様の訓練を実施可能とするために、これらの機能を提供しています。 + +また、InstructGPTと同様の内容を実施する機能に加え、研究者や開発者が複数のデータリソースを用いて独自のRLHFモデルを訓練するのを支援するため、以下の便利な機能も提供しています。 + +* データの抽象化・ブレンド機能: モデルの品質を向上させるため、複数のデータセットでモデルを訓練することができます。このため、DeepSpeed-Chatは、以下の二つの機能も備えています。 1)異なるデータセットの形式を統一するための抽象データセット層、(2)複数のデータセットを適切にブレンドし、3つのトレーニングステージに分割するためのデータ分割・ブレンド機能。 + + +# 4. DeepSpeedハイブリッドエンジン – RLHF訓練のための基盤 + +与えられた指示に基づいて学習するRLHFパイプラインのステップ1とステップ2は、大規模モデルの通常のファインチューニングと似ています。そのため、DeepSpeed-Chatでは、DeepSpeedのZeROの技術による最適化と、DeepSpeedの様々な並列化の柔軟な組み合わせによって、高いスケーラビリティと高速な学習を実現しています。一方、ステップ3は、パフォーマンスへの影響という点で、最も複雑な処理を行う部分です。学習の各反復で、 a)トークン/経験生成と訓練のためのインプットを生成するための推論フェーズ、b) アクターモデルと報酬モデルのパラメータ更新する訓練フェーズの2つのフェーズがあり、さらにそれらの間の相互作用とスケジューリングを効率的に処理する必要があります。 これらを実現するには、 (1) SFTと報酬モデルの複数のコピーをステージ3全体を通して利用するためのメモリ利用の最適化、 (2) ステージ3全体の速度に大きな影響を与える生成フェーズの高速化 という2つの課題があります。指数移動平均(EMA)収集と混合学習を使用する場合には、必要なメモリ量と処理時間はさらに増大します。 + +これらの課題に取り組むため、我々はDeepSpeedの訓練と推論の全システム機能を統一した基盤機能を、ハイブリッドエンジン DeepSpeed-HE として構成しました。これは、訓練モードではオリジナルのDeepSpeedエンジンを活用し、生成/推論モードではDeepSpeedの推論エンジンを適用することで、ステージ3のRLHFの訓練を大幅に高速化します。図2に示すように、DeepSpeedの訓練エンジンと推論エンジンは、シームレスに切り替えられます。アクターモデルに対して推論モードや訓練モードを有効にしておけば、推論や訓練パイプラインを実行する際に、DeepSpeedがそれぞれに異なる最適化を選択して、システム全体のスループットを改善します。 + + +
+ +DeepSpeed-Chat! + +*図2. RLHFで最も処理時間がかかる部分を高速化するハイブリッドエンジン(DeepSpeed-HE)* + +
+ +RLHF訓練の経験生成フェーズにおける推論では、DeepSpeed-HE は、KVキャッシュと中間結果を扱う軽量なメモリ管理システム、および推論のために高度に最適化されたカーネルと、テンソル並列機能により、既存のソフトウェアと比較してスループット(トークン/秒)を大幅に向上させています。 + +また訓練では、DeepSpeedの一連のZeROの技術や、Low Rank Adaption(LoRA)などのメモリ最適化技術を利用できます。DeepSpeed-HEでは、非常に高い効率の訓練を実現するため、これらの複数の最適化技術を互いに組み合わせることが可能なように実装されています。 + +DeepSpeed-HEは、訓練と推論の両方で、モデルの分割をシームレスに変更し、テンソル並列を使用した推論と、DeepSpeedのZeROの技術によるシャーディング機構を使用した訓練でサポートしています。また、メモリを最大限に活用するため、これらのモードごとにメモリの割り当てを再構成します。これにより、メモリ割り当てのボトルネックを回避するとともに、大規模なバッチサイズをサポートすることでパフォーマンスを向上させることができます。DeepSpeedの訓練や推論など、さまざまなシステム技術を集約したハイブリッドエンジンは、最新のRLHF訓練の限界を超えて、RLHFを比類ない規模と効率で実行可能にします。 + + +# 5. DeepSpeed RLHF: ハイブリッドエンジン DeepSpeed-HEによる類を見ないスケールと高い効率 + +## 機能の概要 + +これまでに説明してきたように、DeepSpeed-HEは、推論と学習のための強力な技術を融合するものです。幅広いハードウェアで、DeepSpeed-RLHFパイプラインの優れたスケーラビリティと高い実行効率を実現するように設計されており、RLHFの学習を高速かつ低コストで、AIコミュニティが簡単に利用できるようにします。 + +表1は、異なるモデルサイズとGPUでの、実行効率と費用を示しています。DeepSpeed-HEを用いると、Azure Cloud上でOPT-13Bをわずか9時間、OPT-30Bを18時間で訓練でき、必要な費用はそれぞれ300ドル、600ドル以下です。スピードとスケーラビリティの面では、表2に示すように、13Bパラメータのモデルでも1.25時間で学習でき、64GPUのクラスタを使えば175Bの巨大モデルも1日以内に学習できます。また、誰もがRLHFを利用できるようにするという観点から、DeepSpeed-HEを用いると、表3に示すように、130億以上のパラメータを持つモデルを、1つのGPUで訓練することもできるようになっています。 + + +## 既存のRLHFシステムとのスループットとモデルサイズのスケーラビリティ比較 + +DeepSpeed-RLHFは、Colossal-AIや、ネイティブのPyTorchを用いたHuggingFaceなどの他のRLHFを訓練可能なシステムと比較して、実行速度とスケーラビリティの両方で優れています。 + +* スループットに関しては、DeepSpeedは単一GPUでのRLHFトレーニングで10倍以上の向上を実現しています(図3)。複数GPU環境では、Colossal-AIと比較して6~19倍、HuggingFace DDPと比較して1.4~10.5倍のスピードアップを実現しています(図4)。 +* モデルのスケーラビリティに関しては、Colossal-AIが最大で1.3Bのモデルを単一GPUで、6.7BのモデルをA100-40Gを備えた単一のノードで訓練できますが、DeepSpeed-HEは同じハードウェアでそれぞれ6.5Bと50Bのサイズのモデルを訓練できます。これは、最大で7.5倍のモデルサイズを扱えることになります。 + +したがって、DeepSpeed-HEは、Colossal-AIやHuggingFace DDPなどの既存のRLHFシステムと比較して、1桁以上高いスループットを実現しており、同じ実行時間ではるかに大きなアクターモデルを訓練したり、10倍以上低いコストで同様のサイズのモデルを訓練することができます。 + +
+ + + +*図3. 他フレームワークとのStep 3のスループット比較(1台のA100-40Gを使用。バツ印はメモリ不足で実行できないことを示す)* + +
+ +
+ + + +*図4. ステップ3(全3ステップ処理時間の大半を占める)のEnd-to-endの訓練スループット比較 (8台のA100-40Gを備えた1台のDGXノードを使用。バツ印はメモリ不足で実行できないことを示す)* + +
+ +この効率化は、DeepSpeed-HEが、DeepSpeedの高度に最適化された推論機能を活用して、RLHF処理の生成フェーズを高速化したことに起因しています。図5は、1.3BパラメータモデルのRLHF訓練の時間内訳を示したもので、時間の大半は生成フェーズに費やされていることが分かります。DeepSpeedの高性能な推論カーネルを活用することで、DeepSpeed-HEはこのフェーズでHuggingFaceの9倍、Colossal-AIの15倍のスループット向上を達成し、end-to-endの類を見ない効率化を実現しています。 + +
+ + + +*図5. DeepSpeed-HEを用いた生成フェーズの高速化(OPT-1.3Bベースのアクターモデル + OPT-350Mベースの報酬モデル、8台のA100-40Gを備えた1台のDGXノードを使用)* + +
+ +## 実効スループットとスケーラビリティ + +***(I) 実効スループット分析*** RLHFのステージ3におけるDeepSpeed-HEの実効スループットは、生成フェーズと強化学習の訓練フェーズの両方のスループットで決まります。我々の作成したRLHFのパイプラインでは、生成フェーズが全計算量の約20%を占め、強化学習の訓練フェーズが残りの80%を占めています(詳細は[ベンチマークのページ](https://github.com/microsoft/DeepSpeedExamples/tree/master/applications/DeepSpeed-Chat/training/step3_rlhf_finetuning/BenckmarkSetting.md)を参照)。しかし、計算量で見た割合が少ないとはいえ、前者は生成された256個のトークンのそれぞれに対して、初期プロンプトの256個のトークンに対してアクターモデルによる推論をそれぞれ1回実行する必要があるため、end-to-endの時間で見ると、その大部分を占めることになり、メモリ帯域が制限されて高いスループットを得ることが難しくなります。一方、強化学習の訓練フェーズでは、1サンプルあたりプロンプトと生成の両方から512個のトークンをフルに使用して、参照アクターモデルについて、数回のフォワードパスとバックワードパスで実行できるため、高いスループットを達成できます。 + +
+ + + +*図6. DeepSpeed-HEを用いたRLHFにおける生成、訓練、および実効スループット(GPU数は最善の効率を得られるように設定)* + +
+ +実効スループットを最大化するために、DeepSpeed-HEは、生成フェーズと強化学習の訓練フェーズの両方を最適化しています。まず、両フェーズでより高い効率を得るために、可能な限り大きなバッチサイズを使用します。次に、生成フェーズでは、高性能なTransformerのカーネルを活用して、モデルが単一のGPUメモリに収まる場合はGPUメモリ帯域幅の利用を最大化するとともに、メモリに収まらない場合はテンソル並列(Tensor parallelism)も併用します。生成フェーズでは、ZeROによる省メモリ化の代わりに、テンソル並列を使用することで、GPU間通信を減らし、GPUメモリ帯域幅の利用率を高く保つことができます。 + +図6では、モデルサイズが1.3Bから175Bの範囲で、DeepSpeed-HEで達成可能な最良の実効スループットを、GPUあたりのTFlopsで示しています。また、生成と訓練の各フェーズで達成されたスループットも示しています。これを見ると、DeepSpeed-HEは、6.7B~66Bのモデルで高い効率を達成していることが分かります。この範囲を超えて175Bまでモデルを大きくすると、メモリが制限により、大きなバッチサイズが設定できなくなり、スループットが低下しますが、それでも1.3Bのモデルよりも1.2倍の効率性を達成しています。こうした巨大なモデルを学習する際のGPUあたりのスループットは、より大きなバッチサイズを扱えるように、より多くのメモリを搭載したGPUにスケールアップすれば、さらに向上する可能性があります。 + +さらに、図4に示すように、我々の実効性能は既存システムの19倍であり、これは既存システムはピーク性能の5%以下で動作していることを示唆しています。これは、RLHFワークロードを最適化することの難しさとともに、我々のシステムがRLHFパイプラインにおいて有効であることを示しています。 + +
+ + + +*図7. DGXノード(ノードあたり8台のA100-40/80G)の数を増加させた場合のスケーラビリティ(13Bおよび66Bのアクターモデルと350Mの報酬モデルを使用)* + +
+ +***(II) スケーラビリティ分析*** モデルサイズごとに、最良のスループットを得られるGPU数は異なります。これは、モデルサイズが大きくなると、実行に多くのメモリを必要とすることに加え、以下に説明する DeepSpeed-HE のスケーラビリティ特性にも起因しています。 + +図7は、DeepSeed-RLHF が最大 64 GPU で全体的に良好なスケーラビリティを達成したことを示しています。しかし、より詳細に見ると、DeepSpeed-RLHFの訓練では、小規模な環境では超線形(super linear)なスケーリングを達成し、大規模では線形(linear)またはそれ以下のスケーラビリティになっていることが分かります。これは、メモリの可用性と最大グローバルバッチサイズとの間の相互作用によるものです。 + +DeepSpeed-HEはトレーニングにZeROの技術を採用しているため、利用可能なGPU間でモデルを分割することが可能です。その結果、GPUあたりのメモリ消費量はGPU数の増加とともに減少し、DeepSpeed-HEはGPUあたりでより大きなバッチサイズをサポートできるようになり、超線形のスケーリングが実現できます。しかし、より大規模になると、利用可能なメモリが増加し続ける一方で、最大グローバルバッチサイズが制限されているため、GPUあたりのバッチサイズを小さくすることになり、線形またはそれ以下のスケーリングになります。その結果、与えられた最大グローバルバッチサイズに対して、DeepSpeed-HEは、スーパーリニアとサブリニアのスケーラビリティの境界で最高のスループットとコスト効率を達成し、正確なポイントは、利用可能なメモリとグローバルバッチサイズの関数としてGPUごとに実行できる最大バッチサイズによってほぼ決定されます。 + + +# 6. DeepSpeed-Chatのリリース: さっそく試してみましょう! + +DeepSpeed-ChatをオープンソースソフトウェアとしてAIコミュニティに公開できることを嬉しく思います。 + +* DeepSpeed-Chatの[GitHubページ](https://github.com/microsoft/DeepSpeedExamples/tree/master/applications/DeepSpeed-Chat)を見て、早速使い始めましょう。 +* ユーザのみなさまからのフィードバックと協力で、これからも継続的に DeepSpeed-Chat を改善していく予定です。現在サポートされている機能や、将来的にサポートされている機能については、[ロードマップ](https://github.com/microsoft/DeepSpeedExamples/tree/master/applications/DeepSpeed-Chat/README.md#-deepspeed-chats-roadmap-)をご覧ください。 + + +# 7. DeepSpeedについて + +DeepSpeedは、きわめて大規模かつ高速な深層学習を、容易に実現するための様々な機能を持ったソフトウェアです。 +DeepSpeed-Chatは、DeepSpeedの一連のソフトウェアエコシステムの一部です。 +DeepSpeedは、以下のような機能を提供します。 + +* 数十億~1兆規模のパラメータを持つdenseあるいはsparseなモデルの訓練と推論 +* 高いスループットと数千GPU規模のスケーラビリティ +* 限られたGPUリソース環境における訓練と推論 +* 類のないレベルの低遅延かつ高スループットな推論 +* 高度なモデル圧縮技術による低遅延な推論とモデルサイズ削減 + +DeepSpeedは、Microsoftの[AI at Scale initiative](https://www.microsoft.com/en-us/research/project/ai-at-scale/)の一部で、次世代AIの機能の大規模な実現を進めています。詳細は[こちら](https://innovation.microsoft.com/en-us/exploring-ai-at-scale)をご覧ください。DeepSpeedは、[Megatron-Turing NLG (530B)](https://www.microsoft.com/en-us/research/blog/using-deepspeed-and-megatron-to-train-megatron-turing-nlg-530b-the-worlds-largest-and-most-powerful-generative-language-model/), [Jurassic-1 (178B)](https://uploads-ssl.webflow.com/60fd4503684b466578c0d307/61138924626a6981ee09caf6_jurassic_tech_paper.pdf), [BLOOM (176B)](https://huggingface.co/blog/bloom-megatron-deepspeed), [GLM (130B)](https://github.com/THUDM/GLM-130B), [YaLM (100B)](https://github.com/yandex/YaLM-100B) を含め、様々な大規模モデルを学習するのに使用されてきました。 + +またDeepSpeedは、 [Hugging Face Transformers](https://huggingface.co/docs/transformers/main/main_classes/deepspeed), [Hugging Face Accelerate](https://huggingface.co/docs/accelerate/usage_guides/deepspeed), [PyTorch Lightning](https://pytorch-lightning.readthedocs.io/en/stable/api/pytorch_lightning.strategies.DeepSpeedStrategy.html), [MosaicML Composer](https://docs.mosaicml.com/en/latest/trainer/using_the_trainer.html?highlight=deepspeed#deepspeed-integration), [Determined AI](https://docs.determined.ai/latest/training/apis-howto/deepspeed/overview.html) など、多くの著名なオープンソースの深層学習フレームワークのバックエンドとして利用されています。 + +DeepSpeedについてのより詳しい情報は、以下をご覧ください。 + +* [DeepSpeedのWebサイト](https://www.deepspeed.ai/) には、DeepSpeedの技術に関する詳細なブログ記事、チュートリアル、ドキュメントなどが掲載されています。 +* [DeepSpeedのTwitterアカウント (英語)](https://twitter.com/MSFTDeepSpeed) では、DeepSpeedの最新情報を発信していますので、ぜひフォローください。[日本語版のTwitterアカウント](https://twitter.com/MSFTDeepSpeedJP)もあり、最新の情報を日本語で発信しています。 + +DeepSpeedチームは、ユーザの方々からのフィードバックやご連絡を受け付けています。 + +* ユーザのみなさまからのバグ報告、Pull request、さまざまな議論への参加は、[GitHub](https://github.com/microsoft/DeepSpeed/)で受け付けています。詳細については、[ガイドライン](https://github.com/microsoft/DeepSpeed/blob/master/CONTRIBUTING.md)を確認してください。 +* 
DeepSpeedチームでは、DeepSpeedを用いた深層学習の研究や実世界へのAIモデルやアプリケーションに関して、大学、研究所、企業との方々とのコラボレーションを行っています(日本語でコミュニケーション可能な研究員も在籍しています)。こうしたコラボレーションについてのご要望(およびGitHubには適さないその他の話題)については、deepspeed-info@microsoft.com まで直接メールをお送りください。 diff --git a/csrc/adagrad/cpu_adagrad.cpp b/csrc/adagrad/cpu_adagrad.cpp index 9f8f95c4a876fdd5883cceea70f809d46544e994..8eebe00349beff63f3736d57665775aaefcf7702 100644 --- a/csrc/adagrad/cpu_adagrad.cpp +++ b/csrc/adagrad/cpu_adagrad.cpp @@ -1,9 +1,9 @@ -#ifdef __HIPCC__ -#include "cpu_adagrad_hip.h" -#else -#include "cpu_adagrad.h" -#endif +// Copyright (c) Microsoft Corporation. +// SPDX-License-Identifier: Apache-2.0 +// DeepSpeed Team + +#include "cpu_adagrad.h" #include #include #include @@ -178,7 +178,7 @@ int ds_adagrad_step(int optimizer_id, std::static_pointer_cast(s_optimizers[optimizer_id]); opt->IncrementStep(step); opt->update_state(lr, epsilon, weight_decay); - opt->Step_8(params_ptr, grads_ptr, exp_avg_sq_ptr, params_c.size(0)); + opt->Step_8(params_ptr, grads_ptr, exp_avg_sq_ptr, params_c.numel()); #if defined(__ENABLE_CUDA__) opt->SynchronizeStreams(); @@ -214,7 +214,7 @@ int ds_adagrad_step_plus_copy(int optimizer_id, opt->Step_8(params_ptr, grads_ptr, exp_avg_sq_ptr, - params_c.size(0), + params_c.numel(), gpu_params_ptr, (params.options().dtype() == at::kHalf)); diff --git a/csrc/adam/compat.h b/csrc/adam/compat.h deleted file mode 100644 index 86f84a85065c9582119296223bb24193e71e060b..0000000000000000000000000000000000000000 --- a/csrc/adam/compat.h +++ /dev/null @@ -1,14 +0,0 @@ -/* Copyright 2020 The Microsoft DeepSpeed Team - Copyright NVIDIA/apex - This file is adapted from fused adam in NVIDIA/apex, commit a109f85 -*/ - -#ifndef TORCH_CHECK -#define TORCH_CHECK AT_CHECK -#endif - -#ifdef VERSION_GE_1_3 -#define DATA_PTR data_ptr -#else -#define DATA_PTR data -#endif diff --git a/csrc/adam/cpu_adam.cpp b/csrc/adam/cpu_adam.cpp index f17f22535ab8dfd56260daff7a2479e771f376a4..4d3d5a45e628a92ef67ee59d54b513e03babca7d 100644 --- a/csrc/adam/cpu_adam.cpp +++ b/csrc/adam/cpu_adam.cpp @@ -1,3 +1,8 @@ +// Copyright (c) Microsoft Corporation. 
+// SPDX-License-Identifier: Apache-2.0 + +// DeepSpeed Team + #include "cpu_adam.h" #include #include @@ -230,7 +235,7 @@ int ds_adam_step(int optimizer_id, grads_ptr, exp_avg_ptr, exp_avg_sq_ptr, - params_c.size(0), + params_c.numel(), nullptr, (params.options().dtype() == at::kHalf)); @@ -275,7 +280,7 @@ int ds_adam_step_plus_copy(int optimizer_id, grads_ptr, exp_avg_ptr, exp_avg_sq_ptr, - params_c.size(0), + params_c.numel(), gpu_params_ptr, (params.options().dtype() == at::kHalf)); diff --git a/csrc/adam/custom_cuda_kernel.cu b/csrc/adam/custom_cuda_kernel.cu deleted file mode 100755 index 2f282aff1aca5d05e82ec8863c4edef14dc2de31..0000000000000000000000000000000000000000 --- a/csrc/adam/custom_cuda_kernel.cu +++ /dev/null @@ -1,20 +0,0 @@ - - -#include "custom_cuda_layers.h" - -__global__ void param_update_kernel(const float* input, __half* output, int size) -{ - int id = blockIdx.x * blockDim.x + threadIdx.x; - - if (id < size) { output[id] = (__half)input[id]; } -} - -void launch_param_update(const float* input, __half* output, int size, cudaStream_t stream) -{ - int threads = 1024; - - dim3 grid_dim((size - 1) / threads + 1); - dim3 block_dim(threads); - - param_update_kernel<<>>(input, output, size); -} diff --git a/csrc/adam/fused_adam_frontend.cpp b/csrc/adam/fused_adam_frontend.cpp index b06531c53002c1186dac8c7e1a168bfa72e31fef..13b390248608b046dab443f85346b5446a47d722 100644 --- a/csrc/adam/fused_adam_frontend.cpp +++ b/csrc/adam/fused_adam_frontend.cpp @@ -1,3 +1,8 @@ +// Copyright (c) Microsoft Corporation. +// SPDX-License-Identifier: Apache-2.0 + +// DeepSpeed Team + #include void multi_tensor_adam_cuda(int chunk_size, diff --git a/csrc/adam/multi_tensor_adam.cu b/csrc/adam/multi_tensor_adam.cu index 611d9ffbe375306d94356abfefe81b29442454d4..1b697d989b1a11dac5fd643f3f4497be578277e9 100644 --- a/csrc/adam/multi_tensor_adam.cu +++ b/csrc/adam/multi_tensor_adam.cu @@ -1,6 +1,11 @@ -/* Copyright 2020 The Microsoft DeepSpeed Team - Copyright NVIDIA/apex - This file is adapted from fused adam in NVIDIA/apex, commit a109f85 +// Copyright (c) Microsoft Corporation. +// SPDX-License-Identifier: Apache-2.0 + +// DeepSpeed Team + +/* +Copyright NVIDIA/apex +This file is adapted from fused adam in NVIDIA/apex, commit a109f85 */ #include @@ -12,11 +17,7 @@ #include -#ifdef __HIPCC__ -#include "multi_tensor_apply_hip.cuh" -#else #include "multi_tensor_apply.cuh" -#endif #include "type_shim.h" #define BLOCK_SIZE 512 diff --git a/csrc/adam/multi_tensor_apply.cuh b/csrc/adam/multi_tensor_apply.cuh index 13af4b7578f6db7db066b7ff7f17edd86e8fd6d3..12f41cb49c6bf505db48f1d21e312578f19da836 100644 --- a/csrc/adam/multi_tensor_apply.cuh +++ b/csrc/adam/multi_tensor_apply.cuh @@ -1,6 +1,11 @@ -/* Copyright 2020 The Microsoft DeepSpeed Team - Copyright NVIDIA/apex - This file is adapted from fused adam in NVIDIA/apex, commit a109f85 +// Copyright (c) Microsoft Corporation. +// SPDX-License-Identifier: Apache-2.0 + +// DeepSpeed Team + +/* +Copyright NVIDIA/apex +This file is adapted from fused adam in NVIDIA/apex, commit a109f85 */ #include diff --git a/csrc/aio/common/deepspeed_aio_common.cpp b/csrc/aio/common/deepspeed_aio_common.cpp index 9e405d8e704c3c260bd8276f648b770471bcd4b7..f35760a99a5cc8b2768f340f6d5a43af1afdd2b5 100644 --- a/csrc/aio/common/deepspeed_aio_common.cpp +++ b/csrc/aio/common/deepspeed_aio_common.cpp @@ -1,7 +1,9 @@ -/* -Copyright 2020 The Microsoft DeepSpeed Team -Licensed under the MIT license. +// Copyright (c) Microsoft Corporation. 
+// SPDX-License-Identifier: Apache-2.0 + +// DeepSpeed Team +/* Functionality for swapping optimizer tensors to/from (NVMe) storage devices. */ @@ -262,7 +264,7 @@ void report_file_error(const char* filename, const std::string file_op, const in int open_file(const char* filename, const bool read_op) { - const int flags = read_op ? (O_RDONLY | __O_DIRECT) : (O_WRONLY | O_CREAT | __O_DIRECT); + const int flags = read_op ? (O_RDONLY | O_DIRECT) : (O_WRONLY | O_CREAT | O_DIRECT); const int mode = 0600; const auto fd = open(filename, flags, mode); if (fd == -1) { diff --git a/csrc/aio/common/deepspeed_aio_common.h b/csrc/aio/common/deepspeed_aio_common.h index cc62d33765c804e88816791c72a3477278738e76..2940de945ee8f2347cb3e2b25f6057cc5fdc9dfd 100644 --- a/csrc/aio/common/deepspeed_aio_common.h +++ b/csrc/aio/common/deepspeed_aio_common.h @@ -1,7 +1,9 @@ -/* -Copyright 2020 The Microsoft DeepSpeed Team -Licensed under the MIT license. +// Copyright (c) Microsoft Corporation. +// SPDX-License-Identifier: Apache-2.0 + +// DeepSpeed Team +/* Functionality for swapping optimizer tensors to/from (NVMe) storage devices. */ diff --git a/csrc/aio/common/deepspeed_aio_types.cpp b/csrc/aio/common/deepspeed_aio_types.cpp index e5811bb91149fad40422692ac7cde6f9348e0029..5e34a61065d4b0f16ba904d0e397eeccc149e621 100644 --- a/csrc/aio/common/deepspeed_aio_types.cpp +++ b/csrc/aio/common/deepspeed_aio_types.cpp @@ -1,7 +1,9 @@ -/* -Copyright 2020 The Microsoft DeepSpeed Team -Licensed under the MIT license. +// Copyright (c) Microsoft Corporation. +// SPDX-License-Identifier: Apache-2.0 + +// DeepSpeed Team +/* Functionality for swapping optimizer tensors to/from (NVMe) storage devices. */ diff --git a/csrc/aio/common/deepspeed_aio_types.h b/csrc/aio/common/deepspeed_aio_types.h index be3b352d6be20733f7e03a821378a648384be0b5..ce6a4e5cdfa75120c24dad599a8bc717765c220d 100644 --- a/csrc/aio/common/deepspeed_aio_types.h +++ b/csrc/aio/common/deepspeed_aio_types.h @@ -1,7 +1,9 @@ -/* -Copyright 2020 The Microsoft DeepSpeed Team -Licensed under the MIT license. +// Copyright (c) Microsoft Corporation. +// SPDX-License-Identifier: Apache-2.0 + +// DeepSpeed Team +/* Functionality for swapping optimizer tensors to/from (NVMe) storage devices. */ diff --git a/csrc/aio/common/deepspeed_aio_utils.cpp b/csrc/aio/common/deepspeed_aio_utils.cpp index e8bf9de1125907c898e21f52e81cef2cb08b131a..763b2c253a34c43501ec522a0570e068cdf11acc 100644 --- a/csrc/aio/common/deepspeed_aio_utils.cpp +++ b/csrc/aio/common/deepspeed_aio_utils.cpp @@ -1,7 +1,9 @@ -/* -Copyright 2020 The Microsoft DeepSpeed Team -Licensed under the MIT license. +// Copyright (c) Microsoft Corporation. +// SPDX-License-Identifier: Apache-2.0 + +// DeepSpeed Team +/* Functionality for swapping optimizer tensors to/from (NVMe) storage devices. */ diff --git a/csrc/aio/common/deepspeed_aio_utils.h b/csrc/aio/common/deepspeed_aio_utils.h index 6c5952749dd33d5e0059c209dc14ea755424da23..9c58c2286610b2b18b2168b42159880232c8dcf6 100644 --- a/csrc/aio/common/deepspeed_aio_utils.h +++ b/csrc/aio/common/deepspeed_aio_utils.h @@ -1,7 +1,9 @@ -/* -Copyright 2020 The Microsoft DeepSpeed Team -Licensed under the MIT license. +// Copyright (c) Microsoft Corporation. +// SPDX-License-Identifier: Apache-2.0 + +// DeepSpeed Team +/* Functionality for swapping optimizer tensors to/from (NVMe) storage devices. 
*/ diff --git a/csrc/aio/py_lib/deepspeed_aio_thread.cpp b/csrc/aio/py_lib/deepspeed_aio_thread.cpp index a2670fb7b4cbc6635c25d71e9b5f9bfd265f59c9..055db8798a6b5915e662a1756523928efb2c2fdf 100644 --- a/csrc/aio/py_lib/deepspeed_aio_thread.cpp +++ b/csrc/aio/py_lib/deepspeed_aio_thread.cpp @@ -1,7 +1,9 @@ -/* -Copyright 2020 The Microsoft DeepSpeed Team -Licensed under the MIT license. +// Copyright (c) Microsoft Corporation. +// SPDX-License-Identifier: Apache-2.0 + +// DeepSpeed Team +/* Functionality for swapping optimizer tensors to/from (NVMe) storage devices. */ diff --git a/csrc/aio/py_lib/deepspeed_aio_thread.h b/csrc/aio/py_lib/deepspeed_aio_thread.h index d1cfcab8bfc2446921422b83efa100444ce0dd31..20799ecbb018eccfd7051b2ebf015695af35934a 100644 --- a/csrc/aio/py_lib/deepspeed_aio_thread.h +++ b/csrc/aio/py_lib/deepspeed_aio_thread.h @@ -1,7 +1,9 @@ -/* -Copyright 2020 The Microsoft DeepSpeed Team -Licensed under the MIT license. +// Copyright (c) Microsoft Corporation. +// SPDX-License-Identifier: Apache-2.0 + +// DeepSpeed Team +/* Functionality for swapping optimizer tensors to/from (NVMe) storage devices. */ diff --git a/csrc/aio/py_lib/deepspeed_pin_tensor.cpp b/csrc/aio/py_lib/deepspeed_pin_tensor.cpp index 20bdf5b142a63a88e3636d6375d62b30bf4fd841..752823dc7dd2cac7a006088f09d11ef208d18d63 100644 --- a/csrc/aio/py_lib/deepspeed_pin_tensor.cpp +++ b/csrc/aio/py_lib/deepspeed_pin_tensor.cpp @@ -1,7 +1,9 @@ -/* -Copyright 2023 The Microsoft DeepSpeed Team -Licensed under the MIT license. +// Copyright (c) Microsoft Corporation. +// SPDX-License-Identifier: Apache-2.0 + +// DeepSpeed Team +/* Functionality for managing CPU tensors occupying page-locked memory. */ diff --git a/csrc/aio/py_lib/deepspeed_pin_tensor.h b/csrc/aio/py_lib/deepspeed_pin_tensor.h index a421bbc8f3b1ac155348a48387f40601dfe7c429..4350a4ac7df6754b497b2ab863ef02ee6df32426 100644 --- a/csrc/aio/py_lib/deepspeed_pin_tensor.h +++ b/csrc/aio/py_lib/deepspeed_pin_tensor.h @@ -1,11 +1,14 @@ -/* -Copyright 2023 The Microsoft DeepSpeed Team -Licensed under the MIT license. +// Copyright (c) Microsoft Corporation. +// SPDX-License-Identifier: Apache-2.0 + +// DeepSpeed Team +/* Functionality for managing CPU tensors occupying page-locked memory. TODO: Implement a full-featured manager that - 1. Avoid page-locked memory leaks - 2. Minimize page-locked memory usage by reducing internal fragmentation +1. Avoid page-locked memory leaks +2. Minimize page-locked memory usage by reducing internal fragmentation +Functionality for managing CPU tensors occupying page-locked memory. */ #include diff --git a/csrc/aio/py_lib/deepspeed_py_aio.cpp b/csrc/aio/py_lib/deepspeed_py_aio.cpp index 49ff1f240c433288a0e12c64389887c65926ad83..387b713f2bfc2ba90fc4b3b677b2834623d166c2 100644 --- a/csrc/aio/py_lib/deepspeed_py_aio.cpp +++ b/csrc/aio/py_lib/deepspeed_py_aio.cpp @@ -1,3 +1,7 @@ +// Copyright (c) Microsoft Corporation. +// SPDX-License-Identifier: Apache-2.0 + +// DeepSpeed Team /* Copyright 2020 The Microsoft DeepSpeed Team diff --git a/csrc/aio/py_lib/deepspeed_py_aio.h b/csrc/aio/py_lib/deepspeed_py_aio.h index 230d88da9763a0130554ca83c5e3b1a5d914116f..11d5225de9f10855878060cd8228a3113534a5ea 100644 --- a/csrc/aio/py_lib/deepspeed_py_aio.h +++ b/csrc/aio/py_lib/deepspeed_py_aio.h @@ -1,3 +1,7 @@ +// Copyright (c) Microsoft Corporation. 
+// SPDX-License-Identifier: Apache-2.0 + +// DeepSpeed Team /* Copyright 2020 The Microsoft DeepSpeed Team diff --git a/csrc/aio/py_lib/deepspeed_py_aio_handle.cpp b/csrc/aio/py_lib/deepspeed_py_aio_handle.cpp index cb81924ec7d354b4168a863f38a555a3ed139d3b..c21e92de94496cab5ca41bd9e32bb449c339f3ef 100644 --- a/csrc/aio/py_lib/deepspeed_py_aio_handle.cpp +++ b/csrc/aio/py_lib/deepspeed_py_aio_handle.cpp @@ -1,3 +1,7 @@ +// Copyright (c) Microsoft Corporation. +// SPDX-License-Identifier: Apache-2.0 + +// DeepSpeed Team /* Copyright 2020 The Microsoft DeepSpeed Team diff --git a/csrc/aio/py_lib/deepspeed_py_aio_handle.h b/csrc/aio/py_lib/deepspeed_py_aio_handle.h index 2163aafcfe89b1728ee7d8ed1421a56d648ccf8a..3a254c3814a2755108a4cd94fae85cf6f5495de3 100644 --- a/csrc/aio/py_lib/deepspeed_py_aio_handle.h +++ b/csrc/aio/py_lib/deepspeed_py_aio_handle.h @@ -1,7 +1,9 @@ -/* -Copyright 2020 The Microsoft DeepSpeed Team -Licensed under the MIT license. +// Copyright (c) Microsoft Corporation. +// SPDX-License-Identifier: Apache-2.0 + +// DeepSpeed Team +/* Functionality for swapping optimizer tensors to/from (NVMe) storage devices. */ diff --git a/csrc/aio/py_lib/deepspeed_py_copy.cpp b/csrc/aio/py_lib/deepspeed_py_copy.cpp index ee51147f9c414b184bb6ef81edd8905ca7fd4a78..8a59107dd3474645dab614ef33b7b249d7ee97e4 100644 --- a/csrc/aio/py_lib/deepspeed_py_copy.cpp +++ b/csrc/aio/py_lib/deepspeed_py_copy.cpp @@ -1,7 +1,9 @@ -/* -Copyright 2020 The Microsoft DeepSpeed Team -Licensed under the MIT license. +// Copyright (c) Microsoft Corporation. +// SPDX-License-Identifier: Apache-2.0 + +// DeepSpeed Team +/* Functionality for swapping optimizer tensors to/from (NVMe) storage devices. */ diff --git a/csrc/aio/py_lib/deepspeed_py_copy.h b/csrc/aio/py_lib/deepspeed_py_copy.h index 69b044851eca1cbea461925fca2133f433e77533..19ba28317d008c40f54f0154ac2c75d8a885e227 100644 --- a/csrc/aio/py_lib/deepspeed_py_copy.h +++ b/csrc/aio/py_lib/deepspeed_py_copy.h @@ -1,3 +1,7 @@ +// Copyright (c) Microsoft Corporation. +// SPDX-License-Identifier: Apache-2.0 + +// DeepSpeed Team /* Copyright 2020 The Microsoft DeepSpeed Team diff --git a/csrc/aio/py_lib/py_ds_aio.cpp b/csrc/aio/py_lib/py_ds_aio.cpp index 3c971c667874568fa1547395a73143431f9c72f8..9033549bc0d2ebf3388ed159e52ede6cbc684ce1 100755 --- a/csrc/aio/py_lib/py_ds_aio.cpp +++ b/csrc/aio/py_lib/py_ds_aio.cpp @@ -1,7 +1,9 @@ -/* -Copyright 2020 The Microsoft DeepSpeed Team -Licensed under the MIT license. +// Copyright (c) Microsoft Corporation. +// SPDX-License-Identifier: Apache-2.0 + +// DeepSpeed Team +/* Functionality for swapping optimizer tensors to/from (NVMe) storage devices. */ diff --git a/csrc/aio/py_test/aio_bench_generate_param.py b/csrc/aio/py_test/aio_bench_generate_param.py index caa833f5febbe26eabf3b155a236fa331899667c..09d0e03c7ef67654a3ae07296d1752b2970ad1c3 100644 --- a/csrc/aio/py_test/aio_bench_generate_param.py +++ b/csrc/aio/py_test/aio_bench_generate_param.py @@ -1,7 +1,8 @@ -""" -Copyright 2021 The Microsoft DeepSpeed Team -Licensed under the MIT license. +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 +# DeepSpeed Team +""" Functionality of swapping optimizer tensors to/from (NVMe) storage devices. """ import os @@ -14,13 +15,10 @@ from perf_sweep_utils import BENCH_LOG_DIR, READ_LOG_DIR, WRITE_LOG_DIR def parse_arguments(): parser = argparse.ArgumentParser() - parser.add_argument( - '--log_dir', - type=str, - default=BENCH_LOG_DIR, - help= - f'Folder of performance sweep logs. 
Default is {os.path.join(".", BENCH_LOG_DIR)}' - ) + parser.add_argument('--log_dir', + type=str, + default=BENCH_LOG_DIR, + help=f'Folder of performance sweep logs. Default is {os.path.join(".", BENCH_LOG_DIR)}') args = parser.parse_args() print(f'args = {args}') @@ -75,9 +73,7 @@ def generate_aio_param(read_log_dir, write_log_dir): optimal_config_read = read_results.get(read_perf_keys[optimal_key], None) optimal_config_write = write_results.get(write_perf_keys[optimal_key], None) - print( - f'Best performance (GB/sec): read = {optimal_config_read:5.2f}, write = {optimal_config_write:5.2f}' - ) + print(f'Best performance (GB/sec): read = {optimal_config_read:5.2f}, write = {optimal_config_write:5.2f}') print(json.dumps(aio_param, indent=3)) diff --git a/csrc/aio/py_test/aio_bench_perf_sweep.py b/csrc/aio/py_test/aio_bench_perf_sweep.py index eebea69b1bbf3963295eaf7429905cd9546011d1..7d55f7ded65c2241150a4b8f5ceb53ff6b15875b 100644 --- a/csrc/aio/py_test/aio_bench_perf_sweep.py +++ b/csrc/aio/py_test/aio_bench_perf_sweep.py @@ -1,7 +1,8 @@ -""" -Copyright 2021 The Microsoft DeepSpeed Team -Licensed under the MIT license. +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 +# DeepSpeed Team +""" Functionality of swapping optimizer tensors to/from (NVMe) storage devices. """ import os @@ -20,20 +21,16 @@ from deepspeed.ops.op_builder import AsyncIOBuilder OTHER_OPTIONS = '--handle' PERF_SCRIPT = 'test_ds_aio.py' DEFAULT_SWEEP_CONFIG = { - "block_size": ["128K", - "256K"], - "queue_depth": [4, - 16, - 32], - "overlap_events": [True, - False], - "io_parallel": [2, - 8], + "block_size": ["128K", "256K"], + "queue_depth": [4, 16, 32], + "overlap_events": [True, False], + "io_parallel": [2, 8], "single_submit": [False] } class Job(object): + def __init__(self, cmd_line, output_file=None, work_dir=None): self.cmd_line = cmd_line self.output_file = output_file @@ -63,6 +60,7 @@ class Job(object): class SweepConfig(object): + def __init__(self, args): self.nvme_dir = args.nvme_dir self.io_size = args.io_size @@ -78,52 +76,35 @@ class SweepConfig(object): def parse_arguments(): parser = argparse.ArgumentParser() - parser.add_argument( - '--nvme_dir', - required=True, - type=str, - help= - 'Directory in which to perform I/O tests. A writeable directory on a NVMe device.' - ) - - parser.add_argument('--sweep_config', + parser.add_argument('--nvme_dir', + required=True, type=str, - default=None, - help='Performance sweep configuration json file.') + help='Directory in which to perform I/O tests. A writeable directory on a NVMe device.') - parser.add_argument('--no_read', - action='store_true', - help='Disable read performance measurements.') + parser.add_argument('--sweep_config', type=str, default=None, help='Performance sweep configuration json file.') - parser.add_argument('--no_write', - action='store_true', - help='Disable write performance measurements.') + parser.add_argument('--no_read', action='store_true', help='Disable read performance measurements.') - parser.add_argument( - '--io_size', - type=str, - default="400M", - help='Number of I/O bytes to read/write for performance measurements.') + parser.add_argument('--no_write', action='store_true', help='Disable write performance measurements.') + + parser.add_argument('--io_size', + type=str, + default="400M", + help='Number of I/O bytes to read/write for performance measurements.') parser.add_argument( '--no_sudo', action='store_true', help= - 'Run without sudo access. 
Page cache will not be flushed and reported read speeds may be higher than actual.' - ) + 'Run without sudo access. Page cache will not be flushed and reported read speeds may be higher than actual.') parser.add_argument( '--log_dir', type=str, default=BENCH_LOG_DIR, - help= - f'Output directory for performance log files. Default is {os.path.join(".", BENCH_LOG_DIR)}' - ) + help=f'Output directory for performance log files. Default is {os.path.join(".", BENCH_LOG_DIR)}') - parser.add_argument('--loops', - type=int, - default=1, - help='Count of operation repetitions') + parser.add_argument('--loops', type=int, default=1, help='Count of operation repetitions') args = parser.parse_args() print(f'args = {args}') @@ -147,6 +128,7 @@ def get_sweep_config_dict(sweep_config_json): def get_sweep_cmd_lines(sweep_config_dict): + def flatten_options(key, value_list): flat_list = [] for v in value_list: @@ -170,11 +152,7 @@ def run_job(job): args = ' '.join(job.cmd()) print(f'args = {args}') job.open_output_file() - proc = subprocess.run(args=args, - shell=True, - stdout=job.get_stdout(), - stderr=job.get_stderr(), - cwd=job.get_cwd()) + proc = subprocess.run(args=args, shell=True, stdout=job.get_stdout(), stderr=job.get_stderr(), cwd=job.get_cwd()) job.close_output_file() assert proc.returncode == 0, \ f"This command failed: {job.cmd()}" @@ -240,14 +218,7 @@ def get_log_file(io_op_desc, cmd_line): return tag_key return f'{tag_key}{value}' - tag_list = [ - SINGLE_SUBMIT, - OVERLAP_EVENTS, - THREAD_COUNT, - IO_PARALLEL, - QUEUE_DEPTH, - BLOCK_SIZE - ] + tag_list = [SINGLE_SUBMIT, OVERLAP_EVENTS, THREAD_COUNT, IO_PARALLEL, QUEUE_DEPTH, BLOCK_SIZE] log_tags = [io_op_desc] cmd_tags = create_cmd_tags(cmd_line) for tag in tag_list: @@ -298,16 +269,10 @@ def create_read_file(sweep_config): os.makedirs(read_folder, exist_ok=True) read_file_name = os.path.join(read_folder, f'random_{sweep_config.io_size}B.pt') block_size, block_count = get_block_size_and_count(refine_integer_value(sweep_config.io_size)) - dd_job = Job(cmd_line=[ - f'dd if=/dev/urandom of={read_file_name} bs={block_size} count={block_count}' - ]) - print( - f'[Start] Create read file of {sweep_config.io_size} bytes by running {dd_job.cmd()} ....' - ) + dd_job = Job(cmd_line=[f'dd if=/dev/urandom of={read_file_name} bs={block_size} count={block_count}']) + print(f'[Start] Create read file of {sweep_config.io_size} bytes by running {dd_job.cmd()} ....') run_job(dd_job) - print( - f'[Done] Create read file of {sweep_config.io_size} bytes by running {dd_job.cmd()} ....' 
- ) + print(f'[Done] Create read file of {sweep_config.io_size} bytes by running {dd_job.cmd()} ....') return read_folder, read_file_name @@ -319,20 +284,15 @@ def remove_folder(folder): def run_read_sweep(sweep_config, flush_cache_job, sync_job, cmd_lines): read_folder, read_file_name = create_read_file(sweep_config) read_option = f'--read_file {read_file_name}' - read_cmd_lines = [[f'{read_option} {sweep_config.other_options}'] + cmd - for cmd in cmd_lines] + read_cmd_lines = [[f'{read_option} {sweep_config.other_options}'] + cmd for cmd in cmd_lines] #dump_cmd_lines(read_cmd_lines) log_folder = os.path.join(sweep_config.log_dir, f'{READ_LOG_DIR}') os.makedirs(log_folder, exist_ok=True) - perf_jobs = create_perf_jobs(io_op_desc=READ_OP_DESC, - log_dir=log_folder, - cmd_lines=read_cmd_lines) + perf_jobs = create_perf_jobs(io_op_desc=READ_OP_DESC, log_dir=log_folder, cmd_lines=read_cmd_lines) - launch_sweep(sweep_jobs=perf_jobs, - sync_job=sync_job, - flush_cache_job=flush_cache_job) + launch_sweep(sweep_jobs=perf_jobs, sync_job=sync_job, flush_cache_job=flush_cache_job) remove_folder(read_folder) @@ -342,20 +302,15 @@ def run_write_sweep(sweep_config, flush_cache_job, sync_job, cmd_lines): os.makedirs(write_folder, exist_ok=True) write_file_name = os.path.join(write_folder, f'random_{sweep_config.io_size}B.pt') write_option = f'--write_size {sweep_config.io_size} --write_file {write_file_name}' - write_cmd_lines = [[f'{write_option} {sweep_config.other_options}'] + cmd - for cmd in cmd_lines] + write_cmd_lines = [[f'{write_option} {sweep_config.other_options}'] + cmd for cmd in cmd_lines] #dump_cmd_lines(write_cmd_lines) log_folder = os.path.join(sweep_config.log_dir, f'{WRITE_LOG_DIR}') os.makedirs(log_folder, exist_ok=True) - perf_jobs = create_perf_jobs(io_op_desc=WRITE_OP_DESC, - log_dir=log_folder, - cmd_lines=write_cmd_lines) + perf_jobs = create_perf_jobs(io_op_desc=WRITE_OP_DESC, log_dir=log_folder, cmd_lines=write_cmd_lines) - launch_sweep(sweep_jobs=perf_jobs, - sync_job=sync_job, - flush_cache_job=flush_cache_job) + launch_sweep(sweep_jobs=perf_jobs, sync_job=sync_job, flush_cache_job=flush_cache_job) remove_folder(write_folder) @@ -376,10 +331,7 @@ def main(): cmd_lines = get_sweep_cmd_lines(sweep_config.search_space) if sweep_config.flush_cache: - flush_cache_job = Job( - cmd_line=['sudo', - 'bash -c', - "'echo 1 > /proc/sys/vm/drop_caches'"]) + flush_cache_job = Job(cmd_line=['sudo', 'bash -c', "'echo 1 > /proc/sys/vm/drop_caches'"]) else: flush_cache_job = None diff --git a/csrc/aio/py_test/ds_aio_basic.py b/csrc/aio/py_test/ds_aio_basic.py index d7f034ad9c463c9a4ef851a6160784f705d7ff3a..ad2a4349cd0c0977edf1657afb659c35301c0730 100755 --- a/csrc/aio/py_test/ds_aio_basic.py +++ b/csrc/aio/py_test/ds_aio_basic.py @@ -1,7 +1,8 @@ -""" -Copyright 2020 The Microsoft DeepSpeed Team -Licensed under the MIT license. +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 +# DeepSpeed Team +""" Functionality of swapping optimizer tensors to/from (NVMe) storage devices. 
""" @@ -20,14 +21,8 @@ def pre_basic(args, tid, read_op): file = args.read_file if read_op else f'{args.write_file}.{tid}' task_log(tid, f'Allocate tensor of size {num_bytes} bytes') - buffer = get_accelerator().pin_memory( - torch.empty(num_bytes, - dtype=torch.uint8, - device='cpu')) - task_log( - tid, - f'{io_string} file {file} of size {num_bytes} bytes from buffer on device {buffer.device}' - ) + buffer = get_accelerator().pin_memory(torch.empty(num_bytes, dtype=torch.uint8, device='cpu')) + task_log(tid, f'{io_string} file {file} of size {num_bytes} bytes from buffer on device {buffer.device}') ctxt = {} ctxt['file'] = file @@ -60,13 +55,8 @@ def post_basic(pool_params): def main_basic_read(pool_params): args, tid, ctxt = pool_params start_time = time.time() - AsyncIOBuilder().load().aio_read(ctxt['buffer'], - ctxt['file'], - args.block_size, - args.queue_depth, - args.single_submit, - args.overlap_events, - args.validate) + AsyncIOBuilder().load().aio_read(ctxt['buffer'], ctxt['file'], args.block_size, args.queue_depth, + args.single_submit, args.overlap_events, args.validate) end_time = time.time() ctxt['elapsed_sec'] += end_time - start_time @@ -76,13 +66,8 @@ def main_basic_read(pool_params): def main_basic_write(pool_params): args, tid, ctxt = pool_params start_time = time.time() - AsyncIOBuilder().load().aio_write(ctxt['buffer'], - ctxt['file'], - args.block_size, - args.queue_depth, - args.single_submit, - args.overlap_events, - args.validate) + AsyncIOBuilder().load().aio_write(ctxt['buffer'], ctxt['file'], args.block_size, args.queue_depth, + args.single_submit, args.overlap_events, args.validate) end_time = time.time() ctxt['elapsed_sec'] += end_time - start_time diff --git a/csrc/aio/py_test/ds_aio_handle.py b/csrc/aio/py_test/ds_aio_handle.py index 7f0e44779cb15375978dd5fd1619fb162996b0d4..d35b2713edae857b2b6e9ba1d11862978c609d62 100755 --- a/csrc/aio/py_test/ds_aio_handle.py +++ b/csrc/aio/py_test/ds_aio_handle.py @@ -1,7 +1,8 @@ -""" -Copyright 2020 The Microsoft DeepSpeed Team -Licensed under the MIT license. +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 +# DeepSpeed Team +""" Functionality of swapping optimizer tensors to/from (NVMe) storage devices. 
""" @@ -20,27 +21,17 @@ def pre_handle(args, tid, read_op): file = args.read_file if read_op else f'{args.write_file}.{tid}' io_parallel = args.io_parallel if args.io_parallel else 1 - handle = AsyncIOBuilder().load().aio_handle(args.block_size, - args.queue_depth, - args.single_submit, - args.overlap_events, - io_parallel) + handle = AsyncIOBuilder().load().aio_handle(args.block_size, args.queue_depth, args.single_submit, + args.overlap_events, io_parallel) task_log(tid, f'Created deepspeed aio handle') if args.gpu: - buffer = torch.empty(num_bytes, - dtype=torch.uint8, - device=get_accelerator().device_name()) + buffer = torch.empty(num_bytes, dtype=torch.uint8, device=get_accelerator().device_name()) else: if args.use_accelerator_pin_memory: - buffer = get_accelerator().pin_memory( - torch.empty(num_bytes, - dtype=torch.uint8, - device='cpu')) + buffer = get_accelerator().pin_memory(torch.empty(num_bytes, dtype=torch.uint8, device='cpu')) else: - buffer = handle.new_cpu_locked_tensor(num_bytes, - torch.empty(0, - dtype=torch.uint8)) + buffer = handle.new_cpu_locked_tensor(num_bytes, torch.empty(0, dtype=torch.uint8)) task_log(tid, f'Allocate tensor of size {num_bytes} bytes') @@ -51,10 +42,7 @@ def pre_handle(args, tid, read_op): ctxt['buffer'] = buffer ctxt['elapsed_sec'] = 0 - task_log( - tid, - f'{io_string} file {file} of size {num_bytes} bytes from buffer on device {buffer.device}' - ) + task_log(tid, f'{io_string} file {file} of size {num_bytes} bytes from buffer on device {buffer.device}') return ctxt diff --git a/csrc/aio/py_test/parse_aio_stats.py b/csrc/aio/py_test/parse_aio_stats.py index 2a3e64944bef5badd040c7253dcd36927a7be3db..09c79ada5b3699676a5c6415bf30909e8686f548 100755 --- a/csrc/aio/py_test/parse_aio_stats.py +++ b/csrc/aio/py_test/parse_aio_stats.py @@ -1,7 +1,8 @@ -""" -Copyright 2020 The Microsoft DeepSpeed Team -Licensed under the MIT license. +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 +# DeepSpeed Team +""" Functionality of swapping optimizer tensors to/from (NVMe) storage devices. """ @@ -19,10 +20,7 @@ METRIC_SEARCH = {READ_SPEED: 'E2E Read Speed', WRITE_SPEED: 'E2E Write Speed'} def parse_arguments(): parser = argparse.ArgumentParser() - parser.add_argument('--log_dir', - type=str, - required=True, - help='Folder of statistics logs') + parser.add_argument('--log_dir', type=str, required=True, help='Folder of statistics logs') parser.add_argument('--metric', type=str, @@ -125,10 +123,7 @@ def get_results(log_files, metric): def get_sorted_results(log_dir, metric): - log_files = [ - f for f in os.listdir(log_dir) if os.path.isfile(os.path.join(log_dir, - f)) - ] + log_files = [f for f in os.listdir(log_dir) if os.path.isfile(os.path.join(log_dir, f))] log_files_path = [os.path.join(log_dir, f) for f in log_files] results = get_results(log_files_path, metric) diff --git a/csrc/aio/py_test/perf_sweep_utils.py b/csrc/aio/py_test/perf_sweep_utils.py index 78dd93b0a175762043e57b5773ca2dd3fa849687..e6832c1baa492d8dfc695ac251a606e49db82f35 100644 --- a/csrc/aio/py_test/perf_sweep_utils.py +++ b/csrc/aio/py_test/perf_sweep_utils.py @@ -1,4 +1,7 @@ -'''Copyright The Microsoft DeepSpeed Team''' +# Copyright (c) Microsoft Corporation. 
+# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team SCRIPT_PREFIX = '_aio_bench' WRITE_OP_DESC = 'write' diff --git a/csrc/aio/py_test/test_ds_aio.py b/csrc/aio/py_test/test_ds_aio.py index 7cb737d689e1b40d6d0ab099fbbfdf69bb4f207f..e6242cb357892498bfcf7b331e99124cc85b3349 100755 --- a/csrc/aio/py_test/test_ds_aio.py +++ b/csrc/aio/py_test/test_ds_aio.py @@ -1,7 +1,8 @@ -""" -Copyright 2020 The Microsoft DeepSpeed Team -Licensed under the MIT license. +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 +# DeepSpeed Team +""" Functionality of swapping optimizer tensors to/from (NVMe) storage devices. """ @@ -20,46 +21,29 @@ def parse_arguments(): parser.add_argument('--write_file', type=str, default=None, help='Write file.') - parser.add_argument('--write_size', - type=str, - default=None, - help='Number of bytes to write.') + parser.add_argument('--write_size', type=str, default=None, help='Number of bytes to write.') parser.add_argument('--block_size', type=str, default='1M', help='I/O block size.') parser.add_argument('--queue_depth', type=int, default=32, help='I/O queue depth.') - parser.add_argument('--threads', - type=int, - default=1, - help='Thread parallelism count.') + parser.add_argument('--threads', type=int, default=1, help='Thread parallelism count.') - parser.add_argument( - '--single_submit', - action='store_true', - help= - 'Submit I/O requests in singles (default is submit queue_depth amount at once.).' - ) + parser.add_argument('--single_submit', + action='store_true', + help='Submit I/O requests in singles (default is submit queue_depth amount at once.).') parser.add_argument('--overlap_events', action='store_true', help='Overlap I/O submission and completion requests.') - parser.add_argument('--validate', - action='store_true', - help='Perform validation in library.') + parser.add_argument('--validate', action='store_true', help='Perform validation in library.') parser.add_argument('--handle', action='store_true', help='Use AIO handle.') - parser.add_argument('--loops', - type=int, - default=1, - help='Count of operation repetitions') + parser.add_argument('--loops', type=int, default=1, help='Count of operation repetitions') - parser.add_argument('--io_parallel', - type=int, - default=None, - help='Per iop parallelism') + parser.add_argument('--io_parallel', type=int, default=None, help='Per iop parallelism') parser.add_argument('--gpu', action='store_true', help='Use GPU memory') diff --git a/csrc/aio/py_test/test_ds_aio_utils.py b/csrc/aio/py_test/test_ds_aio_utils.py index a330e4cd1980917fe3e3d203db68ffd0807e0212..6aad114c0bdc3f4f0d10451e67f92ef483c36978 100755 --- a/csrc/aio/py_test/test_ds_aio_utils.py +++ b/csrc/aio/py_test/test_ds_aio_utils.py @@ -1,7 +1,8 @@ -""" -Copyright 2020 The Microsoft DeepSpeed Team -Licensed under the MIT license. +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 +# DeepSpeed Team +""" Functionality of swapping optimizer tensors to/from (NVMe) storage devices. """ diff --git a/csrc/aio/py_test/validate_async_io.py b/csrc/aio/py_test/validate_async_io.py index 3b96c3c2ffba38a36e455eeb1d42f656288bc458..019ec05d49d31da890a59a3297a11f313a9becbb 100644 --- a/csrc/aio/py_test/validate_async_io.py +++ b/csrc/aio/py_test/validate_async_io.py @@ -1,7 +1,8 @@ -""" -Copyright 2021 The Microsoft DeepSpeed Team -Licensed under the MIT license. +# Copyright (c) Microsoft Corporation. 
+# SPDX-License-Identifier: Apache-2.0 +# DeepSpeed Team +""" Functionality of swapping optimizer tensors to/from (NVMe) storage devices. """ from deepspeed.ops.op_builder import AsyncIOBuilder diff --git a/csrc/common/custom_cuda_kernel.cu b/csrc/common/custom_cuda_kernel.cu index 4c56a2562bc344417667602d63bd4af3dba63179..425000ea553c6e5682428e4994eb35a88efbc6f3 100644 --- a/csrc/common/custom_cuda_kernel.cu +++ b/csrc/common/custom_cuda_kernel.cu @@ -1,11 +1,15 @@ -/* -Copyright The Microsoft DeepSpeed Team -*/ +// Copyright (c) Microsoft Corporation. +// SPDX-License-Identifier: Apache-2.0 + +// DeepSpeed Team + #ifdef __HIPCC__ #include "custom_hip_layers.h" #else #include "custom_cuda_layers.h" #endif + + __global__ void param_update_kernel(const float* input, __half* output, int size) { int id = blockIdx.x * blockDim.x + threadIdx.x; diff --git a/csrc/includes/StopWatch.h b/csrc/includes/StopWatch.h index a5576047c96716a373913c73d864aa91c3abc75c..42e4eb886eaa086e9eff786601ea8cb784a5b4ed 100644 --- a/csrc/includes/StopWatch.h +++ b/csrc/includes/StopWatch.h @@ -1,6 +1,7 @@ -/* -Copyright The Microsoft DeepSpeed Team -*/ +// Copyright (c) Microsoft Corporation. +// SPDX-License-Identifier: Apache-2.0 + +// DeepSpeed Team #pragma once #ifdef _WIN32 diff --git a/csrc/includes/Timer.h b/csrc/includes/Timer.h index d325cae1c5310e492318f1a709d21d37cacb2289..b67e04171dafb38151f2cfadaa1a800a84a21667 100644 --- a/csrc/includes/Timer.h +++ b/csrc/includes/Timer.h @@ -1,6 +1,7 @@ -/* -Copyright The Microsoft DeepSpeed Team -*/ +// Copyright (c) Microsoft Corporation. +// SPDX-License-Identifier: Apache-2.0 + +// DeepSpeed Team #ifndef __TIMER_H__ #define __TIMER_H__ diff --git a/csrc/includes/compat.h b/csrc/includes/compat.h index 86f84a85065c9582119296223bb24193e71e060b..6d54446d472e9dfa6141436f55e0e623be48acb4 100644 --- a/csrc/includes/compat.h +++ b/csrc/includes/compat.h @@ -1,6 +1,11 @@ -/* Copyright 2020 The Microsoft DeepSpeed Team - Copyright NVIDIA/apex - This file is adapted from fused adam in NVIDIA/apex, commit a109f85 +// Copyright (c) Microsoft Corporation. +// SPDX-License-Identifier: Apache-2.0 + +// DeepSpeed Team + +/* +Copyright NVIDIA/apex +This file is adapted from fused adam in NVIDIA/apex, commit a109f85 */ #ifndef TORCH_CHECK diff --git a/csrc/includes/context.h b/csrc/includes/context.h index a8968ba1f0b479481c0aea2e88d58ff805972513..3a9067dc3b9f730807b70d51d87d1b82c3c28055 100644 --- a/csrc/includes/context.h +++ b/csrc/includes/context.h @@ -1,6 +1,7 @@ -/* -Copyright The Microsoft DeepSpeed Team -*/ +// Copyright (c) Microsoft Corporation. 
+// SPDX-License-Identifier: Apache-2.0 + +// DeepSpeed Team #pragma once @@ -43,9 +44,9 @@ inline int DS_GET_BLOCKS(const int N) 1); } -class Context { +class TrainingContext { public: - Context() : _workspace(nullptr), _seed(42), _curr_offset(0) + TrainingContext() : _workspace(nullptr), _seed(42), _curr_offset(0) { curandCreateGenerator(&_gen, CURAND_RNG_PSEUDO_DEFAULT); curandSetPseudoRandomGeneratorSeed(_gen, 123); @@ -56,15 +57,15 @@ public: } } - virtual ~Context() + virtual ~TrainingContext() { cublasDestroy(_cublasHandle); cudaFree(_workspace); } - static Context& Instance() + static TrainingContext& Instance() { - static Context _ctx; + static TrainingContext _ctx; return _ctx; } diff --git a/csrc/includes/conversion_utils.h b/csrc/includes/conversion_utils.h index f033e357f3b63e6e531fc6525fe110938fe2bdc3..0bd97055c3ebffba40c4855d1e8fbcfc1cf9dbab 100644 --- a/csrc/includes/conversion_utils.h +++ b/csrc/includes/conversion_utils.h @@ -1,6 +1,7 @@ -/* -Copyright 2022 The Microsoft DeepSpeed Team -*/ +// Copyright (c) Microsoft Corporation. +// SPDX-License-Identifier: Apache-2.0 + +// DeepSpeed Team #pragma once @@ -262,12 +263,16 @@ DS_D_INLINE float2 to(__nv_bfloat162 val) #endif /********************* To Half Conversions *********************/ -//aiss -//template <> -//DS_D_INLINE __half to(double val) -//{ -// return __double2half(val); -//} +template <> +DS_D_INLINE __half to(double val) +{ +#ifdef __HIP_PLATFORM_HCC__ + float val_f = __double2float_rn(val); + return __float2half(val_f); +#else + return __double2half(val); +#endif +} template <> DS_D_INLINE __half to(float val) { @@ -329,6 +334,11 @@ DS_D_INLINE __half2 to(float2 val) { return __float22half2_rn(val); } +template <> +DS_D_INLINE __half2 to(float val) +{ + return __float2half2_rn(val); +} #ifdef BF16_AVAILABLE // No direct conversion @@ -401,6 +411,11 @@ DS_D_INLINE __nv_bfloat162 to(float2 val) return __float22bfloat162_rn(val); } template <> +DS_D_INLINE __nv_bfloat162 to(float val) +{ + return __float2bfloat162_rn(val); +} +template <> DS_D_INLINE __nv_bfloat162 to(__half2 val) { return to<__nv_bfloat162>(to(val)); diff --git a/csrc/includes/cpu_adagrad.h b/csrc/includes/cpu_adagrad.h index 0dda4f759ece406d905d07ea1f879891122285c3..ba40fcf7b62a78544ea3eedbcd91a6b6424ed613 100644 --- a/csrc/includes/cpu_adagrad.h +++ b/csrc/includes/cpu_adagrad.h @@ -1,6 +1,7 @@ -/* -Copyright The Microsoft DeepSpeed Team -*/ +// Copyright (c) Microsoft Corporation. +// SPDX-License-Identifier: Apache-2.0 + +// DeepSpeed Team #pragma once @@ -38,8 +39,8 @@ public: cudaMallocHost((void**)_doubled_buffer, TILE * sizeof(float)); cudaMallocHost((void**)(_doubled_buffer + 1), TILE * sizeof(float)); - _streams[0] = Context::Instance().GetCurrentStream(); - _streams[1] = Context::Instance().GetNewStream(); + _streams[0] = TrainingContext::Instance().GetCurrentStream(); + _streams[1] = TrainingContext::Instance().GetNewStream(); _buf_index = false; #endif } diff --git a/csrc/includes/cpu_adam.h b/csrc/includes/cpu_adam.h index e9e139aa849273666c2fce5d668cf0dc938d162a..4648aede93eece1f45f40c70a11947825fdc0dda 100644 --- a/csrc/includes/cpu_adam.h +++ b/csrc/includes/cpu_adam.h @@ -1,6 +1,7 @@ -/* -Copyright The Microsoft DeepSpeed Team -*/ +// Copyright (c) Microsoft Corporation. 
+// SPDX-License-Identifier: Apache-2.0 + +// DeepSpeed Team #pragma once @@ -53,8 +54,8 @@ public: cudaMallocHost((void**)_doubled_buffer, TILE * sizeof(float)); cudaMallocHost((void**)(_doubled_buffer + 1), TILE * sizeof(float)); - _streams[0] = Context::Instance().GetCurrentStream(); - _streams[1] = Context::Instance().GetNewStream(); + _streams[0] = TrainingContext::Instance().GetCurrentStream(); + _streams[1] = TrainingContext::Instance().GetNewStream(); _buf_index = false; #endif } diff --git a/csrc/includes/cublas_wrappers.h b/csrc/includes/cublas_wrappers.h index 68e7f48e5083ba3990e0e0ab0121addf142e52be..b016832dc9b3a992f48b2c5a7addfc11e7a878b7 100644 --- a/csrc/includes/cublas_wrappers.h +++ b/csrc/includes/cublas_wrappers.h @@ -1,6 +1,7 @@ -/* -Copyright The Microsoft DeepSpeed Team -*/ +// Copyright (c) Microsoft Corporation. +// SPDX-License-Identifier: Apache-2.0 + +// DeepSpeed Team #pragma once diff --git a/csrc/includes/custom_cuda_layers.h b/csrc/includes/custom_cuda_layers.h index 7ac04154dc57396986c75e3f201023529732a9f8..265eb7b1244403e658c83753c4a079bae375e04e 100644 --- a/csrc/includes/custom_cuda_layers.h +++ b/csrc/includes/custom_cuda_layers.h @@ -1,6 +1,7 @@ -/* -Copyright 2022 The Microsoft DeepSpeed Team -*/ +// Copyright (c) Microsoft Corporation. +// SPDX-License-Identifier: Apache-2.0 + +// DeepSpeed Team #pragma once diff --git a/csrc/includes/dequantization_utils.h b/csrc/includes/dequantization_utils.h index fea7505c8a13041c80b778be3434b1531fe2f6b5..37a7cdf27a014a4eae42b9400ae0c2a12132c449 100644 --- a/csrc/includes/dequantization_utils.h +++ b/csrc/includes/dequantization_utils.h @@ -1,6 +1,7 @@ -/* -Copyright 2022 The Microsoft DeepSpeed Team -*/ +// Copyright (c) Microsoft Corporation. +// SPDX-License-Identifier: Apache-2.0 + +// DeepSpeed Team #include "conversion_utils.h" #include "ds_kernel_utils.h" diff --git a/csrc/includes/dropout.h b/csrc/includes/dropout.h index 2fdf224952954a23548db7ddb4c49119626e8f83..c00943a97687c3514ff96f63b4b0540805e9b545 100644 --- a/csrc/includes/dropout.h +++ b/csrc/includes/dropout.h @@ -1,6 +1,7 @@ -/* -Copyright The Microsoft DeepSpeed Team -*/ +// Copyright (c) Microsoft Corporation. +// SPDX-License-Identifier: Apache-2.0 + +// DeepSpeed Team #pragma once diff --git a/csrc/includes/ds_kernel_utils.h b/csrc/includes/ds_kernel_utils.h index 10210663fcfb881af18ccdf9aa6c005f8fa6221f..61d4248465894f56237641d08d100511160187a2 100644 --- a/csrc/includes/ds_kernel_utils.h +++ b/csrc/includes/ds_kernel_utils.h @@ -1,6 +1,9 @@ -/* -Copyright 2022 The Microsoft DeepSpeed Team +// Copyright (c) Microsoft Corporation. +// SPDX-License-Identifier: Apache-2.0 + +// DeepSpeed Team +/* Centralized header file for preprocessor macros and constants used throughout the codebase. */ diff --git a/csrc/includes/ds_transformer_cuda.h b/csrc/includes/ds_transformer_cuda.h old mode 100755 new mode 100644 index d289a24c6b4cd37c83a0bbb192f3e437c77e3d6e..4dbd1d4773385db8e40a9e2f0ac8d6ebf2433c7a --- a/csrc/includes/ds_transformer_cuda.h +++ b/csrc/includes/ds_transformer_cuda.h @@ -1,6 +1,7 @@ -/* -Copyright The Microsoft DeepSpeed Team -*/ +// Copyright (c) Microsoft Corporation. 
+// SPDX-License-Identifier: Apache-2.0 + +// DeepSpeed Team #pragma once diff --git a/csrc/includes/feed_forward.h b/csrc/includes/feed_forward.h index 0f2ece4aabd30caa7829edd5231abd330902ae78..8cf9ee9ef594ed29135f07e2a9fcee800dc064f7 100644 --- a/csrc/includes/feed_forward.h +++ b/csrc/includes/feed_forward.h @@ -1,6 +1,7 @@ -/* -Copyright The Microsoft DeepSpeed Team -*/ +// Copyright (c) Microsoft Corporation. +// SPDX-License-Identifier: Apache-2.0 + +// DeepSpeed Team #ifndef __FEEDFORWARD_H__ #define __FEEDFORWARD_H__ diff --git a/csrc/includes/gelu.h b/csrc/includes/gelu.h index 07d431484482cc1a3450d929c2e58d07f4b1aed3..b75c1510bad65101604dff2aaebfb79d5d435531 100644 --- a/csrc/includes/gelu.h +++ b/csrc/includes/gelu.h @@ -1,6 +1,7 @@ -/* -Copyright The Microsoft DeepSpeed Team -*/ +// Copyright (c) Microsoft Corporation. +// SPDX-License-Identifier: Apache-2.0 + +// DeepSpeed Team #pragma once diff --git a/csrc/includes/gemm_test.h b/csrc/includes/gemm_test.h index 7ddb8b122798fa66bc2709ffdea7393a4ec796cd..cd9fbb5a4e17fed792637d0c13bf237de9520948 100644 --- a/csrc/includes/gemm_test.h +++ b/csrc/includes/gemm_test.h @@ -1,6 +1,7 @@ -/* -Copyright The Microsoft DeepSpeed Team -*/ +// Copyright (c) Microsoft Corporation. +// SPDX-License-Identifier: Apache-2.0 + +// DeepSpeed Team #pragma once diff --git a/csrc/includes/general_kernels.h b/csrc/includes/general_kernels.h index 875df84195ea953e056fbccef8fc6e72e58656a0..28e2cbf2984f457542bd6ead0615d1ca7abc0b7d 100644 --- a/csrc/includes/general_kernels.h +++ b/csrc/includes/general_kernels.h @@ -1,6 +1,7 @@ -/* -Copyright The Microsoft DeepSpeed Team -*/ +// Copyright (c) Microsoft Corporation. +// SPDX-License-Identifier: Apache-2.0 + +// DeepSpeed Team #include #include diff --git a/csrc/includes/memory_access_utils.h b/csrc/includes/memory_access_utils.h index e2cdcb6ca8312fe9a0a4db99fe453bd4f341dda2..6789714d27c7ecb952255486fccbcc7a1686a676 100644 --- a/csrc/includes/memory_access_utils.h +++ b/csrc/includes/memory_access_utils.h @@ -1,6 +1,7 @@ -/* -Copyright 2022 The Microsoft DeepSpeed Team -*/ +// Copyright (c) Microsoft Corporation. +// SPDX-License-Identifier: Apache-2.0 + +// DeepSpeed Team #pragma once diff --git a/csrc/includes/normalize_layer.h b/csrc/includes/normalize_layer.h index 092129362f368cb0825b81c057de2ec533ee5f4c..b9c719087a6ba0af693589f3c36da452b82e21d7 100644 --- a/csrc/includes/normalize_layer.h +++ b/csrc/includes/normalize_layer.h @@ -1,6 +1,7 @@ -/* -Copyright The Microsoft DeepSpeed Team -*/ +// Copyright (c) Microsoft Corporation. +// SPDX-License-Identifier: Apache-2.0 + +// DeepSpeed Team #pragma once diff --git a/csrc/includes/quantization.h b/csrc/includes/quantization.h index 89867748280c3fef2b14cd39d42f897725fafa16..826797889ebbef4b07f1b21c0d090704a9be23b5 100644 --- a/csrc/includes/quantization.h +++ b/csrc/includes/quantization.h @@ -1,6 +1,7 @@ -/* -Copyright The Microsoft DeepSpeed Team -*/ +// Copyright (c) Microsoft Corporation. +// SPDX-License-Identifier: Apache-2.0 + +// DeepSpeed Team #pragma once diff --git a/csrc/includes/quantization_utils.h b/csrc/includes/quantization_utils.h index 8b14d1dc8f67a52b4d1484cf2ee76e6d9d38334e..26db86ec1e0b0eeb46c5e81eff9c2071868251d5 100644 --- a/csrc/includes/quantization_utils.h +++ b/csrc/includes/quantization_utils.h @@ -1,6 +1,7 @@ -/* -Copyright 2022 The Microsoft DeepSpeed Team -*/ +// Copyright (c) Microsoft Corporation. 
+// SPDX-License-Identifier: Apache-2.0 + +// DeepSpeed Team #include #include "conversion_utils.h" @@ -101,9 +102,9 @@ public: if (max == min) { scale = 1.0; } else { - scale = (1 << numBits) / (max - min); + scale = ((1 << numBits)) / (max - min); } - offset = -(1 << (numBits - 1)) - (min * scale); + offset = (max + min) / 2; } DS_D_INLINE int8_t quantize(__half val) @@ -111,7 +112,7 @@ public: constexpr int32_t q_min = -(1 << (numBits - 1)); constexpr int32_t q_max = (1 << (numBits - 1)) - 1; - float val_f = conversion::to(val) * scale + offset; + float val_f = (conversion::to(val) - offset) * scale; int32_t data_i32 = conversion::to(val_f); data_i32 = min(max(data_i32, q_min), q_max); return (int8_t)data_i32; @@ -120,7 +121,7 @@ public: template DS_D_INLINE T dequantize(int8_t val) { - const float val_deq_f = conversion::to(val) * scale + offset; + const float val_deq_f = ((conversion::to(val)) * scale) + offset; return conversion::to<__half>(val_deq_f); } diff --git a/csrc/includes/quantizer.h b/csrc/includes/quantizer.h index fe363c809607e8d805bae70355f12f7828295e7a..2ae10bad352739cb8e32b861d7bac9319df8f0a9 100644 --- a/csrc/includes/quantizer.h +++ b/csrc/includes/quantizer.h @@ -1,10 +1,16 @@ -/* -Copyright The Microsoft DeepSpeed Team -*/ +// Copyright (c) Microsoft Corporation. +// SPDX-License-Identifier: Apache-2.0 + +// DeepSpeed Team #pragma once +#ifdef __HIP_PLATFORM_HCC__ +#include +#else #include +#endif + #include #include #include diff --git a/csrc/includes/reduction_utils.h b/csrc/includes/reduction_utils.h index fabf19dea9111219b29cc3778a4b416f555c82f3..54427983b021be59f4bcc4f2764f0708561969cd 100644 --- a/csrc/includes/reduction_utils.h +++ b/csrc/includes/reduction_utils.h @@ -1,6 +1,7 @@ -/* -Copyright 2022 The Microsoft DeepSpeed Team -*/ +// Copyright (c) Microsoft Corporation. +// SPDX-License-Identifier: Apache-2.0 + +// DeepSpeed Team #pragma once diff --git a/csrc/includes/simd.h b/csrc/includes/simd.h index 61f6ffdf3cbcc99628ce03bd482e6835ee95437d..712dd5b32e966738aa8e3ba63967cd46467229c8 100644 --- a/csrc/includes/simd.h +++ b/csrc/includes/simd.h @@ -1,6 +1,7 @@ -/* -Copyright The Microsoft DeepSpeed Team -*/ +// Copyright (c) Microsoft Corporation. +// SPDX-License-Identifier: Apache-2.0 + +// DeepSpeed Team #pragma once diff --git a/csrc/includes/softmax.h b/csrc/includes/softmax.h old mode 100755 new mode 100644 index 676a35d5292ab4462d755828d8a0034fa3c8b518..b6320a3f91bec40b63d7f96390a6031a61a9d2cc --- a/csrc/includes/softmax.h +++ b/csrc/includes/softmax.h @@ -1,6 +1,7 @@ -/* -Copyright The Microsoft DeepSpeed Team -*/ +// Copyright (c) Microsoft Corporation. +// SPDX-License-Identifier: Apache-2.0 + +// DeepSpeed Team #pragma once diff --git a/csrc/includes/strided_batch_gemm.h b/csrc/includes/strided_batch_gemm.h index 6a1d65cfc7baeb94d8f4dcc246fdb67ebac4acbb..cd126f4b05842cc4c4e8b3b24bb44de830e4219c 100644 --- a/csrc/includes/strided_batch_gemm.h +++ b/csrc/includes/strided_batch_gemm.h @@ -1,6 +1,7 @@ -/* -Copyright The Microsoft DeepSpeed Team -*/ +// Copyright (c) Microsoft Corporation. +// SPDX-License-Identifier: Apache-2.0 + +// DeepSpeed Team #pragma once diff --git a/csrc/includes/type_shim.h b/csrc/includes/type_shim.h index 9d28019be87745781058e493f491e9973caef23f..78ade5a06daa12cf628e9966ce2ba8424a6e96ba 100644 --- a/csrc/includes/type_shim.h +++ b/csrc/includes/type_shim.h @@ -1,6 +1,7 @@ -/* -Copyright The Microsoft DeepSpeed Team -*/ +// Copyright (c) Microsoft Corporation. 
+// SPDX-License-Identifier: Apache-2.0 + +// DeepSpeed Team /* Taken from NVIDIA/apex commit 855808f3fc268e9715d613f3c2e56469d8c986d8 */ #include diff --git a/csrc/lamb/fused_lamb_cuda.cpp b/csrc/lamb/fused_lamb_cuda.cpp index 7a142b13b00ccafbc102b5217c9567ec42384af7..4b5175e924bfb87054e6eb47874f4d4c1a38ac4f 100644 --- a/csrc/lamb/fused_lamb_cuda.cpp +++ b/csrc/lamb/fused_lamb_cuda.cpp @@ -1,4 +1,8 @@ -/* Copyright 2019 The Microsoft DeepSpeed Team */ +// Copyright (c) Microsoft Corporation. +// SPDX-License-Identifier: Apache-2.0 + +// DeepSpeed Team + #include // CUDA forward declaration diff --git a/csrc/lamb/fused_lamb_cuda_kernel.cu b/csrc/lamb/fused_lamb_cuda_kernel.cu index a957eb6287515d26f96400936e0ed1aa8f54b0e8..ca94a8e5ec2c2e888a4937f6ecf515dccceb694f 100644 --- a/csrc/lamb/fused_lamb_cuda_kernel.cu +++ b/csrc/lamb/fused_lamb_cuda_kernel.cu @@ -1,4 +1,8 @@ -/* Copyright 2019 The Microsoft DeepSpeed Team */ +// Copyright (c) Microsoft Corporation. +// SPDX-License-Identifier: Apache-2.0 + +// DeepSpeed Team + #include #include #include diff --git a/csrc/quantization/dequantize.cu b/csrc/quantization/dequantize.cu index 3185b5f9c98ca619353ffcc330d4441f7a1a105b..783ffb7a41e396d73e9c0254826012a1ddc6655c 100644 --- a/csrc/quantization/dequantize.cu +++ b/csrc/quantization/dequantize.cu @@ -1,6 +1,7 @@ -/* -Copyright 2022 The Microsoft DeepSpeed Team -*/ +// Copyright (c) Microsoft Corporation. +// SPDX-License-Identifier: Apache-2.0 + +// DeepSpeed Team #include "dequantization_utils.h" #include "memory_access_utils.h" diff --git a/csrc/quantization/fake_quantizer.cu b/csrc/quantization/fake_quantizer.cu index 5d23c6ea4a6bce86d6437fc02d7d82d544878fb5..0f53e52359214a345245ca9e1902edcff75e32d2 100644 --- a/csrc/quantization/fake_quantizer.cu +++ b/csrc/quantization/fake_quantizer.cu @@ -1,6 +1,7 @@ -/* -Copyright The Microsoft DeepSpeed Team -*/ +// Copyright (c) Microsoft Corporation. +// SPDX-License-Identifier: Apache-2.0 + +// DeepSpeed Team #include #include "custom_cuda_layers.h" @@ -456,7 +457,7 @@ void launch_sr_fake_quantize_kernel(T* vals, dim3 grid_dim(group_num); uint64_t inc = total_count / grid_dim.x / block_dim.x; - std::pair seed = Context::Instance().IncrementOffset(inc); + std::pair seed = TrainingContext::Instance().IncrementOffset(inc); sr_fake_quantize_kernel<<>>( vals, (total_count / group_num) / 4, group_num, num_bits, seed); @@ -1010,7 +1011,7 @@ void launch_sr_fake_quantize_kernel_asym(T* vals, dim3 grid_dim(group_num); uint64_t inc = total_count / grid_dim.x / block_dim.x; - std::pair seed = Context::Instance().IncrementOffset(inc); + std::pair seed = TrainingContext::Instance().IncrementOffset(inc); sr_fake_quantize_kernel<<>>( vals, (total_count / group_num) / 4, group_num, num_bits, seed); diff --git a/csrc/quantization/pt_binding.cpp b/csrc/quantization/pt_binding.cpp index 1465cabd296140114661a1a1ef7b3a003827eb83..ccc0c15be1a668621a23411f09356dceb2ffa1ee 100644 --- a/csrc/quantization/pt_binding.cpp +++ b/csrc/quantization/pt_binding.cpp @@ -1,3 +1,8 @@ +// Copyright (c) Microsoft Corporation. +// SPDX-License-Identifier: Apache-2.0 + +// DeepSpeed Team + #include #include #include diff --git a/csrc/quantization/quantize.cu b/csrc/quantization/quantize.cu index b9c925c19f8759c62da634c8a8b8f74974ac4cc7..b04d0e968ba58a203f2b5e24790053cbd43bc74a 100644 --- a/csrc/quantization/quantize.cu +++ b/csrc/quantization/quantize.cu @@ -1,6 +1,7 @@ -/* -Copyright 2022 The Microsoft DeepSpeed Team -*/ +// Copyright (c) Microsoft Corporation. 
+// SPDX-License-Identifier: Apache-2.0 + +// DeepSpeed Team #include "ds_kernel_utils.h" #include "memory_access_utils.h" diff --git a/csrc/quantization/quantizer.cu b/csrc/quantization/quantizer.cu deleted file mode 100644 index 37883410e976c6daaece041d86a8a6e78838a00d..0000000000000000000000000000000000000000 --- a/csrc/quantization/quantizer.cu +++ /dev/null @@ -1,1037 +0,0 @@ -#include -#include "custom_cuda_layers.h" - -namespace cg = cooperative_groups; - -__global__ void quantize_kernel(__half* vals, int group_size, int num_bits) -{ -#if __CUDA_ARCH__ >= 700 || defined(__HIP_PLATFORM_HCC__) - - cg::thread_block b = cg::this_thread_block(); - cg::thread_block_tile<32> g = cg::tiled_partition<32>(b); - - int gid = threadIdx.x >> 5; - int lane = threadIdx.x & 0x1f; - int warp_num = blockDim.x >> 5; - int id = threadIdx.x; - - float2* vals_cast = reinterpret_cast(vals); - - float2 data[MAX_REG]; - - int group_id = blockIdx.x; - - { - int group_index = id; - int reg_count = 0; - int offset = group_id * group_size; - float max = -10000.0; - - while (group_index < group_size && reg_count < MAX_REG) { - data[reg_count] = vals_cast[offset + group_index]; - __half* data_h = reinterpret_cast<__half*>(&data[reg_count]); - - if (abs((float)data_h[0]) > max) max = abs((float)data_h[0]); - if (abs((float)data_h[1]) > max) max = abs((float)data_h[1]); - if (abs((float)data_h[2]) > max) max = abs((float)data_h[2]); - if (abs((float)data_h[3]) > max) max = abs((float)data_h[3]); - - group_index += blockDim.x; - reg_count++; - } - -#pragma unroll - for (int i = 1; i < WARP_SIZE; i <<= 1) { - auto temp = g.shfl_xor(max, i); - if (max < temp) max = temp; - } - __shared__ float partialMax[WARP_SIZE]; - - if (lane == 0) partialMax[gid] = max; - - b.sync(); - - if (lane < warp_num) max = partialMax[lane]; - -#pragma unroll - for (int i = 1; i < WARP_SIZE; i <<= 1) { - auto temp = g.shfl_down(max, i); - if (max < temp) max = temp; - } - - max = g.shfl(max, 0); - - float q_scale = (1 << num_bits) / (2 * max + 1e-5); - float q_scale_inv = 1 / q_scale; - for (int i = 0; i < reg_count; i++) { - group_index = i * blockDim.x + id; - if (group_index < group_size) { - __half2* data_h = reinterpret_cast<__half2*>(&data[i]); - float2 q_data[2]; - q_data[0] = __half22float2(data_h[0]); - q_data[1] = __half22float2(data_h[1]); - - float2 q_data_int[2]; - - q_data_int[0].x = roundf(q_data[0].x * q_scale); - q_data_int[0].y = roundf(q_data[0].y * q_scale); - q_data_int[1].x = roundf(q_data[1].x * q_scale); - q_data_int[1].y = roundf(q_data[1].y * q_scale); - - q_data_int[0].x *= q_scale_inv; - q_data_int[0].y *= q_scale_inv; - q_data_int[1].x *= q_scale_inv; - q_data_int[1].y *= q_scale_inv; - - data_h[0] = __float22half2_rn(q_data_int[0]); - data_h[1] = __float22half2_rn(q_data_int[1]); - - vals_cast[offset + group_index] = data[i]; - } - } - } -#endif -} - -__global__ void quantize_kernel(float* vals, int group_size, int num_bits) -{ - cg::thread_block b = cg::this_thread_block(); - cg::thread_block_tile<32> g = cg::tiled_partition<32>(b); - - int gid = threadIdx.x >> 5; - int lane = threadIdx.x & 0x1f; - int warp_num = blockDim.x >> 5; - int id = threadIdx.x; - - float4* vals_cast = reinterpret_cast(vals); - - float4 data[MAX_REG]; - - int bid = blockIdx.x; - - int group_index = bid * group_size + id; - int reg_count = 0; - - float max = -10000.0; - - while (id < group_size && reg_count < MAX_REG) { - float4 data_reg = vals_cast[group_index]; - data[reg_count] = data_reg; - - if (abs(data_reg.x) > max) max = 
abs(data_reg.x); - if (abs(data_reg.y) > max) max = abs(data_reg.y); - if (abs(data_reg.z) > max) max = abs(data_reg.z); - if (abs(data_reg.w) > max) max = abs(data_reg.w); - - group_index += blockDim.x; - id += blockDim.x; - reg_count++; - } - id = threadIdx.x; -#pragma unroll - for (int i = 1; i < WARP_SIZE; i <<= 1) { - auto temp = g.shfl_xor(max, i); - if (max < temp) max = temp; - } - __shared__ float partialMax[WARP_SIZE]; - - if (lane == 0) partialMax[gid] = max; - - b.sync(); - - if (lane < warp_num) max = partialMax[lane]; - - b.sync(); - -#pragma unroll - for (int i = 1; i < warp_num; i <<= 1) { - auto temp = g.shfl_down(max, i); - if (max < temp) max = temp; - } - - max = g.shfl(max, 0); - - float q_scale = (1 << num_bits) / (2 * max + 1e-5); - float q_scale_inv = 1 / q_scale; - for (int i = 0; i < reg_count; i++) { - group_index = i * blockDim.x + id; - if (group_index < group_size) { - float4 q_data; - q_data = data[i]; - - float4 q_data_int; - q_data_int.x = roundf(q_data.x * q_scale); - q_data_int.y = roundf(q_data.y * q_scale); - q_data_int.w = roundf(q_data.w * q_scale); - q_data_int.z = roundf(q_data.z * q_scale); - - q_data.x = q_data_int.x * q_scale_inv; - q_data.y = q_data_int.y * q_scale_inv; - q_data.w = q_data_int.w * q_scale_inv; - q_data.z = q_data_int.z * q_scale_inv; - - vals_cast[group_index + bid * group_size] = q_data; - } - } -} - -template -void launch_quantize_kernel(T* vals, - int total_count, - int group_num, - int num_bits, - cudaStream_t stream) -{ - dim3 grid_dim(group_num); - dim3 block_dim(1024); - - quantize_kernel<<>>( - vals, (total_count / group_num) / 4, num_bits); -} - -template void launch_quantize_kernel(float* vals, - int total_count, - int group_num, - int num_bits, - cudaStream_t stream); -template void launch_quantize_kernel(__half* vals, - int total_count, - int group_num, - int num_bits, - cudaStream_t stream); - -__global__ void sr_quantize_kernel(__half* vals, - int token_size, - int token_num, - int num_bits, - std::pair seed) -{ -#if __CUDA_ARCH__ >= 700 || defined(__HIP_PLATFORM_HCC__) - - cg::thread_block b = cg::this_thread_block(); - cg::thread_block_tile<32> g = cg::tiled_partition<32>(b); - - int gid = threadIdx.x >> 5; - int lane = threadIdx.x & 0x1f; - int warp_num = blockDim.x >> 5; - - int idx = blockIdx.x * blockDim.x + threadIdx.x; - - float2* vals_cast = reinterpret_cast(vals); - - __half2 data_low[128]; - __half2 data_high[128]; - - int bid = blockIdx.x; - - curandStatePhilox4_32_10_t state; - curand_init(seed.first, idx, seed.second, &state); - unsigned int tid = threadIdx.x; - int reg_count = 0; - int offset = bid * token_size; - int group_index = bid * token_size + tid; - - int total_count = token_size * token_num; - if (group_index < total_count) { - // float min = 10000.0; - float max = -10000.0; - while (tid < token_size) { - float2 data = vals_cast[offset + tid]; - __half2* data_h = reinterpret_cast<__half2*>(&data); - data_low[reg_count] = data_h[0]; - data_high[reg_count] = data_h[1]; - - float2 data_f[2]; - data_f[0] = __half22float2(data_h[0]); - data_f[1] = __half22float2(data_h[1]); - - if (abs((float)data_f[0].x) > max) max = abs((float)data_f[0].x); - if (abs((float)data_f[0].y) > max) max = abs((float)data_f[0].y); - if (abs((float)data_f[1].x) > max) max = abs((float)data_f[1].x); - if (abs((float)data_f[1].y) > max) max = abs((float)data_f[1].y); - - tid += blockDim.x; - reg_count++; - } - -#pragma unroll - for (int i = 1; i < WARP_SIZE; i <<= 1) { - auto temp = g.shfl_xor(max, i); - if (max < temp) 
max = temp; - } - - __shared__ float partialMax[WARP_SIZE]; - - if (lane == 0) partialMax[gid] = max; - - b.sync(); - - if (lane < warp_num) max = partialMax[lane]; - -#pragma unroll - for (int i = 1; i < warp_num; i <<= 1) { - auto temp = g.shfl_down(max, i); - if (max < temp) max = temp; - } - - max = g.shfl(max, 0); - - float q_scale_val = (float)(1 << num_bits) / (max * 2 + 1e-5); - float high_q = (float)((1 << (num_bits - 1)) - 1); - float low_q = (float)(-((1 << (num_bits - 1)))); - - for (int i = 0; i < reg_count; i++) { - int token_index = i * blockDim.x + threadIdx.x; - if (token_index < token_size) { - float2 data_f[2]; - data_f[0] = __half22float2(data_low[i]); - data_f[1] = __half22float2(data_high[i]); - - float2 q_data_int[2]; - q_data_int[0].x = (float)((int)(data_f[0].x * q_scale_val)); - q_data_int[0].y = (float)((int)(data_f[0].y * q_scale_val)); - q_data_int[1].x = (float)((int)(data_f[1].x * q_scale_val)); - q_data_int[1].y = (float)((int)(data_f[1].y * q_scale_val)); - - // Stochastic rounding - float4 rand = curand_uniform4(&state); - - float q_error[4]; - q_error[0] = abs(data_f[0].x - (q_data_int[0].x / q_scale_val)) * q_scale_val; - q_error[1] = abs(data_f[0].y - (q_data_int[0].y / q_scale_val)) * q_scale_val; - q_error[2] = abs(data_f[1].x - (q_data_int[1].x / q_scale_val)) * q_scale_val; - q_error[3] = abs(data_f[1].y - (q_data_int[1].y / q_scale_val)) * q_scale_val; - - q_data_int[0].x = - (rand.x < q_error[0] && q_data_int[0].x > low_q && q_data_int[0].x < high_q) - ? (q_data_int[0].x + (data_f[0].x > 0 ? 1 : -1)) - : q_data_int[0].x; - q_data_int[0].y = - (rand.y < q_error[1] && q_data_int[0].y > low_q && q_data_int[0].y < high_q) - ? (q_data_int[0].y + (data_f[0].y > 0 ? 1 : -1)) - : q_data_int[0].y; - q_data_int[1].x = - (rand.w < q_error[2] && q_data_int[1].x > low_q && q_data_int[1].x < high_q) - ? (q_data_int[1].x + (data_f[1].x > 0 ? 1 : -1)) - : q_data_int[1].x; - q_data_int[1].y = - (rand.z < q_error[3] && q_data_int[1].y > low_q && q_data_int[1].y < high_q) - ? (q_data_int[1].y + (data_f[1].y > 0 ? 
1 : -1)) - : q_data_int[1].y; - - data_f[0].x = q_data_int[0].x / q_scale_val; - data_f[0].y = q_data_int[0].y / q_scale_val; - data_f[1].x = q_data_int[1].x / q_scale_val; - data_f[1].y = q_data_int[1].y / q_scale_val; - - float2 result; - __half2* result_h = reinterpret_cast<__half2*>(&result); - result_h[0] = __float22half2_rn(data_f[0]); - result_h[1] = __float22half2_rn(data_f[1]); - - vals_cast[offset + token_index] = result; - } - } - } -#endif -} - -__global__ void sr_quantize_kernel(float* vals, - int token_size, - int token_num, - int num_bits, - std::pair seed) -{ - cg::thread_block b = cg::this_thread_block(); - cg::thread_block_tile<32> g = cg::tiled_partition<32>(b); - - int gid = threadIdx.x >> 5; - int lane = threadIdx.x & 0x1f; - int warp_num = blockDim.x >> 5; - int id = threadIdx.x; - - int idx = blockIdx.x * blockDim.x + id; - - float4* vals_cast = reinterpret_cast(vals); - - float4 data[128]; - - int bid = blockIdx.x; - int tid = threadIdx.x; - curandStatePhilox4_32_10_t state; - curand_init(seed.first, idx, seed.second, &state); - - int group_index = bid * token_size + threadIdx.x; - int reg_count = 0; - int total_count = token_size * token_num; - if (group_index < total_count) { - // float min = 10000.0; - float max = -10000.0; - - while (tid < token_size) { - data[reg_count] = vals_cast[group_index]; - - if (abs(data[reg_count].x) > max) max = abs(data[reg_count].x); - if (abs(data[reg_count].y) > max) max = abs(data[reg_count].y); - if (abs(data[reg_count].z) > max) max = abs(data[reg_count].z); - if (abs(data[reg_count].w) > max) max = abs(data[reg_count].w); - - group_index += blockDim.x; - tid += blockDim.x; - reg_count++; - } - -#pragma unroll - for (int i = 1; i < WARP_SIZE; i <<= 1) { - auto temp = g.shfl_xor(max, i); - if (max < temp) max = temp; - } - __shared__ float partialMax[WARP_SIZE]; - - if (lane == 0) partialMax[gid] = max; - - b.sync(); - - if (lane < warp_num) max = partialMax[lane]; - -#pragma unroll - for (int i = 1; i < warp_num; i <<= 1) { - auto temp = g.shfl_down(max, i); - if (max < temp) max = temp; - } - - max = g.shfl(max, 0); - - float q_scale_val = (float)(1 << num_bits) / (max * 2 + 1e-5); - float high_q = (float)((1 << (num_bits - 1)) - 1); - float low_q = (float)(-((1 << (num_bits - 1)))); - - int offset = (bid)*token_size; - for (int i = 0; i < reg_count; i++) { - group_index = i * blockDim.x + threadIdx.x; - if (group_index < token_size) { - float4 q_data = data[i]; - - float4 q_data_int; - q_data_int.x = (float)((int)(q_data.x * q_scale_val)); - q_data_int.y = (float)((int)(q_data.y * q_scale_val)); - q_data_int.w = (float)((int)(q_data.w * q_scale_val)); - q_data_int.z = (float)((int)(q_data.z * q_scale_val)); - - // Stochastic rounding - float4 rand = curand_uniform4(&state); - - float q_error[4]; - q_error[0] = abs(q_data.x - (q_data_int.x / q_scale_val)) * q_scale_val; - q_error[1] = abs(q_data.y - (q_data_int.y / q_scale_val)) * q_scale_val; - q_error[2] = abs(q_data.w - (q_data_int.w / q_scale_val)) * q_scale_val; - q_error[3] = abs(q_data.z - (q_data_int.z / q_scale_val)) * q_scale_val; - - q_data_int.x = - (rand.x < q_error[0] && q_data_int.x > low_q && q_data_int.x < high_q) - ? (q_data_int.x + (q_data.x > 0 ? 1 : -1)) - : q_data_int.x; - q_data_int.y = - (rand.y < q_error[1] && q_data_int.y > low_q && q_data_int.y < high_q) - ? (q_data_int.y + (q_data.y > 0 ? 1 : -1)) - : q_data_int.y; - q_data_int.w = - (rand.w < q_error[2] && q_data_int.w > low_q && q_data_int.w < high_q) - ? (q_data_int.w + (q_data.w > 0 ? 
1 : -1)) - : q_data_int.w; - q_data_int.z = - (rand.z < q_error[3] && q_data_int.z > low_q && q_data_int.z < high_q) - ? (q_data_int.z + (q_data.z > 0 ? 1 : -1)) - : q_data_int.z; - - q_data_int.x /= q_scale_val; - q_data_int.y /= q_scale_val; - q_data_int.w /= q_scale_val; - q_data_int.z /= q_scale_val; - - vals_cast[group_index + offset] = q_data_int; - } - } - } -} - -template -void launch_sr_quantize_kernel(T* vals, - int total_count, - int group_num, - int num_bits, - cudaStream_t stream) -{ - dim3 block_dim(1024); - dim3 grid_dim(group_num); - - uint64_t inc = total_count / grid_dim.x / block_dim.x; - std::pair seed = Context::Instance().IncrementOffset(inc); - - sr_quantize_kernel<<>>( - vals, (total_count / group_num) / 4, group_num, num_bits, seed); -} -template void launch_sr_quantize_kernel(float* vals, - int total_count, - int group_num, - int num_bits, - cudaStream_t stream); -template void launch_sr_quantize_kernel(__half* vals, - int total_count, - int group_num, - int num_bits, - cudaStream_t stream); - -__global__ void quantize_kernel_asym(__half* vals, int group_size, int num_bits) -{ -#if __CUDA_ARCH__ >= 700 || defined(__HIP_PLATFORM_HCC__) - - cg::thread_block b = cg::this_thread_block(); - cg::thread_block_tile<32> g = cg::tiled_partition<32>(b); - - int gid = threadIdx.x >> 5; - int lane = threadIdx.x & 0x1f; - int warp_num = blockDim.x >> 5; - int id = threadIdx.x; - - float2* vals_cast = reinterpret_cast(vals); - - float2 data[MAX_REG]; - - int group_id = blockIdx.x; - - { - int group_index = id; - int reg_count = 0; - int offset = group_id * group_size; - float max = -10000.0; - float min = 10000.0; - - while (group_index < group_size && reg_count < MAX_REG) { - data[reg_count] = vals_cast[offset + group_index]; - __half* data_h = reinterpret_cast<__half*>(&data[reg_count]); - - if (((float)data_h[0]) > max) max = (float)data_h[0]; - if (((float)data_h[1]) > max) max = (float)data_h[1]; - if (((float)data_h[2]) > max) max = (float)data_h[2]; - if (((float)data_h[3]) > max) max = (float)data_h[3]; - - if (((float)data_h[0]) < min) min = (float)data_h[0]; - if (((float)data_h[1]) < min) min = (float)data_h[1]; - if (((float)data_h[2]) < min) min = (float)data_h[2]; - if (((float)data_h[3]) < min) min = (float)data_h[3]; - - group_index += blockDim.x; - reg_count++; - } - -#pragma unroll - for (int i = 1; i < WARP_SIZE; i <<= 1) { - auto temp = g.shfl_xor(max, i); - if (max < temp) max = temp; - } - -#pragma unroll - for (int i = 1; i < WARP_SIZE; i <<= 1) { - auto temp = g.shfl_xor(min, i); - if (min > temp) min = temp; - } - - __shared__ float partialMax[WARP_SIZE]; - __shared__ float partialMin[WARP_SIZE]; - - if (lane == 0) partialMax[gid] = max; - if (lane == 0) partialMin[gid] = min; - - b.sync(); - - if (lane < warp_num) max = partialMax[lane]; - if (lane < warp_num) min = partialMin[lane]; - -#pragma unroll - for (int i = 1; i < warp_num; i <<= 1) { - auto temp = g.shfl_down(max, i); - if (max < temp) max = temp; - } -#pragma unroll - for (int i = 1; i < warp_num; i <<= 1) { - auto temp = g.shfl_down(min, i); - if (min > temp) min = temp; - } - - max = g.shfl(max, 0); - min = g.shfl(min, 0); - - float q_scale = ((max - min) + 1e-5) / (float)(1 << num_bits); - float q_scale_inv = 1 / q_scale; - - for (int i = 0; i < reg_count; i++) { - group_index = i * blockDim.x + id; - if (group_index < group_size) { - __half2* data_h = reinterpret_cast<__half2*>(&data[i]); - float2 q_data[2]; - q_data[0] = __half22float2(data_h[0]); - q_data[1] = __half22float2(data_h[1]); 
- - float2 q_data_int[2]; - - q_data_int[0].x = roundf((q_data[0].x - min) * q_scale_inv); - q_data_int[0].y = roundf((q_data[0].y - min) * q_scale_inv); - q_data_int[1].x = roundf((q_data[1].x - min) * q_scale_inv); - q_data_int[1].y = roundf((q_data[1].y - min) * q_scale_inv); - - q_data_int[0].x = q_data_int[0].x * q_scale + min; - q_data_int[0].y = q_data_int[0].y * q_scale + min; - q_data_int[1].x = q_data_int[1].x * q_scale + min; - q_data_int[1].y = q_data_int[1].y * q_scale + min; - - data_h[0] = __float22half2_rn(q_data_int[0]); - data_h[1] = __float22half2_rn(q_data_int[1]); - - vals_cast[offset + group_index] = data[i]; - } - } - } -#endif -} - -__global__ void quantize_kernel_asym(float* vals, int group_size, int num_bits) -{ - cg::thread_block b = cg::this_thread_block(); - cg::thread_block_tile<32> g = cg::tiled_partition<32>(b); - - int gid = threadIdx.x >> 5; - int lane = threadIdx.x & 0x1f; - int warp_num = blockDim.x >> 5; - int id = threadIdx.x; - - float4* vals_cast = reinterpret_cast(vals); - - float4 data[MAX_REG]; - - int bid = blockIdx.x; - - int group_index = bid * group_size + id; - int reg_count = 0; - - float max = -10000.0; - float min = 10000.0; - - while (id < group_size && reg_count < MAX_REG) { - float4 data_reg = vals_cast[group_index]; - data[reg_count] = data_reg; - - if (data_reg.x > max) max = data_reg.x; - if (data_reg.y > max) max = data_reg.y; - if (data_reg.w > max) max = data_reg.w; - if (data_reg.z > max) max = data_reg.z; - - if (data_reg.x < min) min = data_reg.x; - if (data_reg.y < min) min = data_reg.y; - if (data_reg.w < min) min = data_reg.w; - if (data_reg.z < min) min = data_reg.z; - - group_index += blockDim.x; - id += blockDim.x; - reg_count++; - } - id = threadIdx.x; - -#pragma unroll - for (int i = 1; i < WARP_SIZE; i <<= 1) { - auto temp = g.shfl_xor(max, i); - if (max < temp) max = temp; - } - -#pragma unroll - for (int i = 1; i < WARP_SIZE; i <<= 1) { - auto temp = g.shfl_xor(min, i); - if (min > temp) min = temp; - } - - __shared__ float partialMax[WARP_SIZE]; - __shared__ float partialMin[WARP_SIZE]; - - if (lane == 0) partialMax[gid] = max; - if (lane == 0) partialMin[gid] = min; - - b.sync(); - - if (lane < warp_num) max = partialMax[lane]; - if (lane < warp_num) min = partialMin[lane]; - -#pragma unroll - for (int i = 1; i < warp_num; i <<= 1) { - auto temp = g.shfl_down(max, i); - if (max < temp) max = temp; - } -#pragma unroll - for (int i = 1; i < warp_num; i <<= 1) { - auto temp = g.shfl_down(min, i); - if (min > temp) min = temp; - } - - max = g.shfl(max, 0); - min = g.shfl(min, 0); - - float q_scale = ((max - min) + 1e-5) / (float)(1 << num_bits); - float q_scale_inv = 1 / q_scale; - for (int i = 0; i < reg_count; i++) { - group_index = i * blockDim.x + id; - if (group_index < group_size) { - float4 q_data; - q_data = data[i]; - - float4 q_data_int; - q_data_int.x = roundf((q_data.x - min) * q_scale_inv); - q_data_int.y = roundf((q_data.y - min) * q_scale_inv); - q_data_int.w = roundf((q_data.w - min) * q_scale_inv); - q_data_int.z = roundf((q_data.z - min) * q_scale_inv); - - q_data.x = q_data_int.x * q_scale + min; - q_data.y = q_data_int.y * q_scale + min; - q_data.w = q_data_int.w * q_scale + min; - q_data.z = q_data_int.z * q_scale + min; - - vals_cast[group_index + bid * group_size] = q_data; - } - } -} - -template -void launch_quantize_kernel_asym(T* vals, - int total_count, - int group_num, - int num_bits, - cudaStream_t stream) -{ - dim3 grid_dim(group_num); - dim3 block_dim(1024); - - quantize_kernel_asym<<>>( 
- vals, (total_count / group_num) / 4, num_bits); -} - -template void launch_quantize_kernel_asym(float* vals, - int total_count, - int group_num, - int num_bits, - cudaStream_t stream); -template void launch_quantize_kernel_asym(__half* vals, - int total_count, - int group_num, - int num_bits, - cudaStream_t stream); - -__global__ void sr_quantize_kernel_asym(__half* vals, - int token_size, - int token_num, - int num_bits, - std::pair seed) -{ -#if __CUDA_ARCH__ >= 700 || defined(__HIP_PLATFORM_HCC__) - - cg::thread_block b = cg::this_thread_block(); - cg::thread_block_tile<32> g = cg::tiled_partition<32>(b); - - int gid = threadIdx.x >> 5; - int lane = threadIdx.x & 0x1f; - int warp_num = blockDim.x >> 5; - - int idx = blockIdx.x * blockDim.x + threadIdx.x; - - float2* vals_cast = reinterpret_cast(vals); - - __half2 data_low[128]; - __half2 data_high[128]; - - int bid = blockIdx.x; - - curandStatePhilox4_32_10_t state; - curand_init(seed.first, idx, seed.second, &state); - unsigned int tid = threadIdx.x; - int reg_count = 0; - int offset = bid * token_size; - int group_index = bid * token_size + tid; - - int total_count = token_size * token_num; - if (group_index < total_count) { - float min = 10000.0; - float max = -10000.0; - while (tid < token_size) { - float2 data = vals_cast[offset + tid]; - __half2* data_h = reinterpret_cast<__half2*>(&data); - data_low[reg_count] = data_h[0]; - data_high[reg_count] = data_h[1]; - - float2 data_f[2]; - data_f[0] = __half22float2(data_h[0]); - data_f[1] = __half22float2(data_h[1]); - - if (((float)data_f[0].x) > max) max = (float)data_f[0].x; - if (((float)data_f[0].y) > max) max = (float)data_f[0].y; - if (((float)data_f[1].x) > max) max = (float)data_f[1].x; - if (((float)data_f[1].y) > max) max = (float)data_f[1].y; - - if (((float)data_f[0].x) < min) min = (float)data_f[0].x; - if (((float)data_f[0].y) < min) min = (float)data_f[0].y; - if (((float)data_f[1].x) < min) min = (float)data_f[1].x; - if (((float)data_f[1].y) < min) min = (float)data_f[1].y; - - tid += blockDim.x; - reg_count++; - } - -#pragma unroll - for (int i = 1; i < WARP_SIZE; i <<= 1) { - auto temp = g.shfl_xor(max, i); - if (max < temp) max = temp; - } - -#pragma unroll - for (int i = 1; i < WARP_SIZE; i <<= 1) { - auto temp = g.shfl_xor(min, i); - if (min > temp) min = temp; - } - - __shared__ float partialMax[WARP_SIZE]; - __shared__ float partialMin[WARP_SIZE]; - - if (lane == 0) partialMax[gid] = max; - if (lane == 0) partialMin[gid] = min; - - b.sync(); - - if (lane < warp_num) max = partialMax[lane]; - if (lane < warp_num) min = partialMin[lane]; - -#pragma unroll - for (int i = 1; i < warp_num; i <<= 1) { - auto temp = g.shfl_down(max, i); - if (max < temp) max = temp; - } -#pragma unroll - for (int i = 1; i < warp_num; i <<= 1) { - auto temp = g.shfl_down(min, i); - if (min > temp) min = temp; - } - - max = g.shfl(max, 0); - min = g.shfl(min, 0); - - float q_scale_val = ((max - min) + 1e-5) / (float)(1 << num_bits); - float q_scale_val_inv = 1 / q_scale_val; - float high_q = (float)((1 << num_bits) - 1); - - for (int i = 0; i < reg_count; i++) { - int token_index = i * blockDim.x + threadIdx.x; - if (token_index < token_size) { - float2 data_f[2]; - data_f[0] = __half22float2(data_low[i]); - data_f[1] = __half22float2(data_high[i]); - - float2 q_data_int[2]; - q_data_int[0].x = (float)((unsigned int)((data_f[0].x - min) * q_scale_val_inv)); - q_data_int[0].y = (float)((unsigned int)((data_f[0].y - min) * q_scale_val_inv)); - q_data_int[1].x = (float)((unsigned 
int)((data_f[1].x - min) * q_scale_val_inv)); - q_data_int[1].y = (float)((unsigned int)((data_f[1].y - min) * q_scale_val_inv)); - - // Stochastic rounding - float4 rand = curand_uniform4(&state); - - float q_error[4]; - q_error[0] = - abs(data_f[0].x - ((q_data_int[0].x * q_scale_val) + min)) * q_scale_val_inv; - q_error[1] = - abs(data_f[0].y - ((q_data_int[0].y * q_scale_val) + min)) * q_scale_val_inv; - q_error[2] = - abs(data_f[1].x - ((q_data_int[1].x * q_scale_val) + min)) * q_scale_val_inv; - q_error[3] = - abs(data_f[1].y - ((q_data_int[1].y * q_scale_val) + min)) * q_scale_val_inv; - - q_data_int[0].x = (rand.x < q_error[0] && q_data_int[0].x < high_q) - ? (q_data_int[0].x + 1) - : q_data_int[0].x; - q_data_int[0].y = (rand.y < q_error[1] && q_data_int[0].y < high_q) - ? (q_data_int[0].y + 1) - : q_data_int[0].y; - q_data_int[1].x = (rand.w < q_error[2] && q_data_int[1].x < high_q) - ? (q_data_int[1].x + 1) - : q_data_int[1].x; - q_data_int[1].y = (rand.z < q_error[3] && q_data_int[1].y < high_q) - ? (q_data_int[1].y + 1) - : q_data_int[1].y; - - data_f[0].x = q_data_int[0].x * q_scale_val + min; - data_f[0].y = q_data_int[0].y * q_scale_val + min; - data_f[1].x = q_data_int[1].x * q_scale_val + min; - data_f[1].y = q_data_int[1].y * q_scale_val + min; - - float2 result; - __half2* result_h = reinterpret_cast<__half2*>(&result); - result_h[0] = __float22half2_rn(data_f[0]); - result_h[1] = __float22half2_rn(data_f[1]); - - vals_cast[offset + token_index] = result; - } - } - } -#endif -} - -__global__ void sr_quantize_kernel_asym(float* vals, - int token_size, - int token_num, - int num_bits, - std::pair seed) -{ - cg::thread_block b = cg::this_thread_block(); - cg::thread_block_tile<32> g = cg::tiled_partition<32>(b); - - int gid = threadIdx.x >> 5; - int lane = threadIdx.x & 0x1f; - int warp_num = blockDim.x >> 5; - int id = threadIdx.x; - - int idx = blockIdx.x * blockDim.x + id; - - float4* vals_cast = reinterpret_cast(vals); - - float4 data[128]; - - int bid = blockIdx.x; - int tid = threadIdx.x; - curandStatePhilox4_32_10_t state; - curand_init(seed.first, idx, seed.second, &state); - - int group_index = bid * token_size + threadIdx.x; - int reg_count = 0; - int total_count = token_size * token_num; - if (group_index < total_count) { - float min = 10000.0; - float max = -10000.0; - - while (tid < token_size) { - float4 data_reg = vals_cast[group_index]; - data[reg_count] = data_reg; - if (data_reg.x > max) max = data_reg.x; - if (data_reg.y > max) max = data_reg.y; - if (data_reg.w > max) max = data_reg.w; - if (data_reg.z > max) max = data_reg.z; - - if (data_reg.x < min) min = data_reg.x; - if (data_reg.y < min) min = data_reg.y; - if (data_reg.w < min) min = data_reg.w; - if (data_reg.z < min) min = data_reg.z; - - group_index += blockDim.x; - tid += blockDim.x; - reg_count++; - } - -#pragma unroll - for (int i = 1; i < WARP_SIZE; i <<= 1) { - auto temp = g.shfl_xor(max, i); - if (max < temp) max = temp; - } - -#pragma unroll - for (int i = 1; i < WARP_SIZE; i <<= 1) { - auto temp = g.shfl_xor(min, i); - if (min > temp) min = temp; - } - - __shared__ float partialMax[WARP_SIZE]; - __shared__ float partialMin[WARP_SIZE]; - - if (lane == 0) partialMax[gid] = max; - if (lane == 0) partialMin[gid] = min; - - b.sync(); - - if (lane < warp_num) max = partialMax[lane]; - if (lane < warp_num) min = partialMin[lane]; - -#pragma unroll - for (int i = 1; i < warp_num; i <<= 1) { - auto temp = g.shfl_down(max, i); - if (max < temp) max = temp; - } -#pragma unroll - for (int i = 1; i 
< warp_num; i <<= 1) { - auto temp = g.shfl_down(min, i); - if (min > temp) min = temp; - } - - max = g.shfl(max, 0); - min = g.shfl(min, 0); - - float q_scale_val = ((max - min) + 1e-5) / (float)(1 << num_bits); - float high_q = (float)((1 << num_bits) - 1); - - int offset = (bid)*token_size; - for (int i = 0; i < reg_count; i++) { - group_index = i * blockDim.x + threadIdx.x; - if (group_index < token_size) { - float4 q_data = data[i]; - - float4 q_data_int; - q_data_int.x = (float)((int)((q_data.x - min) / q_scale_val)); - q_data_int.y = (float)((int)((q_data.y - min) / q_scale_val)); - q_data_int.w = (float)((int)((q_data.w - min) / q_scale_val)); - q_data_int.z = (float)((int)((q_data.z - min) / q_scale_val)); - - // Stochastic rounding - float4 rand = curand_uniform4(&state); - - float q_error[4]; - q_error[0] = abs(q_data.x - ((q_data_int.x * q_scale_val) + min)) / q_scale_val; - q_error[1] = abs(q_data.y - ((q_data_int.y * q_scale_val) + min)) / q_scale_val; - q_error[2] = abs(q_data.w - ((q_data_int.w * q_scale_val) + min)) / q_scale_val; - q_error[3] = abs(q_data.z - ((q_data_int.z * q_scale_val) + min)) / q_scale_val; - - q_data_int.x = (rand.x < q_error[0] && q_data_int.x < high_q) ? (q_data_int.x + 1) - : q_data_int.x; - q_data_int.y = (rand.y < q_error[1] && q_data_int.y < high_q) ? (q_data_int.y + 1) - : q_data_int.y; - q_data_int.w = (rand.w < q_error[2] && q_data_int.w < high_q) ? (q_data_int.w + 1) - : q_data_int.w; - q_data_int.z = (rand.z < q_error[3] && q_data_int.z < high_q) ? (q_data_int.z + 1) - : q_data_int.z; - - q_data_int.x = q_data_int.x * q_scale_val + min; - q_data_int.y = q_data_int.y * q_scale_val + min; - q_data_int.w = q_data_int.w * q_scale_val + min; - q_data_int.z = q_data_int.z * q_scale_val + min; - - vals_cast[group_index + offset] = q_data_int; - } - } - } -} -template -void launch_sr_quantize_kernel_asym(T* vals, - int total_count, - int group_num, - int num_bits, - cudaStream_t stream) -{ - dim3 block_dim(1024); - dim3 grid_dim(group_num); - - uint64_t inc = total_count / grid_dim.x / block_dim.x; - std::pair seed = Context::Instance().IncrementOffset(inc); - - sr_quantize_kernel<<>>( - vals, (total_count / group_num) / 4, group_num, num_bits, seed); -} -template void launch_sr_quantize_kernel_asym(float* vals, - int total_count, - int group_num, - int num_bits, - cudaStream_t stream); -template void launch_sr_quantize_kernel_asym(__half* vals, - int total_count, - int group_num, - int num_bits, - cudaStream_t stream); diff --git a/csrc/random_ltd/gather_scatter.cu b/csrc/random_ltd/gather_scatter.cu index 8da74796ed1f626643f1ac534cc31915f3edcfad..291c8eb063b0e0a56843fb43d83caa5fdb00f6cb 100644 --- a/csrc/random_ltd/gather_scatter.cu +++ b/csrc/random_ltd/gather_scatter.cu @@ -1,6 +1,7 @@ -/* -Copyright 2022 The Microsoft DeepSpeed Team -*/ +// Copyright (c) Microsoft Corporation. +// SPDX-License-Identifier: Apache-2.0 + +// DeepSpeed Team #include "custom_cuda_layers.h" #include "memory_access_utils.h" diff --git a/csrc/random_ltd/pt_binding.cpp b/csrc/random_ltd/pt_binding.cpp index 54c41cab9851e11cd8ceaa7908c43dbddc69fb1d..b50476719872e80fe785cac2c17501b24e1a01e4 100644 --- a/csrc/random_ltd/pt_binding.cpp +++ b/csrc/random_ltd/pt_binding.cpp @@ -1,6 +1,7 @@ -/* -Copyright 2022 The Microsoft DeepSpeed Team -*/ +// Copyright (c) Microsoft Corporation. 
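The asymmetric quantization kernels removed above all follow the same scalar recipe: find the per-group min and max, derive a scale of (max - min + 1e-5) / 2^bits, map each value to an integer level, and, in the sr_* variants, round up stochastically with probability equal to the normalized truncation error. A minimal host-side C++ sketch of that arithmetic, for illustration only (fake_quantize_asym and its parameters are not part of the DeepSpeed API):

#include <algorithm>
#include <cmath>
#include <random>
#include <vector>

// Fake-quantize one group in place: asymmetric (min/max) scaling with optional
// stochastic rounding, mirroring the math of the deleted quantize_kernel_asym /
// sr_quantize_kernel_asym kernels, which store the dequantized values back.
void fake_quantize_asym(std::vector<float>& group, int num_bits, bool stochastic, std::mt19937& rng)
{
    const float mn = *std::min_element(group.begin(), group.end());
    const float mx = *std::max_element(group.begin(), group.end());
    const float scale = ((mx - mn) + 1e-5f) / float(1 << num_bits);
    const float scale_inv = 1.0f / scale;
    const float high_q = float((1 << num_bits) - 1);
    std::uniform_real_distribution<float> uni(0.0f, 1.0f);

    for (float& v : group) {
        float q = stochastic ? std::floor((v - mn) * scale_inv)   // truncate, then maybe bump
                             : std::round((v - mn) * scale_inv);  // nearest level
        if (stochastic) {
            // Probability of rounding up equals the normalized truncation error.
            const float err = std::fabs(v - (q * scale + mn)) * scale_inv;
            if (uni(rng) < err && q < high_q) q += 1.0f;
        }
        v = q * scale + mn;  // write back the reconstructed (dequantized) value
    }
}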
+// SPDX-License-Identifier: Apache-2.0 + +// DeepSpeed Team #include #include diff --git a/csrc/random_ltd/slice_attn_masks.cu b/csrc/random_ltd/slice_attn_masks.cu index 63d005c4b5087641a48a305332ebe265f50a5ebf..bc3823b846550cef1049e461ea35c35b53522146 100644 --- a/csrc/random_ltd/slice_attn_masks.cu +++ b/csrc/random_ltd/slice_attn_masks.cu @@ -1,6 +1,7 @@ -/* -Copyright 2022 The Microsoft DeepSpeed Team -*/ +// Copyright (c) Microsoft Corporation. +// SPDX-License-Identifier: Apache-2.0 + +// DeepSpeed Team #include "custom_cuda_layers.h" #include "memory_access_utils.h" diff --git a/csrc/random_ltd/token_sort.cu b/csrc/random_ltd/token_sort.cu index d260211f9739a5d875dcd0f56ba050f9438e20df..aae1b93e1c24744dea72ef5803c153826db4b499 100644 --- a/csrc/random_ltd/token_sort.cu +++ b/csrc/random_ltd/token_sort.cu @@ -1,6 +1,7 @@ -/* -Copyright 2022 The Microsoft DeepSpeed Team -*/ +// Copyright (c) Microsoft Corporation. +// SPDX-License-Identifier: Apache-2.0 + +// DeepSpeed Team #include #include "custom_cuda_layers.h" diff --git a/csrc/sparse_attention/utils.cpp b/csrc/sparse_attention/utils.cpp index 8e4346be8a299a09d38ce22adf1c2f80385620c1..352306ba26128b96236f43764c26d2f4191eb391 100644 --- a/csrc/sparse_attention/utils.cpp +++ b/csrc/sparse_attention/utils.cpp @@ -1,5 +1,12 @@ -// DeepSpeed note, code taken & adapted from commit 9aa94789f13ada713af36cfd8cca2fc9a7f6b79a -// https://github.com/ptillet/torch-blocksparse/blob/master/csrc/utils.cpp +// Copyright (c) Microsoft Corporation. +// SPDX-License-Identifier: Apache-2.0 + +// DeepSpeed Team + +/* +DeepSpeed note, code taken & adapted from commit 9aa94789f13ada713af36cfd8cca2fc9a7f6b79a + https:github.com/ptillet/torch-blocksparse/blob/master/csrc/utils.cpp +*/ #include #include diff --git a/csrc/spatial/csrc/opt_bias_add.cu b/csrc/spatial/csrc/opt_bias_add.cu index dfcb92facbf7e5e1769a2d734cf6770ab36c5ede..d831b372b65f398b43c5cd343b2bf2db67562f56 100644 --- a/csrc/spatial/csrc/opt_bias_add.cu +++ b/csrc/spatial/csrc/opt_bias_add.cu @@ -1,6 +1,7 @@ -/* -Copyright 2022 The Microsoft DeepSpeed Team -*/ +// Copyright (c) Microsoft Corporation. +// SPDX-License-Identifier: Apache-2.0 + +// DeepSpeed Team #include #include "memory_access_utils.h" diff --git a/csrc/spatial/csrc/pt_binding.cpp b/csrc/spatial/csrc/pt_binding.cpp index 3234b063c241ebc6b3448b4be4b62e689c8f8625..cbf6636a6ee4ac5769a88b7cab7d2a222f00d5cc 100644 --- a/csrc/spatial/csrc/pt_binding.cpp +++ b/csrc/spatial/csrc/pt_binding.cpp @@ -1,6 +1,7 @@ -/* -Copyright 2022 The Microsoft DeepSpeed Team -*/ +// Copyright (c) Microsoft Corporation. +// SPDX-License-Identifier: Apache-2.0 + +// DeepSpeed Team #include #include diff --git a/csrc/spatial/includes/spatial_cuda_layers.h b/csrc/spatial/includes/spatial_cuda_layers.h index 9948153f56d659c1c5da0f4550ffdf057e826a70..bf6ac0cf18c2123c50a727993c16f3fc6028d4d3 100644 --- a/csrc/spatial/includes/spatial_cuda_layers.h +++ b/csrc/spatial/includes/spatial_cuda_layers.h @@ -1,6 +1,7 @@ -/* -Copyright 2022 The Microsoft DeepSpeed Team -*/ +// Copyright (c) Microsoft Corporation. 
+// SPDX-License-Identifier: Apache-2.0 + +// DeepSpeed Team #pragma once @@ -8,11 +9,11 @@ Copyright 2022 The Microsoft DeepSpeed Team #define HALF_PRECISION_AVAILABLE = 1 #endif -#ifdef __HIPCC__ +#ifdef __HIP_PLATFORM_HCC__ #include #else #include -#endif +#endif #include #include diff --git a/csrc/transformer/cublas_wrappers.cu b/csrc/transformer/cublas_wrappers.cu index 97ecdd6f13ce7a5dedd7cecd14c5a23e90a22dea..b48b7086921a85e9ab005b706abd644268f7a07c 100644 --- a/csrc/transformer/cublas_wrappers.cu +++ b/csrc/transformer/cublas_wrappers.cu @@ -1,6 +1,7 @@ -/* -Copyright The Microsoft DeepSpeed Team -*/ +// Copyright (c) Microsoft Corporation. +// SPDX-License-Identifier: Apache-2.0 + +// DeepSpeed Team #include "cublas_wrappers.h" diff --git a/csrc/transformer/dropout_kernels.cu b/csrc/transformer/dropout_kernels.cu old mode 100755 new mode 100644 index b2f2abc3f09875a51662292f8e195c53da3e2fa3..97cb3592dcdc4cf0a2dc37e34fc9b13117d9bcc5 --- a/csrc/transformer/dropout_kernels.cu +++ b/csrc/transformer/dropout_kernels.cu @@ -1,6 +1,7 @@ -/* -Copyright The Microsoft DeepSpeed Team -*/ +// Copyright (c) Microsoft Corporation. +// SPDX-License-Identifier: Apache-2.0 + +// DeepSpeed Team #include "custom_cuda_layers.h" @@ -277,7 +278,7 @@ void launch_dropout(T* out, grid_dim.x <<= 1; } uint64_t inc = total_count / grid_dim.x / block_dim.x; - std::pair seed = Context::Instance().IncrementOffset(inc); + std::pair seed = TrainingContext::Instance().IncrementOffset(inc); if (bwd) dropout_kernel_bwd<<>>( total_count, ratio, vals, out, mask, seed); @@ -624,7 +625,7 @@ void launch_dropout(T* out, dim3 block_dim = DS_CUDA_NUM_THREADS; uint64_t inc = (batch * dim) / grid_dim.x / block_dim.x; - std::pair seed = Context::Instance().IncrementOffset(inc); + std::pair seed = TrainingContext::Instance().IncrementOffset(inc); dropout_kernel<<>>( total_count, dim, ratio, bias, out, mask, seed); @@ -846,7 +847,7 @@ void launch_dropout(T* out, dim3 block_dim = DS_CUDA_NUM_THREADS; uint64_t inc = (batch * dim) / grid_dim.x / block_dim.x; - std::pair seed = Context::Instance().IncrementOffset(inc); + std::pair seed = TrainingContext::Instance().IncrementOffset(inc); dropout_kernel<<>>( total_count, dim, ratio, input, residual, bias, out, mask, seed); diff --git a/csrc/transformer/ds_transformer_cuda.cpp b/csrc/transformer/ds_transformer_cuda.cpp index d2a0f6e9d82a7085d22307439821c9f7e0329bf1..fe605ad76a8f694a8c570354225d7750e43bc1f5 100644 --- a/csrc/transformer/ds_transformer_cuda.cpp +++ b/csrc/transformer/ds_transformer_cuda.cpp @@ -1,3 +1,8 @@ +// Copyright (c) Microsoft Corporation. 
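The Context -> TrainingContext renames in dropout_kernels.cu and ds_transformer_cuda.cpp touch the helper that hands every dropout launch a fresh (seed, offset) pair for the Philox-based curand state. The class below is a hypothetical stand-in that only mimics the IncrementOffset idea, written to make the bookkeeping concrete; it is not the DeepSpeed implementation:

#include <cstdint>
#include <utility>

// Toy singleton: one global seed plus a monotonically increasing offset so that
// successive kernel launches draw from disjoint slices of the Philox counter
// stream (each launch reserves `inc` counter values per thread).
class ToyTrainingContext {
public:
    static ToyTrainingContext& Instance()
    {
        static ToyTrainingContext ctx;
        return ctx;
    }

    void SetSeed(uint64_t seed) { seed_ = seed; }

    // Return the (seed, starting offset) pair a kernel would pass to curand_init,
    // then advance the offset for the next launch.
    std::pair<uint64_t, uint64_t> IncrementOffset(uint64_t inc)
    {
        const uint64_t start = offset_;
        offset_ += inc;
        return {seed_, start};
    }

private:
    uint64_t seed_ = 42;
    uint64_t offset_ = 0;
};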
+// SPDX-License-Identifier: Apache-2.0 + +// DeepSpeed Team + #include #include @@ -73,8 +78,8 @@ BertTransformerLayer::BertTransformerLayer(unsigned layer_id, _normalize_invertible(normalize_invertible), _gelu_checkpoint(gelu_checkpoint), _stochastic_mode(stochastic_mode), - _stream(Context::Instance().GetCurrentStream()), - _cublasHandle(Context::Instance().GetCublasHandle()), + _stream(TrainingContext::Instance().GetCurrentStream()), + _cublasHandle(TrainingContext::Instance().GetCublasHandle()), _qkv_linear(typename FeedForward::Config(batch_size * seq_length, 3 * hidden_size, hidden_size, @@ -179,7 +184,7 @@ void BertTransformerLayer::Forward(unsigned bsz, if (!_stochastic_mode) cudaStreamSynchronize(_stream); - T* workspace = static_cast(Context::Instance().GetWorkSpace()); + T* workspace = static_cast(TrainingContext::Instance().GetWorkSpace()); size_t small_buf_size = bsz * _seq_length * _hidden_size; T* buf_0 = workspace; T* buf_1 = buf_0 + small_buf_size; @@ -339,7 +344,7 @@ void BertTransformerLayer::Backward(unsigned bsz, if (!_stochastic_mode) cudaStreamSynchronize(_stream); - T* workspace = static_cast(Context::Instance().GetWorkSpace()); + T* workspace = static_cast(TrainingContext::Instance().GetWorkSpace()); size_t small_buf_size = bsz * _seq_length * _hidden_size; T* buf_0 = workspace; T* buf_1 = buf_0 + small_buf_size; @@ -605,25 +610,26 @@ int create_transformer_layer(unsigned layer_id, bool gelu_checkpoint, bool stochastic_mode) { - Context::Instance().SetSeed(seed); - Context::Instance().TestGemmFP16( + TrainingContext::Instance().SetSeed(seed); + TrainingContext::Instance().TestGemmFP16( test_gemm, batch_size, init_seq_length, num_heads, hidden_dim / num_heads); - auto layer = std::make_shared>(layer_id, - batch_size, - hidden_dim, - num_heads, - intermediate_size, - init_seq_length, - attn_dropout_ratio, - hidden_dropout_ratio, - layer_norm_eps, - pre_or_postLayerNorm, - Context::Instance().GetGemmAlgos(), - attn_dropout_checkpoint, - normalize_invertible, - gelu_checkpoint, - stochastic_mode); + auto layer = + std::make_shared>(layer_id, + batch_size, + hidden_dim, + num_heads, + intermediate_size, + init_seq_length, + attn_dropout_ratio, + hidden_dropout_ratio, + layer_norm_eps, + pre_or_postLayerNorm, + TrainingContext::Instance().GetGemmAlgos(), + attn_dropout_checkpoint, + normalize_invertible, + gelu_checkpoint, + stochastic_mode); s_transformer_layers[layer_id] = layer; @@ -721,7 +727,7 @@ std::vector ds_transformer_forward(unsigned layer_id, layer->IsTrainingMode(), layer->GeluCheckpoint())}, options); - Context::Instance().SetWorkSpace((T*)workspace.data_ptr()); + TrainingContext::Instance().SetWorkSpace((T*)workspace.data_ptr()); auto inp_norm = ((prelayernorm || !normalize_invertible) ? torch::empty_like(input) : output); auto add_res = (normalize_invertible ? 
inp_norm : torch::empty_like(input)); @@ -905,7 +911,7 @@ std::vector ds_transformer_backward(unsigned layer_id, layer->IsTrainingMode(), layer->GeluCheckpoint())}, options); - Context::Instance().SetWorkSpace((T*)workspace.data_ptr()); + TrainingContext::Instance().SetWorkSpace((T*)workspace.data_ptr()); auto grad_input = torch::empty_like(input); auto grad_attn_qkvw = torch::empty_like(attn_qkvw); diff --git a/csrc/transformer/gelu_kernels.cu b/csrc/transformer/gelu_kernels.cu index 1f113a988d74e3d772d8268cbd7882ba087dba03..273891b91923056fe3468600827f538c9cee5009 100644 --- a/csrc/transformer/gelu_kernels.cu +++ b/csrc/transformer/gelu_kernels.cu @@ -1,6 +1,7 @@ -/* -Copyright The Microsoft DeepSpeed Team -*/ +// Copyright (c) Microsoft Corporation. +// SPDX-License-Identifier: Apache-2.0 + +// DeepSpeed Team #include "custom_cuda_layers.h" diff --git a/csrc/transformer/general_kernels.cu b/csrc/transformer/general_kernels.cu index ea5491003dc95d8cfcaa80804fdd8ce23b9303ff..a987eec5ef0bdc87153422eae635507f44188802 100644 --- a/csrc/transformer/general_kernels.cu +++ b/csrc/transformer/general_kernels.cu @@ -1,6 +1,7 @@ -/* -Copyright The Microsoft DeepSpeed Team -*/ +// Copyright (c) Microsoft Corporation. +// SPDX-License-Identifier: Apache-2.0 + +// DeepSpeed Team #include "general_kernels.h" diff --git a/csrc/transformer/inference/csrc/apply_rotary_pos_emb.cu b/csrc/transformer/inference/csrc/apply_rotary_pos_emb.cu index 0be46353c6543a4a87c3657fef4ef60322f91481..3baa88c72f84a980d79f19c8bd558333f1df9afd 100644 --- a/csrc/transformer/inference/csrc/apply_rotary_pos_emb.cu +++ b/csrc/transformer/inference/csrc/apply_rotary_pos_emb.cu @@ -1,7 +1,9 @@ -/* -Copyright 2022 The Microsoft DeepSpeed Team -*/ +// Copyright (c) Microsoft Corporation. +// SPDX-License-Identifier: Apache-2.0 +// DeepSpeed Team + +#include "conversion_utils.h" #include "inference_cuda_layers.h" #ifndef __HIP_PLATFORM_HCC__ @@ -11,8 +13,9 @@ Copyright 2022 The Microsoft DeepSpeed Team namespace cg = cooperative_groups; namespace cg = cooperative_groups; -__global__ void apply_rotary_pos_emb(float* mixed_query, - float* key_layer, +template +__global__ void apply_rotary_pos_emb(T* mixed_query, + T* key_layer, unsigned rotary_dim, unsigned seq_len, unsigned seq_offset, @@ -39,8 +42,8 @@ __global__ void apply_rotary_pos_emb(float* mixed_query, while (lane < rotary_dim) { float inv_freq = (float)((lane / 2) * 2) / (float)rotary_dim; inv_freq = 1.0 / powf(10000.0, inv_freq) * (float)seq_id; - float q = mixed_query[offset + lane]; - float k = key_layer[k_offset + lane]; + float q = conversion::to(mixed_query[offset + lane]); + float k = conversion::to(key_layer[k_offset + lane]); float rotary_sign = (lane % 2 == 1 ? 
-1.0 : 1.0); float q_rot = (q * rotary_sign); float k_rot = (k * rotary_sign); @@ -49,59 +52,14 @@ __global__ void apply_rotary_pos_emb(float* mixed_query, q = q * cosf(inv_freq) + q_rot * sinf(inv_freq); k = k * cosf(inv_freq) + k_rot * sinf(inv_freq); - mixed_query[offset + lane] = q; - key_layer[k_offset + lane] = k; + mixed_query[offset + lane] = conversion::to(q); + key_layer[k_offset + lane] = conversion::to(k); lane += WARP_SIZE; } } } -__global__ void apply_rotary_pos_emb(__half* mixed_query, - __half* key_layer, - unsigned rotary_dim, - unsigned seq_len, - unsigned seq_offset, - unsigned num_heads, - unsigned head_size, - unsigned total_count, - int max_out_tokens) -{ - cg::thread_block b = cg::this_thread_block(); - cg::thread_block_tile g = cg::tiled_partition(b); - - int id = threadIdx.x; - int gid = id >> 5; - int lane = id & 0x1f; - - unsigned head_id = blockIdx.x * MAX_WARP_NUM + gid; - unsigned offset = head_id * head_size; - - unsigned seq_id = (head_id / num_heads) % seq_len + seq_offset; - unsigned seq_index = head_id % seq_len; - unsigned k_offset = (seq_index + (head_id / seq_len) * max_out_tokens) * head_size; - - if (head_id < total_count) { - while (lane < rotary_dim) { - float inv_freq = (float)((lane / 2) * 2) / (float)rotary_dim; - inv_freq = 1.0 / powf(10000.0, inv_freq) * (float)seq_id; - float q = (float)mixed_query[offset + lane]; - float k = (float)key_layer[k_offset + lane]; - float rotary_sign = (lane % 2 == 1 ? -1.0 : 1.0); - float q_rot = (q * rotary_sign); - float k_rot = (k * rotary_sign); - q_rot = g.shfl_xor(q_rot, 1); - k_rot = g.shfl_xor(k_rot, 1); - q = q * cosf(inv_freq) + q_rot * sinf(inv_freq); - k = k * cosf(inv_freq) + k_rot * sinf(inv_freq); - - mixed_query[offset + lane] = (__half)q; - key_layer[k_offset + lane] = (__half)k; - - lane += WARP_SIZE; - } - } -} __global__ void apply_rotary_pos_emb1(float* mixed_query, float* key_layer, unsigned rotary_dim, @@ -147,8 +105,10 @@ __global__ void apply_rotary_pos_emb1(float* mixed_query, } } } -__global__ void apply_rotary_pos_emb1(__half* mixed_query, - __half* key_layer, + +template +__global__ void apply_rotary_pos_emb1(T* mixed_query, + T* key_layer, unsigned rotary_dim, unsigned seq_len, unsigned seq_offset, @@ -184,8 +144,8 @@ __global__ void apply_rotary_pos_emb1(__half* mixed_query, while (lane < rotary_dim) { float inv_freq = (float)((lane % half_dim) * 2) / (float)rotary_dim; inv_freq = 1.0 / powf(10000.0, inv_freq) * (float)seq_id; - float q = (float)mixed_query[offset + lane]; - float k = (float)key_layer[k_offset + lane]; + float q = conversion::to(mixed_query[offset + lane]); + float k = conversion::to(key_layer[k_offset + lane]); float rotary_sign = (lane > (half_dim - 1) ? 
-1.0 : 1.0); float q_rot = (q * rotary_sign); float k_rot = (k * rotary_sign); @@ -196,8 +156,8 @@ __global__ void apply_rotary_pos_emb1(__half* mixed_query, q = q * cosf(inv_freq) + q_rot_tmp * sinf(inv_freq); k = k * cosf(inv_freq) + k_rot_tmp * sinf(inv_freq); - mixed_query[offset + lane] = (__half)q; - key_layer[k_offset + lane] = (__half)k; + mixed_query[offset + lane] = conversion::to(q); + key_layer[k_offset + lane] = conversion::to(k); lane += WARP_SIZE; } @@ -255,6 +215,20 @@ template void launch_apply_rotary_pos_emb(float*, bool, cudaStream_t, int); +#ifdef BF16_AVAILABLE +template void launch_apply_rotary_pos_emb<__nv_bfloat16>(__nv_bfloat16*, + __nv_bfloat16*, + unsigned, + unsigned, + unsigned, + unsigned, + unsigned, + unsigned, + bool, + bool, + cudaStream_t, + int); +#endif template void launch_apply_rotary_pos_emb<__half>(__half*, __half*, unsigned, @@ -268,6 +242,59 @@ template void launch_apply_rotary_pos_emb<__half>(__half*, cudaStream_t, int); +template __global__ void apply_rotary_pos_emb(float* mixed_query, + float* key_layer, + unsigned rotary_dim, + unsigned seq_len, + unsigned seq_offset, + unsigned num_heads, + unsigned head_size, + unsigned total_count, + int max_out_tokens); + +#ifdef BF16_AVAILABLE +template __global__ void apply_rotary_pos_emb(__nv_bfloat16* mixed_query, + __nv_bfloat16* key_layer, + unsigned rotary_dim, + unsigned seq_len, + unsigned seq_offset, + unsigned num_heads, + unsigned head_size, + unsigned total_count, + int max_out_tokens); +#endif + +template __global__ void apply_rotary_pos_emb(__half* mixed_query, + __half* key_layer, + unsigned rotary_dim, + unsigned seq_len, + unsigned seq_offset, + unsigned num_heads, + unsigned head_size, + unsigned total_count, + int max_out_tokens); + +#ifdef BF16_AVAILABLE +template __global__ void apply_rotary_pos_emb1(__nv_bfloat16* mixed_query, + __nv_bfloat16* key_layer, + unsigned rotary_dim, + unsigned seq_len, + unsigned seq_offset, + unsigned num_heads, + unsigned head_size, + unsigned total_count, + int max_out_tokens); +#endif + +template __global__ void apply_rotary_pos_emb1(__half* mixed_query, + __half* key_layer, + unsigned rotary_dim, + unsigned seq_len, + unsigned seq_offset, + unsigned num_heads, + unsigned head_size, + unsigned total_count, + int max_out_tokens); /* __global__ void apply_rotary_pos_emb(float* mixed_query, float* key_layer, diff --git a/csrc/transformer/inference/csrc/dequantize.cu b/csrc/transformer/inference/csrc/dequantize.cu index 33605e1f54e0d09e7948d172c6539f60d8424016..fe7b665f0efb5c82aa666040062acf6e5b916088 100644 --- a/csrc/transformer/inference/csrc/dequantize.cu +++ b/csrc/transformer/inference/csrc/dequantize.cu @@ -1,7 +1,9 @@ -/* -Copyright 2022 The Microsoft DeepSpeed Team -*/ +// Copyright (c) Microsoft Corporation. 
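Behind the templating of apply_rotary_pos_emb above, the per-element math is unchanged: each lane pair (q[2i], q[2i+1]) is rotated by an angle theta = pos / 10000^(2i / rotary_dim), which the kernel realizes with a sign flip plus a shfl_xor exchange between the paired lanes. A scalar C++ sketch of the same rotation for a single head (illustration only; the function and variable names below are not the kernel's):

#include <cmath>
#include <vector>

// Rotate query/key vectors in interleaved pairs: element 2i pairs with 2i+1 and
// both are rotated by pos / 10000^(2i / rotary_dim), matching the arithmetic of
// apply_rotary_pos_emb.
void rotary_embed(std::vector<float>& q, std::vector<float>& k, int rotary_dim, int pos)
{
    for (int i = 0; i + 1 < rotary_dim; i += 2) {
        const float inv_freq = 1.0f / std::pow(10000.0f, float(i) / float(rotary_dim));
        const float theta = inv_freq * float(pos);
        const float c = std::cos(theta), s = std::sin(theta);

        const float q0 = q[i], q1 = q[i + 1];
        const float k0 = k[i], k1 = k[i + 1];

        // (x0, x1) -> (x0*cos - x1*sin, x1*cos + x0*sin)
        q[i]     = q0 * c - q1 * s;
        q[i + 1] = q1 * c + q0 * s;
        k[i]     = k0 * c - k1 * s;
        k[i + 1] = k1 * c + k0 * s;
    }
}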
+// SPDX-License-Identifier: Apache-2.0 +// DeepSpeed Team + +#include "conversion_utils.h" #include "inference_cuda_layers.h" #define MAX_QUANTIZE_GROUPING 1024 @@ -9,7 +11,8 @@ Copyright 2022 The Microsoft DeepSpeed Team #define loop_unroll 1 #define loop_unroll_bits 1 -__global__ void dequantize_kernel(float* output, +template +__global__ void dequantize_kernel(T* output, const int8_t* input, const float* qscale, int output_size, @@ -37,40 +40,7 @@ __global__ void dequantize_kernel(float* output, float scale_data = qscale[scale_index]; - output[q_index] = (scale_data * (float)q); - tid += blockDim.x; - } -} - -__global__ void dequantize_kernel(__half* output, - const int8_t* input, - const float* qscale, - unsigned output_size, - unsigned hidden_dim, - unsigned groups, - unsigned merge_count) -{ - unsigned merge_hidden = hidden_dim >> merge_count; - unsigned quantization_stride = (merge_hidden * output_size) / groups; - - unsigned bid = blockIdx.x; - unsigned tid = threadIdx.x; - - while (tid < output_size) { - unsigned w_index = bid / merge_hidden; - unsigned q_index = tid + bid * output_size; - - auto q = input[q_index]; - - unsigned merge_hidden_total = w_index * merge_hidden; - unsigned scale_index = - ((((bid - merge_hidden_total) + tid * merge_hidden) / quantization_stride) - << merge_count) + - w_index; - - float scale_data = qscale[scale_index]; - - output[q_index] = __float2half(scale_data * (float)q); + output[q_index] = conversion::to(scale_data * (float)q); tid += blockDim.x; } } @@ -101,6 +71,18 @@ template void launch_dequantize(float*, unsigned, unsigned, cudaStream_t); + +#ifdef BF16_AVAILABLE +template void launch_dequantize<__nv_bfloat16>(__nv_bfloat16*, + const int8_t*, + const float*, + unsigned, + unsigned, + unsigned, + unsigned, + cudaStream_t); +#endif + template void launch_dequantize<__half>(__half*, const int8_t*, const float*, @@ -119,7 +101,8 @@ __global__ void dequantize_kernel(float* output, { } -__global__ void dequantize_kernel(__half* output, +template +__global__ void dequantize_kernel(T* output, const int8_t* input, const float* qscale, unsigned hidden_dim, @@ -143,12 +126,12 @@ __global__ void dequantize_kernel(__half* output, int8_t* q_int8 = (int8_t*)&q; float2 q_f; - __half* q_h = (__half*)&q_f; + T* q_h = (T*)&q_f; - q_h[0] = __float2half(local_scale * (float)q_int8[0]); - q_h[1] = __float2half(local_scale * (float)q_int8[1]); - q_h[2] = __float2half(local_scale * (float)q_int8[2]); - q_h[3] = __float2half(local_scale * (float)q_int8[3]); + q_h[0] = conversion::to(local_scale * (float)q_int8[0]); + q_h[1] = conversion::to(local_scale * (float)q_int8[1]); + q_h[2] = conversion::to(local_scale * (float)q_int8[2]); + q_h[3] = conversion::to(local_scale * (float)q_int8[3]); output_cast[tid] = q_f; tid += blockDim.x; } @@ -185,6 +168,17 @@ template void launch_dequantize(float*, unsigned, unsigned, cudaStream_t); + +#ifdef BF16_AVAILABLE +template void launch_dequantize<__nv_bfloat16>(__nv_bfloat16*, + const int8_t*, + const float*, + unsigned, + unsigned, + unsigned, + cudaStream_t); +#endif + template void launch_dequantize<__half>(__half*, const int8_t*, const float*, @@ -192,3 +186,45 @@ template void launch_dequantize<__half>(__half*, unsigned, unsigned, cudaStream_t); + +template __global__ void dequantize_kernel(float* output, + const int8_t* input, + const float* qscale, + int output_size, + int hidden_dim, + int groups, + int merge_count); + +#ifdef BF16_AVAILABLE +template __global__ void dequantize_kernel(__nv_bfloat16* output, + const 
int8_t* input, + const float* qscale, + int output_size, + int hidden_dim, + int groups, + int merge_count); +#endif + +template __global__ void dequantize_kernel(__half* output, + const int8_t* input, + const float* qscale, + int output_size, + int hidden_dim, + int groups, + int merge_count); + +#ifdef BF16_AVAILABLE +template __global__ void dequantize_kernel(__nv_bfloat16* output, + const int8_t* input, + const float* qscale, + unsigned hidden_dim, + unsigned merge_hidden, + int cnt); +#endif + +template __global__ void dequantize_kernel(__half* output, + const int8_t* input, + const float* qscale, + unsigned hidden_dim, + unsigned merge_hidden, + int cnt); diff --git a/csrc/transformer/inference/csrc/gelu.cu b/csrc/transformer/inference/csrc/gelu.cu index 71a37bb368c798df32515d8df6baddc3cd6c415b..26b6de0cdc53bbd2c2bb70aa5cd3adfa43ea4589 100644 --- a/csrc/transformer/inference/csrc/gelu.cu +++ b/csrc/transformer/inference/csrc/gelu.cu @@ -1,6 +1,7 @@ -/* -Copyright 2022 The Microsoft DeepSpeed Team -*/ +// Copyright (c) Microsoft Corporation. +// SPDX-License-Identifier: Apache-2.0 + +// DeepSpeed Team #include "conversion_utils.h" #include "inference_cuda_layers.h" @@ -10,6 +11,11 @@ namespace cg = cooperative_groups; #define MAX_CAP 4 #define MAX_SEQ 2048 +// only used to avoid compilation error due to lack of definition. +#ifndef BF16_AVAILABLE +using __nv_bfloat162 = __half2; +#endif + inline __device__ float gelu(const float x) { const float sqrt_param = 0.79788456080286535587989211986876f; @@ -65,6 +71,13 @@ void launch_bias_gelu(T* input, } template void launch_bias_gelu(float*, const float*, int, int, cudaStream_t); +#ifdef BF16_AVAILABLE +template void launch_bias_gelu<__nv_bfloat16>(__nv_bfloat16*, + const __nv_bfloat16*, + int, + int, + cudaStream_t); +#endif template void launch_bias_gelu<__half>(__half*, const __half*, int, int, cudaStream_t); /* @@ -115,6 +128,13 @@ void launch_bias_add(T* input, } template void launch_bias_add(float*, const float*, int, int, cudaStream_t); +#ifdef BF16_AVAILABLE +template void launch_bias_add<__nv_bfloat16>(__nv_bfloat16*, + const __nv_bfloat16*, + int, + int, + cudaStream_t); +#endif template void launch_bias_add<__half>(__half*, const __half*, int, int, cudaStream_t); __global__ void fused_bias_residual(float* residual, @@ -162,16 +182,19 @@ __global__ void fused_bias_residual(float* residual, } } -__global__ void fused_bias_residual(__half* residual, - const __half* hidden_state, - const __half* attn, - const __half* bias, - const __half* attn_bias, +template +__global__ void fused_bias_residual(T* residual, + const T* hidden_state, + const T* attn, + const T* bias, + const T* attn_bias, const int total_count, const int intermediate_size, const float mp_scale, const bool preln) { + using T2 = + typename std::conditional::value, __half2, __nv_bfloat162>::type; float2* res_fl2_ptr = reinterpret_cast(residual); const float2* hs_fl2_ptr = reinterpret_cast(hidden_state); const float2* attn_fl2_ptr = reinterpret_cast(attn); @@ -186,26 +209,26 @@ __global__ void fused_bias_residual(__half* residual, const float2 bias_fl2 = bias_fl2_ptr[offset % intermediate_size]; const float2 attn_bias_fl2 = attn_bias_fl2_ptr[offset % intermediate_size]; - __half2* res_half2 = reinterpret_cast<__half2*>(&res_fl2); - const __half2* hs_half2 = reinterpret_cast(&hs_fl2); - const __half2* attn_half2 = reinterpret_cast(&attn_fl2); - const __half2* bias_half2 = reinterpret_cast(&bias_fl2); - const __half2* attn_bias_half2 = reinterpret_cast(&attn_bias_fl2); + T2* 
res_half2 = reinterpret_cast(&res_fl2); + const T2* hs_half2 = reinterpret_cast(&hs_fl2); + const T2* attn_half2 = reinterpret_cast(&attn_fl2); + const T2* bias_half2 = reinterpret_cast(&bias_fl2); + const T2* attn_bias_half2 = reinterpret_cast(&attn_bias_fl2); - float2 res_low = __half22float2(res_half2[0]); - float2 res_high = __half22float2(res_half2[1]); + float2 res_low = conversion::to(res_half2[0]); + float2 res_high = conversion::to(res_half2[1]); - const float2 hs_low = __half22float2(hs_half2[0]); - const float2 hs_high = __half22float2(hs_half2[1]); + const float2 hs_low = conversion::to(hs_half2[0]); + const float2 hs_high = conversion::to(hs_half2[1]); - const float2 attn_low = __half22float2(attn_half2[0]); - const float2 attn_high = __half22float2(attn_half2[1]); + const float2 attn_low = conversion::to(attn_half2[0]); + const float2 attn_high = conversion::to(attn_half2[1]); - const float2 bias_low = __half22float2(bias_half2[0]); - const float2 bias_high = __half22float2(bias_half2[1]); + const float2 bias_low = conversion::to(bias_half2[0]); + const float2 bias_high = conversion::to(bias_half2[1]); - const float2 attn_bias_low = __half22float2(attn_bias_half2[0]); - const float2 attn_bias_high = __half22float2(attn_bias_half2[1]); + const float2 attn_bias_low = conversion::to(attn_bias_half2[0]); + const float2 attn_bias_high = conversion::to(attn_bias_half2[1]); if (preln) { // residual = (residual + attention + bias + attention_bias) * @@ -225,8 +248,8 @@ __global__ void fused_bias_residual(__half* residual, res_high.x = (res_high.x + hs_high.x + bias_high.x); res_high.y = (res_high.y + hs_high.y + bias_high.y); } - res_half2[0] = __float22half2_rn(res_low); - res_half2[1] = __float22half2_rn(res_high); + res_half2[0] = conversion::to(res_low); + res_half2[1] = conversion::to(res_high); res_fl2_ptr[offset] = res_fl2; } @@ -261,9 +284,43 @@ void launch_bias_residual(T* residual, template void launch_bias_residual< float>(float*, float*, float*, float*, float*, int, int, int, bool, cudaStream_t); +#ifdef BF16_AVAILABLE +template void launch_bias_residual<__nv_bfloat16>(__nv_bfloat16*, + __nv_bfloat16*, + __nv_bfloat16*, + __nv_bfloat16*, + __nv_bfloat16*, + int, + int, + int, + bool, + cudaStream_t); +#endif template void launch_bias_residual< __half>(__half*, __half*, __half*, __half*, __half*, int, int, int, bool, cudaStream_t); +#ifdef BF16_AVAILABLE +template __global__ void fused_bias_residual(__nv_bfloat16* residual, + const __nv_bfloat16* hidden_state, + const __nv_bfloat16* attn, + const __nv_bfloat16* bias, + const __nv_bfloat16* attn_bias, + const int total_count, + const int intermediate_size, + const float mp_scale, + const bool preln); +#endif + +template __global__ void fused_bias_residual(__half* residual, + const __half* hidden_state, + const __half* attn, + const __half* bias, + const __half* attn_bias, + const int total_count, + const int intermediate_size, + const float mp_scale, + const bool preln); + __global__ void gptj_residual_add(float* residual, const float* hidden_state, const float* attn, @@ -304,15 +361,18 @@ __global__ void gptj_residual_add(float* residual, } } -__global__ void gptj_residual_add(__half* residual, - const __half* hidden_state, - const __half* attn, - const __half* bias, - const __half* attn_bias, +template +__global__ void gptj_residual_add(T* residual, + const T* hidden_state, + const T* attn, + const T* bias, + const T* attn_bias, const int total_count, const int intermediate_size, const float mp_scale) { + using T2 = + 
typename std::conditional::value, __half2, __nv_bfloat162>::type; float2* res_fl2_ptr = reinterpret_cast(residual); const float2* hs_fl2_ptr = reinterpret_cast(hidden_state); const float2* attn_fl2_ptr = reinterpret_cast(attn); @@ -326,28 +386,28 @@ __global__ void gptj_residual_add(__half* residual, const float2 attn_fl2 = attn_fl2_ptr[offset]; const float2 bias_fl2 = bias_fl2_ptr[offset % intermediate_size]; - __half2* res_half2 = reinterpret_cast<__half2*>(&res_fl2); - const __half2* hs_half2 = reinterpret_cast(&hs_fl2); - const __half2* attn_half2 = reinterpret_cast(&attn_fl2); - const __half2* bias_half2 = reinterpret_cast(&bias_fl2); + T2* res_half2 = reinterpret_cast(&res_fl2); + const T2* hs_half2 = reinterpret_cast(&hs_fl2); + const T2* attn_half2 = reinterpret_cast(&attn_fl2); + const T2* bias_half2 = reinterpret_cast(&bias_fl2); - float2 res_low = __half22float2(res_half2[0]); - float2 res_high = __half22float2(res_half2[1]); + float2 res_low = conversion::to(res_half2[0]); + float2 res_high = conversion::to(res_half2[1]); - const float2 hs_low = __half22float2(hs_half2[0]); - const float2 hs_high = __half22float2(hs_half2[1]); + const float2 hs_low = conversion::to(hs_half2[0]); + const float2 hs_high = conversion::to(hs_half2[1]); - const float2 attn_low = __half22float2(attn_half2[0]); - const float2 attn_high = __half22float2(attn_half2[1]); + const float2 attn_low = conversion::to(attn_half2[0]); + const float2 attn_high = conversion::to(attn_half2[1]); - const float2 bias_low = __half22float2(bias_half2[0]); - const float2 bias_high = __half22float2(bias_half2[1]); + const float2 bias_low = conversion::to(bias_half2[0]); + const float2 bias_high = conversion::to(bias_half2[1]); if (attn_bias) { const float2 attn_bias_fl2 = attn_bias_fl2_ptr[offset % intermediate_size]; - const __half2* attn_bias_half2 = reinterpret_cast(&attn_bias_fl2); - const float2 attn_bias_low = __half22float2(attn_bias_half2[0]); - const float2 attn_bias_high = __half22float2(attn_bias_half2[1]); + const T2* attn_bias_half2 = reinterpret_cast(&attn_bias_fl2); + const float2 attn_bias_low = conversion::to(attn_bias_half2[0]); + const float2 attn_bias_high = conversion::to(attn_bias_half2[1]); // residual += attention_bias res_low.x += attn_bias_low.x; res_low.y += attn_bias_low.y; @@ -360,8 +420,8 @@ __global__ void gptj_residual_add(__half* residual, res_high.x = attn_high.x + hs_high.x + (res_high.x + bias_high.x) * mp_scale; res_high.y = attn_high.y + hs_high.y + (res_high.y + bias_high.y) * mp_scale; - res_half2[0] = __float22half2_rn(res_low); - res_half2[1] = __float22half2_rn(res_high); + res_half2[0] = conversion::to(res_low); + res_half2[1] = conversion::to(res_high); res_fl2_ptr[offset] = res_fl2; } @@ -395,6 +455,19 @@ template void launch_gptj_residual_add(float*, int, int, cudaStream_t); + +#ifdef BF16_AVAILABLE +template void launch_gptj_residual_add<__nv_bfloat16>(__nv_bfloat16*, + __nv_bfloat16*, + __nv_bfloat16*, + __nv_bfloat16*, + __nv_bfloat16*, + int, + int, + int, + cudaStream_t); +#endif + template void launch_gptj_residual_add<__half>(__half*, __half*, __half*, @@ -404,6 +477,27 @@ template void launch_gptj_residual_add<__half>(__half*, int, int, cudaStream_t); + +#ifdef BF16_AVAILABLE +template __global__ void gptj_residual_add(__nv_bfloat16* residual, + const __nv_bfloat16* hidden_state, + const __nv_bfloat16* attn, + const __nv_bfloat16* bias, + const __nv_bfloat16* attn_bias, + const int total_count, + const int intermediate_size, + const float mp_scale); +#endif + 
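The templated gptj_residual_add above merges the attention and MLP branches that GPT-J style models compute in parallel: each element becomes attn + hidden_state + (residual + bias) * mp_scale, with attn_bias folded into the residual first when it is present (mp_scale is presumably the 1/model-parallel-size factor, so the bias is only counted once across ranks). A scalar reference version of that update, for illustration only:

#include <cstddef>

// Element-wise update performed by gptj_residual_add, written out in plain C++.
void gptj_residual_add_ref(float* residual,
                           const float* hidden_state,
                           const float* attn,
                           const float* bias,
                           const float* attn_bias,  // may be nullptr
                           size_t count,
                           size_t intermediate_size,
                           float mp_scale)
{
    for (size_t i = 0; i < count; ++i) {
        float res = residual[i];
        if (attn_bias) res += attn_bias[i % intermediate_size];
        residual[i] = attn[i] + hidden_state[i] + (res + bias[i % intermediate_size]) * mp_scale;
    }
}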
+template __global__ void gptj_residual_add(__half* residual, + const __half* hidden_state, + const __half* attn, + const __half* bias, + const __half* attn_bias, + const int total_count, + const int intermediate_size, + const float mp_scale); + template __global__ void moe_res_matmul(T* residual, T* coef, T* mlp_out, int seq_len, int hidden_dim) { @@ -454,6 +548,16 @@ template void launch_moe_res_matmul(float* residual, int seq_len, int hidden_dim, cudaStream_t stream); + +#ifdef BF16_AVAILABLE +template void launch_moe_res_matmul(__nv_bfloat16* residual, + __nv_bfloat16* coef, + __nv_bfloat16* mlp_out, + int seq_len, + int hidden_dim, + cudaStream_t stream); +#endif + template void launch_moe_res_matmul(__half* residual, __half* coef, __half* mlp_out, @@ -461,11 +565,11 @@ template void launch_moe_res_matmul(__half* residual, int hidden_dim, cudaStream_t stream); -__global__ void pad_data_kernel(__half* padded_output, - __half* output, - int head_size, - int padded_head_size) +template +__global__ void pad_data_kernel(T* padded_output, T* output, int head_size, int padded_head_size) { + using T2 = + typename std::conditional::value, __half2, __nv_bfloat162>::type; float4* padded_output_cast = reinterpret_cast(padded_output); float4* output_cast = reinterpret_cast(output); int bid = blockIdx.x * (blockDim.y) + threadIdx.y; @@ -473,8 +577,8 @@ __global__ void pad_data_kernel(__half* padded_output, padded_output_cast += (bid * padded_head_size); output_cast += (bid * head_size); float4 ZERO; - const __half2 zero_h = __float2half2_rn(0.f); - __half2* ZERO_h = reinterpret_cast<__half2*>(&ZERO); + const T2 zero_h = conversion::to(0.f); + T2* ZERO_h = reinterpret_cast(&ZERO); #pragma unroll for (int i = 0; i < 4; i++) ZERO_h[i] = zero_h; if (idx < head_size) @@ -482,12 +586,14 @@ __global__ void pad_data_kernel(__half* padded_output, else padded_output_cast[idx] = ZERO; } + __global__ void pad_data_kernel(float* padded_output, float* output, int head_size, int padded_head_size) { } + template void pad_data(T* padded_output, T* output, @@ -507,6 +613,16 @@ template void pad_data(__half* padded_output, int head_size, int padded_head_size, cudaStream_t stream); + +#ifdef BF16_AVAILABLE +template void pad_data(__nv_bfloat16* padded_output, + __nv_bfloat16* output, + int bsz, + int head_size, + int padded_head_size, + cudaStream_t stream); +#endif + template void pad_data(float* padded_output, float* output, int bsz, @@ -514,13 +630,28 @@ template void pad_data(float* padded_output, int padded_head_size, cudaStream_t stream); -__global__ void pad_head_seq_kernel(__half* padded_output, - __half* output, +#ifdef BF16_AVAILABLE +template __global__ void pad_data_kernel(__nv_bfloat16* padded_output, + __nv_bfloat16* output, + int head_size, + int padded_head_size); +#endif + +template __global__ void pad_data_kernel(__half* padded_output, + __half* output, + int head_size, + int padded_head_size); + +template +__global__ void pad_head_seq_kernel(T* padded_output, + T* output, int seq_len, int padded_seq_len, int head_size, int padded_head_size) { + using T2 = + typename std::conditional::value, __half2, __nv_bfloat162>::type; float4* padded_output_cast = reinterpret_cast(padded_output); float4* output_cast = reinterpret_cast(output); int bsz = blockIdx.x; @@ -529,8 +660,8 @@ __global__ void pad_head_seq_kernel(__half* padded_output, padded_output_cast += (bsz * padded_seq_len + bid) * padded_head_size; output_cast += (bsz * seq_len + bid) * head_size; float4 ZERO; - const __half2 zero_h = 
__float2half2_rn(0.f); - __half2* ZERO_h = reinterpret_cast<__half2*>(&ZERO); + const T2 zero_h = conversion::to(0.f); + T2* ZERO_h = reinterpret_cast(&ZERO); #pragma unroll for (int i = 0; i < 4; i++) ZERO_h[i] = zero_h; @@ -539,6 +670,7 @@ __global__ void pad_head_seq_kernel(__half* padded_output, else padded_output_cast[idx] = ZERO; } + __global__ void pad_head_seq_kernel(float* padded_output, float* output, int seq_len, @@ -547,6 +679,7 @@ __global__ void pad_head_seq_kernel(float* padded_output, int padded_head_size) { } + template void pad_head_seq(T* padded_output, T* output, @@ -562,6 +695,7 @@ void pad_head_seq(T* padded_output, pad_head_seq_kernel<<>>( padded_output, output, seq_len, padded_seq_len, head_size / 8, padded_head_size / 8); } + template void pad_head_seq(__half* padded_output, __half* output, int bsz, @@ -570,6 +704,18 @@ template void pad_head_seq(__half* padded_output, int head_size, int padded_head_size, cudaStream_t stream); + +#ifdef BF16_AVAILABLE +template void pad_head_seq(__nv_bfloat16* padded_output, + __nv_bfloat16* output, + int bsz, + int seq_len, + int padded_seq_len, + int head_size, + int padded_head_size, + cudaStream_t stream); +#endif + template void pad_head_seq(float* padded_output, float* output, int bsz, @@ -680,4 +826,12 @@ template void launch_fused_bias_geglu(__half*, int, int, cudaStream_t); +#ifdef BF16_AVAILABLE +template void launch_fused_bias_geglu(__nv_bfloat16*, + const __nv_bfloat16*, + const __nv_bfloat16*, + int, + int, + cudaStream_t); +#endif template void launch_fused_bias_geglu(float*, const float*, const float*, int, int, cudaStream_t); diff --git a/csrc/transformer/inference/csrc/layer_norm.cu b/csrc/transformer/inference/csrc/layer_norm.cu index 06078512cef5c808606144a36bfed7b41acc49bf..807d60e740df727c0fc656aa83f181b914b691a9 100644 --- a/csrc/transformer/inference/csrc/layer_norm.cu +++ b/csrc/transformer/inference/csrc/layer_norm.cu @@ -1,6 +1,7 @@ -/* -Copyright 2022 The Microsoft DeepSpeed Team -*/ +// Copyright (c) Microsoft Corporation. 
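pad_data_kernel and pad_head_seq_kernel above simply copy each head (or sequence position) into a wider, zero-initialized buffer so that head_size and sequence length meet an alignment requirement; the CUDA versions move float4 vectors of eight halves at a time. The same copy expressed scalar-wise in host C++ (a sketch; the names are not the kernels'):

#include <cstring>
#include <vector>

// Copy `rows` vectors of length head_size into a buffer whose row stride is
// padded_head_size, zero-filling the tail of every row - the host-side analogue
// of pad_data_kernel.
std::vector<float> pad_heads(const std::vector<float>& src, int rows, int head_size, int padded_head_size)
{
    std::vector<float> dst(static_cast<size_t>(rows) * padded_head_size, 0.0f);
    for (int r = 0; r < rows; ++r) {
        std::memcpy(dst.data() + static_cast<size_t>(r) * padded_head_size,
                    src.data() + static_cast<size_t>(r) * head_size,
                    sizeof(float) * head_size);
    }
    return dst;
}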
+// SPDX-License-Identifier: Apache-2.0 + +// DeepSpeed Team #include "conversion_utils.h" #include "ds_kernel_utils.h" @@ -196,6 +197,16 @@ template void launch_fused_ln(__half*, int, int, cudaStream_t); +#ifdef BF16_AVAILABLE +template void launch_fused_ln(__nv_bfloat16*, + const __nv_bfloat16*, + const __nv_bfloat16*, + const __nv_bfloat16*, + float, + int, + int, + cudaStream_t); +#endif template void launch_fused_ln(float*, const float*, const float*, const float*, float, int, int, cudaStream_t); @@ -492,6 +503,19 @@ template void launch_fused_residual_ln(__half*, int, cudaStream_t); +#ifdef BF16_AVAILABLE +template void launch_fused_residual_ln(__nv_bfloat16*, + const __nv_bfloat16*, + const __nv_bfloat16*, + const __nv_bfloat16*, + const __nv_bfloat16*, + const __nv_bfloat16*, + float, + int, + int, + cudaStream_t); +#endif + template void launch_fused_residual_ln(float*, const float*, const float*, @@ -516,6 +540,20 @@ template void launch_fused_residual_ln_store_pre_ln_res(__half*, int, cudaStream_t); +#ifdef BF16_AVAILABLE +template void launch_fused_residual_ln_store_pre_ln_res(__nv_bfloat16*, + __nv_bfloat16*, + const __nv_bfloat16*, + const __nv_bfloat16*, + const __nv_bfloat16*, + const __nv_bfloat16*, + const __nv_bfloat16*, + float, + int, + int, + cudaStream_t); +#endif + template void launch_fused_residual_ln_store_pre_ln_res(float*, float*, const float*, diff --git a/csrc/transformer/inference/csrc/normalize.cu b/csrc/transformer/inference/csrc/normalize.cu deleted file mode 100644 index 7f3cfc118631145cd30766cdf13d439a23c138c6..0000000000000000000000000000000000000000 --- a/csrc/transformer/inference/csrc/normalize.cu +++ /dev/null @@ -1,453 +0,0 @@ -#include -#include "custom_cuda_layers.h" - -#ifndef __HIP_PLATFORM_HCC__ -#include -#endif -#include -#include -#include - -#define NORM_REG (MAX_REGISTERS) - -namespace cg = cooperative_groups; - -__global__ void fused_bias_residual_layer_norm(float* output, - const float* vals, - const float* gamma, - const float* beta, - float epsilon, - int row_stride) -{ - int iteration_stride = blockDim.x; - int iterations = row_stride / iteration_stride; - - cg::thread_block b = cg::this_thread_block(); - cg::thread_block_tile<32> g = cg::tiled_partition<32>(b); - - int row = blockIdx.x; - int id = threadIdx.x; - int gid = id >> 5; - int warp_num = iteration_stride >> 5; - - float inp_reg[NORM_REG]; - - int k = 0; - float sum = 0; - int input_id = id; - while (input_id < row_stride) { - inp_reg[k] = vals[input_id + row * row_stride]; - sum += inp_reg[k++]; - input_id += iteration_stride; - } - - for (int i = 1; i < 32; i *= 2) sum += g.shfl_down(sum, i); - - __shared__ float shr[MAX_WARP_NUM]; - - if (g.thread_rank() == 0) shr[gid] = sum; - b.sync(); - - if (g.thread_rank() < (warp_num)) sum = shr[g.thread_rank()]; - b.sync(); - - for (int i = 1; i < (warp_num); i *= 2) sum += g.shfl_down(sum, i); - sum = g.shfl(sum, 0); - - float mean = sum / (row_stride); - sum = 0.f; - for (int f = 0; f < k; f++) { - inp_reg[f] -= mean; - sum += inp_reg[f] * inp_reg[f]; - } - for (int i = 1; i < 32; i *= 2) sum += g.shfl_down(sum, i); - if (g.thread_rank() == 0) shr[gid] = sum; - b.sync(); - - if (g.thread_rank() < (warp_num)) sum = shr[g.thread_rank()]; - b.sync(); - - for (int i = 1; i < (warp_num); i *= 2) sum += g.shfl_down(sum, i); - sum = g.shfl(sum, 0); - sum /= (row_stride); - sum += epsilon; - sum = __frsqrt_rn(sum); - for (int f = 0; f < k; f++) { - int out_id = f * iteration_stride + id; - inp_reg[f] = inp_reg[f] * sum; - inp_reg[f] = 
inp_reg[f] * gamma[out_id] + beta[out_id]; - output[out_id + row * row_stride] = inp_reg[f]; - } -} - -__global__ void fused_bias_residual_layer_norm(__half* output, - const __half* vals, - const __half* gamma, - const __half* beta, - float epsilon, - int row_stride) -{ -#ifdef HALF_PRECISION_AVAILABLE - int iteration_stride = blockDim.x; - int iterations = row_stride / iteration_stride; - - cg::thread_block b = cg::this_thread_block(); - cg::thread_block_tile<32> g = cg::tiled_partition<32>(b); - - int row = blockIdx.x; - int id = threadIdx.x; - int gid = id >> 5; - int warp_num = iteration_stride >> 5; - - __half2 inp_reg[NORM_REG]; - - const __half2* vals_cast = reinterpret_cast(vals); - __half2* out_cast = reinterpret_cast<__half2*>(output); - - int k = 0; - int input_id = id; - while (input_id < row_stride) { - inp_reg[k++] = vals_cast[input_id + row * row_stride]; - input_id += iteration_stride; - } - float sum = 0; - for (int f = k - 1; f >= 0; f--) { - float2 inp_f = __half22float2(inp_reg[f]); - sum += inp_f.x + inp_f.y; - } - for (int i = 1; i < 32; i *= 2) sum += g.shfl_down(sum, i); - __shared__ float shr[MAX_WARP_NUM]; - if (g.thread_rank() == 0) shr[gid] = sum; - b.sync(); - if (g.thread_rank() < (warp_num)) sum = shr[g.thread_rank()]; - b.sync(); - for (int i = 1; i < (warp_num); i *= 2) sum += g.shfl_down(sum, i); - sum = g.shfl(sum, 0); - float mean = sum / (row_stride << 1); - sum = 0.f; - for (int f = 0; f < k; f++) { - float2 inp_f = __half22float2(inp_reg[f]); - inp_f.x -= mean; - inp_f.y -= mean; - inp_reg[f] = __float22half2_rn(inp_f); - sum += inp_f.x * inp_f.x; - sum += inp_f.y * inp_f.y; - } - for (int i = 1; i < 32; i *= 2) sum += g.shfl_down(sum, i); - if (g.thread_rank() == 0) shr[gid] = sum; - b.sync(); - if (g.thread_rank() < (warp_num)) sum = shr[g.thread_rank()]; - b.sync(); - for (int i = 1; i < (warp_num); i *= 2) sum += g.shfl_down(sum, i); - sum = g.shfl(sum, 0); - sum /= (row_stride << 1); - sum += epsilon; - sum = __frsqrt_rn(sum); - __half2 variance_h = __float2half2_rn(sum); - const __half2* gamma_cast = reinterpret_cast(gamma); - const __half2* beta_cast = reinterpret_cast(beta); - for (int f = 0; f < k; f++) { - int out_id = f * iteration_stride + id; - inp_reg[f] = inp_reg[f] * variance_h; - inp_reg[f] = inp_reg[f] * gamma_cast[out_id] + beta_cast[out_id]; - out_cast[out_id + row * row_stride] = inp_reg[f]; - } -#endif -} - -template -void launch_layer_norm(T* out, - T* vals, - const T* gamma, - const T* beta, - float epsilon, - int batch_size, - int hidden_dim, - cudaStream_t stream); - -template <> -void launch_layer_norm(float* out, - float* vals, - const float* gamma, - const float* beta, - float epsilon, - int batch_size, - int hidden_dim, - cudaStream_t stream) -{ - constexpr int threads = 1024; - - dim3 grid_dim(batch_size); - - dim3 block_dim(threads); - - fused_bias_residual_layer_norm<<>>( - out, vals, gamma, beta, epsilon, hidden_dim); -} - -template <> -void launch_layer_norm<__half>(__half* out, - __half* vals, - const __half* gamma, - const __half* beta, - float epsilon, - int batch_size, - int hidden_dim, - cudaStream_t stream) -{ - constexpr int threads = 1024; - - dim3 grid_dim(batch_size); - dim3 block_dim(threads); - - fused_bias_residual_layer_norm<<>>( - out, vals, gamma, beta, epsilon, hidden_dim / 2); -} - -__global__ void fused_residual_layer_norm(float* norm, - float* res_add, - float* vals, - float* residual, - const float* bias, - const float* gamma, - const float* beta, - float epsilon, - int row_stride, - bool preLN, 
- bool mlp_after_attn) -{ - int iteration_stride = blockDim.x; - - cg::thread_block b = cg::this_thread_block(); - cg::thread_block_tile<32> g = cg::tiled_partition<32>(b); - - int row = blockIdx.x; - int id = threadIdx.x; - int gid = id >> 5; - int warp_num = iteration_stride >> 5; - - float inp_reg[NORM_REG]; - - int k = 0; - int input_id = id; - - float sum = 0; - while (input_id < row_stride) { - inp_reg[k] = vals[input_id + row * row_stride]; - float res_f = (residual[input_id + row * row_stride]); - float bias_f = (bias[input_id]); - if (mlp_after_attn) inp_reg[k] += res_f + bias_f; - // if (preLN) res_add[input_id + row * row_stride] = inp_reg[k]; - sum += inp_reg[k++]; - input_id += iteration_stride; - } - for (int i = 1; i < 32; i *= 2) sum += g.shfl_down(sum, i); - - __shared__ float shr[MAX_WARP_NUM]; - if (g.thread_rank() == 0) shr[gid] = sum; - b.sync(); - - if (g.thread_rank() < (warp_num)) sum = shr[g.thread_rank()]; - b.sync(); - - for (int i = 1; i < (warp_num); i *= 2) sum += g.shfl_down(sum, i); - sum = g.shfl(sum, 0); - float mean = sum / (row_stride); - sum = 0.f; - for (int f = 0; f < k; f++) { - inp_reg[f] -= mean; - sum += inp_reg[f] * inp_reg[f]; - } - for (int i = 1; i < 32; i *= 2) sum += g.shfl_down(sum, i); - if (g.thread_rank() == 0) shr[gid] = sum; - b.sync(); - - if (g.thread_rank() < (warp_num)) sum = shr[g.thread_rank()]; - b.sync(); - - for (int i = 1; i < (warp_num); i *= 2) sum += g.shfl_down(sum, i); - sum = g.shfl(sum, 0); - sum /= (row_stride); - sum += epsilon; - sum = __frsqrt_rn(sum); - - for (int f = 0; f < k; f++) { - int out_id = f * iteration_stride + id; - inp_reg[f] = inp_reg[f] * sum; - inp_reg[f] = inp_reg[f] * gamma[out_id] + beta[out_id]; - norm[out_id + row * row_stride] = inp_reg[f]; - } -} - -__global__ void fused_residual_layer_norm(__half* norm, - __half* res_add, - __half* vals, - __half* residual, - const __half* bias, - const __half* gamma, - const __half* beta, - float epsilon, - int row_stride, - bool preLN, - bool mlp_after_attn) -{ -#ifdef HALF_PRECISION_AVAILABLE - int iteration_stride = blockDim.x; - - cg::thread_block b = cg::this_thread_block(); - cg::thread_block_tile<32> g = cg::tiled_partition<32>(b); - - int row = blockIdx.x; - int id = threadIdx.x; - int gid = id >> 5; - int warp_num = iteration_stride >> 5; - - __half2 inp_reg[NORM_REG]; - - __half2* vals_cast = reinterpret_cast<__half2*>(vals); - __half2* norm_cast = reinterpret_cast<__half2*>(norm); - __half2* res_add_cast = reinterpret_cast<__half2*>(res_add); - __half2* residual_cast = reinterpret_cast<__half2*>(residual); - const __half2* bias_cast = reinterpret_cast(bias); - - int k = 0; - int input_id = id; - - float sum = 0; - while (input_id < row_stride) { - inp_reg[k] = vals_cast[input_id + row * row_stride]; - float2 inp_f = __half22float2(inp_reg[k]); - float2 res_f = __half22float2(residual_cast[input_id + row * row_stride]); - float2 bias_f = __half22float2(bias_cast[input_id]); - if (mlp_after_attn) { - inp_f.x += res_f.x + bias_f.x; - inp_f.y += res_f.y + bias_f.y; - } - inp_reg[k] = __float22half2_rn(inp_f); - // if (preLN) res_add_cast[input_id + row * row_stride] = __float22half2_rn(res_f); - // //inp_reg[k]; - sum += inp_f.x + inp_f.y; - input_id += iteration_stride; - k++; - } - for (int i = 1; i < 32; i *= 2) sum += g.shfl_down(sum, i); - __shared__ float shr[MAX_WARP_NUM]; - if (g.thread_rank() == 0) shr[gid] = sum; - b.sync(); - if (g.thread_rank() < (warp_num)) sum = shr[g.thread_rank()]; - b.sync(); - for (int i = 1; i < (warp_num); i *= 
2) sum += g.shfl_down(sum, i); - sum = g.shfl(sum, 0); - float mean = sum / (row_stride << 1); - sum = 0.f; - for (int f = 0; f < k; f++) { - float2 inp_f = __half22float2(inp_reg[f]); - inp_f.x -= mean; - inp_f.y -= mean; - inp_reg[f] = __float22half2_rn(inp_f); - sum += inp_f.x * inp_f.x; - sum += inp_f.y * inp_f.y; - } - for (int i = 1; i < 32; i *= 2) sum += g.shfl_down(sum, i); - if (g.thread_rank() == 0) shr[gid] = sum; - b.sync(); - if (g.thread_rank() < (warp_num)) sum = shr[g.thread_rank()]; - b.sync(); - for (int i = 1; i < (warp_num); i *= 2) sum += g.shfl_down(sum, i); - sum = g.shfl(sum, 0); - sum /= (row_stride << 1); - sum += epsilon; - sum = __frsqrt_rn(sum); - __half2 variance_h = __float2half2_rn(sum); - const __half2* gamma_cast = reinterpret_cast(gamma); - const __half2* beta_cast = reinterpret_cast(beta); - for (int f = 0; f < k; f++) { - int out_id = f * iteration_stride + id; - inp_reg[f] = inp_reg[f] * variance_h; - inp_reg[f] = inp_reg[f] * gamma_cast[out_id] + beta_cast[out_id]; - norm_cast[out_id + row * row_stride] = inp_reg[f]; - } -#endif -} - -template -void launch_residual_layer_norm(T* norm, - T* res_add, - T* vals, - T* residual, - const T* bias, - const T* gamma, - const T* beta, - float epsilon, - int batch_size, - int hidden_dim, - bool preLN, - bool mlp_after_attn, - cudaStream_t stream); - -template <> -void launch_residual_layer_norm(float* norm, - float* res_add, - float* vals, - float* residual, - const float* bias, - const float* gamma, - const float* beta, - float epsilon, - int batch_size, - int hidden_dim, - bool preLN, - bool mlp_after_attn, - cudaStream_t stream) -{ - constexpr int threads = 1024; - - dim3 grid_dim(batch_size); - - dim3 block_dim(threads); - - fused_residual_layer_norm<<>>(norm, - res_add, - vals, - residual, - bias, - gamma, - beta, - epsilon, - hidden_dim, - preLN, - mlp_after_attn); -} - -template <> -void launch_residual_layer_norm<__half>(__half* norm, - __half* res_add, - __half* vals, - __half* residual, - const __half* bias, - const __half* gamma, - const __half* beta, - float epsilon, - int batch_size, - int hidden_dim, - bool preLN, - bool mlp_after_attn, - cudaStream_t stream) -{ - constexpr int threads = 1024; - - dim3 grid_dim(batch_size); - dim3 block_dim(threads); - - fused_residual_layer_norm<<>>(norm, - res_add, - vals, - residual, - bias, - gamma, - beta, - epsilon, - hidden_dim / 2, - preLN, - mlp_after_attn); -} diff --git a/csrc/transformer/inference/csrc/pt_binding.cpp b/csrc/transformer/inference/csrc/pt_binding.cpp index 1d6857042ca158821e67267517a05dac35f3ef1c..eb9bcf0e4341a0de45d228d025a7a23ecb2ec557 100644 --- a/csrc/transformer/inference/csrc/pt_binding.cpp +++ b/csrc/transformer/inference/csrc/pt_binding.cpp @@ -1,6 +1,7 @@ -/* -Copyright 2022 The Microsoft DeepSpeed Team -*/ +// Copyright (c) Microsoft Corporation. 
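The normalize.cu file deleted above implemented fused LayerNorm with the classic two-pass scheme: a block-wide reduction for the mean, a second reduction for the variance, then y = (x - mean) * rsqrt(var + eps) * gamma + beta (the fused_residual variants first add residual and bias when mlp_after_attn is set). A reference C++ version of the normalization itself for one row, as an illustration of the math the warp/block shuffles compute:

#include <cmath>
#include <vector>

// Single-row LayerNorm matching the arithmetic of the removed
// fused_bias_residual_layer_norm kernels (pass 1: mean, pass 2: variance).
void layer_norm_row(const std::vector<float>& x,
                    const std::vector<float>& gamma,
                    const std::vector<float>& beta,
                    float epsilon,
                    std::vector<float>& out)
{
    const size_t n = x.size();
    float mean = 0.f;
    for (float v : x) mean += v;
    mean /= float(n);

    float var = 0.f;
    for (float v : x) var += (v - mean) * (v - mean);
    var /= float(n);

    const float rstd = 1.0f / std::sqrt(var + epsilon);
    out.resize(n);
    for (size_t i = 0; i < n; ++i) out[i] = (x[i] - mean) * rstd * gamma[i] + beta[i];
}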
+// SPDX-License-Identifier: Apache-2.0 + +// DeepSpeed Team #include #include @@ -95,7 +96,7 @@ at::Tensor ds_softmax(at::Tensor& attn_scores, head_offset, mask_stride, mp_size, - Context::Instance().GetCurrentStream(async_op)); + InferenceContext::Instance().GetCurrentStream(async_op)); return attn_scores_c; } @@ -109,18 +110,20 @@ void allocate_workspace(unsigned hidden_dim, unsigned mp_size = 1, bool external_cache = false, unsigned rank = 0, - unsigned max_out_tokens = 1024) + unsigned max_out_tokens = 1024, + unsigned min_out_tokens = 1) { - Context::Instance().GenWorkSpace(num_layers, - num_heads, - batch_size, - prompt_length, - hidden_dim, - mp_size, - external_cache, - sizeof(T), - rank, - max_out_tokens); + InferenceContext::Instance().GenWorkSpace(num_layers, + num_heads, + batch_size, + prompt_length, + hidden_dim, + mp_size, + external_cache, + sizeof(T), + rank, + max_out_tokens, + min_out_tokens); } template @@ -131,15 +134,15 @@ at::Tensor einsum_sec_sm_ecm(at::Tensor& Q, at::Tensor& W) .layout(at::kStrided) .device(at::kCUDA) .requires_grad(false); - T* workspace = (T*)Context::Instance().GetWorkSpace(); + T* workspace = (T*)InferenceContext::Instance().GetWorkSpace(); float alpha = 1; float gemm_beta = 0.0; /* // Reallocate memory if we received a new prompt if (!workspace || input.size(1) != 1) { - allocate_workspace(W.size(1), Context::Instance().GetMaxTokenLenght(), Q.size(0), 1, - head_size); workspace = (T*)Context::Instance().GetWorkSpace(); + allocate_workspace(W.size(1), InferenceContext::Instance().GetMaxTokenLenght(), + Q.size(0), 1, head_size); workspace = (T*)InferenceContext::Instance().GetWorkSpace(); } */ @@ -147,7 +150,7 @@ at::Tensor einsum_sec_sm_ecm(at::Tensor& Q, at::Tensor& W) unsigned m = W.size(1); unsigned n = Q.size(1) * Q.size(2); unsigned k = Q.size(0); - cublas_gemm_ex(Context::Instance().GetCublasHandle(), + cublas_gemm_ex(InferenceContext::Instance().GetCublasHandle(), CUBLAS_OP_N, CUBLAS_OP_T, m, @@ -194,8 +197,9 @@ void attention_unfused(at::Tensor& prev_key_cont, auto mask_stride = get_attn_mask_stride(attn_mask); - cublasSetStream(Context::Instance().GetCublasHandle(), Context::Instance().GetCurrentStream()); - cublas_strided_batched_gemm(Context::Instance().GetCublasHandle(), + cublasSetStream(InferenceContext::Instance().GetCublasHandle(), + InferenceContext::Instance().GetCurrentStream()); + cublas_strided_batched_gemm(InferenceContext::Instance().GetCublasHandle(), soft_len, seq_len, k, @@ -230,9 +234,9 @@ void attention_unfused(at::Tensor& prev_key_cont, 0, mask_stride, 1, - Context::Instance().GetCurrentStream(false)); + InferenceContext::Instance().GetCurrentStream(false)); alpha = 1.0; - cublas_strided_batched_gemm(Context::Instance().GetCublasHandle(), + cublas_strided_batched_gemm(InferenceContext::Instance().GetCublasHandle(), k, seq_len, soft_len, @@ -363,10 +367,11 @@ void attention_unfused(T* prev_key_cont, float layer_scale = alibi.sizes().size() > 1 ? 
std::max(1, layer_id) : 1.0; float alpha = norm_factor * norm_factor / layer_scale; float gemm_beta = 0.0; - T* workspace = (T*)Context::Instance().GetAttentionUnfusedWorkspace(); + T* workspace = (T*)InferenceContext::Instance().GetAttentionUnfusedWorkspace(); - cublasSetStream(Context::Instance().GetCublasHandle(), Context::Instance().GetCurrentStream()); - cublas_strided_batched_gemm(Context::Instance().GetCublasHandle(), + cublasSetStream(InferenceContext::Instance().GetCublasHandle(), + InferenceContext::Instance().GetCurrentStream()); + cublas_strided_batched_gemm(InferenceContext::Instance().GetCublasHandle(), soft_len, seq_len, k, @@ -377,7 +382,7 @@ void attention_unfused(T* prev_key_cont, workspace, CUBLAS_OP_T, CUBLAS_OP_N, - Context::Instance().GetMaxTokenLenght() * k, + InferenceContext::Instance().GetMaxTokenLenght() * k, seq_len * k, seq_len * soft_len, bsz * heads, @@ -399,7 +404,7 @@ void attention_unfused(T* prev_key_cont, soft_len, heads); alpha = 1.0; - cublas_strided_batched_gemm(Context::Instance().GetCublasHandle(), + cublas_strided_batched_gemm(InferenceContext::Instance().GetCublasHandle(), k, seq_len, soft_len, @@ -410,7 +415,7 @@ void attention_unfused(T* prev_key_cont, (T*)output, CUBLAS_OP_N, CUBLAS_OP_N, - Context::Instance().GetMaxTokenLenght() * k, + InferenceContext::Instance().GetMaxTokenLenght() * k, seq_len * soft_len, seq_len * k, bsz * heads, @@ -421,7 +426,7 @@ void attention_unfused(T* prev_key_cont, #endif } -void reset_cache() { Context::Instance().reset_tokens(); } +void reset_cache() { InferenceContext::Instance().reset_tokens(); } template std::vector ds_softmax_context(at::Tensor& query_key_value, @@ -445,8 +450,8 @@ std::vector ds_softmax_context(at::Tensor& query_key_value, bool is_prompt = (seq_len > 1); - if (is_prompt) Context::Instance().reset_tokens(seq_len); - unsigned soft_len = Context::Instance().current_tokens(); + if (is_prompt) InferenceContext::Instance().reset_tokens(seq_len); + unsigned soft_len = InferenceContext::Instance().current_tokens(); int k = hidden_dim / heads; auto options = at::TensorOptions() @@ -455,16 +460,17 @@ std::vector ds_softmax_context(at::Tensor& query_key_value, .device(at::kCUDA) .requires_grad(false); - T* workspace = (T*)Context::Instance().GetWorkSpace(); + T* workspace = (T*)InferenceContext::Instance().GetWorkSpace(); size_t buf_size = bsz * seq_len * hidden_dim; auto output = torch::from_blob(workspace + 4 * buf_size, {bsz, seq_len, hidden_dim}, options); - auto query_cont = workspace + 8 * buf_size; - size_t offset = 16 * (hidden_dim * bsz * Context::Instance().GetMaxTokenLenght()) + - layer_id * 2 * bsz * Context::Instance().GetMaxTokenLenght() * hidden_dim; + auto query_cont = workspace + 5 * buf_size; + size_t offset = + 10 * (hidden_dim * bsz * InferenceContext::Instance().GetMaxTokenLenght()) + + layer_id * 2 * bsz * InferenceContext::Instance().GetMaxTokenLenght() * hidden_dim; unsigned all_tokens = soft_len; auto kv_cache = workspace + offset + (hidden_dim / heads) * (is_prompt ? 
0 : soft_len - 1); - size_t value_offset = bsz * Context::Instance().GetMaxTokenLenght() * hidden_dim; + size_t value_offset = bsz * InferenceContext::Instance().GetMaxTokenLenght() * hidden_dim; T* temp_buf = (T*)output.data_ptr() + at::numel(output); launch_bias_add_transform_0213((T*)query_cont, @@ -481,9 +487,9 @@ std::vector ds_softmax_context(at::Tensor& query_key_value, rotary_dim, rotate_half, rotate_every_two, - Context::Instance().GetCurrentStream(), + InferenceContext::Instance().GetCurrentStream(), 3, - Context::Instance().GetMaxTokenLenght()); + InferenceContext::Instance().GetMaxTokenLenght()); if (rotary_dim > 0 && rotate_half) launch_apply_rotary_pos_emb(query_cont, kv_cache, @@ -495,8 +501,8 @@ std::vector ds_softmax_context(at::Tensor& query_key_value, bsz, rotate_half, rotate_every_two, - Context::Instance().GetCurrentStream(), - Context::Instance().GetMaxTokenLenght()); + InferenceContext::Instance().GetCurrentStream(), + InferenceContext::Instance().GetMaxTokenLenght()); attention_unfused(workspace + offset, (T*)query_cont, @@ -521,13 +527,27 @@ std::vector ds_softmax_context(at::Tensor& query_key_value, heads, seq_len, output.size(2), - Context::Instance().GetCurrentStream(false), + InferenceContext::Instance().GetCurrentStream(false), 1); - if (layer_id == num_layers - 1) Context::Instance().advance_tokens(); - auto prev_key = torch::from_blob(workspace + offset, {bsz, heads, all_tokens, k}, options); + if (layer_id == num_layers - 1) InferenceContext::Instance().advance_tokens(); + auto prev_key = torch::from_blob(workspace + offset, + {bsz, heads, all_tokens, k}, + {hidden_dim * InferenceContext::Instance().GetMaxTokenLenght(), + k * InferenceContext::Instance().GetMaxTokenLenght(), + k, + 1}, + options); + auto prev_value = - torch::from_blob(workspace + offset + value_offset, {bsz, heads, all_tokens, k}, options); + torch::from_blob(workspace + offset + value_offset, + {bsz, heads, all_tokens, k}, + {hidden_dim * InferenceContext::Instance().GetMaxTokenLenght(), + k * InferenceContext::Instance().GetMaxTokenLenght(), + k, + 1}, + options); + return {output, prev_key, prev_value}; } @@ -543,7 +563,7 @@ at::Tensor ds_bias_gelu(at::Tensor& input, at::Tensor& bias) (T*)bias.data_ptr(), intermediate_size, bsz, - Context::Instance().GetCurrentStream()); + InferenceContext::Instance().GetCurrentStream()); return input_cont; } @@ -569,14 +589,14 @@ at::Tensor ds_bias_geglu(at::Tensor& activation, at::Tensor& bias) (const float*)bias.data_ptr(), rows, channels, - Context::Instance().GetCurrentStream()); + InferenceContext::Instance().GetCurrentStream()); } else { launch_fused_bias_geglu((__half*)output.data_ptr(), (const __half*)activation.data_ptr(), (const __half*)bias.data_ptr(), rows, channels, - Context::Instance().GetCurrentStream()); + InferenceContext::Instance().GetCurrentStream()); } return output; @@ -594,7 +614,7 @@ at::Tensor ds_bias_relu(at::Tensor& input, at::Tensor& bias) (T*)bias.data_ptr(), intermediate_size, bsz, - Context::Instance().GetCurrentStream()); + InferenceContext::Instance().GetCurrentStream()); return input_cont; } @@ -610,7 +630,7 @@ at::Tensor ds_bias_add(at::Tensor& input, at::Tensor& bias) (T*)bias.data_ptr(), hidden_size, bsz, - Context::Instance().GetCurrentStream()); + InferenceContext::Instance().GetCurrentStream()); return input_cont; } @@ -627,7 +647,7 @@ at::Tensor ds_bias_residual(at::Tensor& input, at::Tensor& residual, at::Tensor& // bsz, // input_cont.size(2), // (bias.size(0) > 1), - // 
Context::Instance().GetCurrentStream()); + // InferenceContext::Instance().GetCurrentStream()); return input_cont; } @@ -645,7 +665,7 @@ at::Tensor ds_layer_norm(at::Tensor& input, at::Tensor& gamma, at::Tensor& beta, epsilon, rows, elems_per_row, - Context::Instance().GetCurrentStream()); + InferenceContext::Instance().GetCurrentStream()); } else { launch_fused_ln((float*)output.data_ptr(), (const float*)input.data_ptr(), @@ -654,7 +674,7 @@ at::Tensor ds_layer_norm(at::Tensor& input, at::Tensor& gamma, at::Tensor& beta, epsilon, rows, elems_per_row, - Context::Instance().GetCurrentStream()); + InferenceContext::Instance().GetCurrentStream()); } return output; @@ -675,7 +695,7 @@ void ds_layer_norm_internal(T* workspace, epsilon, bsz, input.size(2), - Context::Instance().GetCurrentStream()); + InferenceContext::Instance().GetCurrentStream()); } /* Currently only used in unit testing */ @@ -700,7 +720,7 @@ at::Tensor ds_layer_norm_residual(at::Tensor& input, epsilon, rows, elems_per_row, - Context::Instance().GetCurrentStream()); + InferenceContext::Instance().GetCurrentStream()); } else { launch_fused_residual_ln((float*)output.data_ptr(), (const float*)input.data_ptr(), @@ -711,7 +731,7 @@ at::Tensor ds_layer_norm_residual(at::Tensor& input, epsilon, rows, elems_per_row, - Context::Instance().GetCurrentStream()); + InferenceContext::Instance().GetCurrentStream()); } return output; @@ -741,7 +761,7 @@ std::vector ds_layer_norm_residual_store_pre_ln_res(at::Tensor& inpu epsilon, rows, elems_per_row, - Context::Instance().GetCurrentStream()); + InferenceContext::Instance().GetCurrentStream()); } else { launch_fused_residual_ln_store_pre_ln_res((float*)norm_output.data_ptr(), (float*)res_output.data_ptr(), @@ -753,7 +773,7 @@ std::vector ds_layer_norm_residual_store_pre_ln_res(at::Tensor& inpu epsilon, rows, elems_per_row, - Context::Instance().GetCurrentStream()); + InferenceContext::Instance().GetCurrentStream()); } return {norm_output, res_output}; @@ -768,7 +788,7 @@ void quantized_gemm(void* output, int bsz, int hidden_size) { - // T* weight16 = (T*)Context::Instance().GetWorkSpace() + 12 * hidden_size * bsz; + // T* weight16 = (T*)InferenceContext::Instance().GetWorkSpace() + 12 * hidden_size * bsz; auto options = at::TensorOptions() .dtype(at::kHalf) @@ -783,11 +803,11 @@ void quantized_gemm(void* output, weight.size(0), weight.size(1), groups, - Context::Instance().GetCurrentStream()); + InferenceContext::Instance().GetCurrentStream()); float alpha = (T)1.0; float gemm_beta = (T)0.0; - cublas_gemm_ex(Context::Instance().GetCublasHandle(), + cublas_gemm_ex(InferenceContext::Instance().GetCublasHandle(), CUBLAS_OP_T, CUBLAS_OP_N, weight.size(0), @@ -815,10 +835,11 @@ at::Tensor qkv_unfused_cublas(at::Tensor& output, at::Tensor& beta, const float epsilon, bool add_bias, - bool q_int8) + bool q_int8, + bool transposed_mode) { int bsz = input.size(0) * input.size(1); - T* workspace = (T*)Context::Instance().GetWorkSpace(); + T* workspace = (T*)InferenceContext::Instance().GetWorkSpace(); workspace += (3 * bsz * input.size(2)); ds_layer_norm_internal(workspace, input, gamma, beta, epsilon); @@ -829,12 +850,12 @@ at::Tensor qkv_unfused_cublas(at::Tensor& output, float alpha = (T)1.0; float gemm_beta = (T)0.0; - cublasSetStream(Context::Instance().GetCublasHandle(), - Context::Instance().GetCurrentStream()); - cublas_gemm_ex(Context::Instance().GetCublasHandle(), - CUBLAS_OP_N, + cublasSetStream(InferenceContext::Instance().GetCublasHandle(), + 
InferenceContext::Instance().GetCurrentStream()); + cublas_gemm_ex(InferenceContext::Instance().GetCublasHandle(), + (transposed_mode ? CUBLAS_OP_T : CUBLAS_OP_N), CUBLAS_OP_N, - weight.size(1), + weight.size(transposed_mode ? 0 : 1), bsz, input.size(2), &alpha, @@ -851,9 +872,9 @@ at::Tensor qkv_unfused_cublas(at::Tensor& output, if (add_bias) launch_bias_add((T*)output.data_ptr(), (T*)bias.data_ptr(), - q_int8 ? weight.size(0) : weight.size(1), + (transposed_mode || q_int8) ? weight.size(0) : weight.size(1), bsz, - Context::Instance().GetCurrentStream()); + InferenceContext::Instance().GetCurrentStream()); return torch::from_blob(workspace, input.sizes(), input.options()); } @@ -870,11 +891,12 @@ std::vector ds_qkv_gemm(at::Tensor& input, bool external_cache, unsigned mp_size, unsigned rank, - bool q_int8) + bool q_int8, + bool transposed_mode) { int bsz = input.size(0) * input.size(1); - T* workspace = (T*)Context::Instance().GetWorkSpace(); - int out_size = q_int8 ? weight.size(0) : weight.size(1); + T* workspace = (T*)InferenceContext::Instance().GetWorkSpace(); + int out_size = (transposed_mode || q_int8) ? weight.size(0) : weight.size(1); auto options = at::TensorOptions() .dtype(input.options().dtype()) @@ -883,8 +905,17 @@ std::vector ds_qkv_gemm(at::Tensor& input, .requires_grad(false); auto output = at::from_blob(workspace, {input.size(0), input.size(1), out_size}, options); - auto inp_norm = qkv_unfused_cublas( - output, input, weight, q_scale, bias, gamma, beta, epsilon, add_bias, q_int8); + auto inp_norm = qkv_unfused_cublas(output, + input, + weight, + q_scale, + bias, + gamma, + beta, + epsilon, + add_bias, + q_int8, + transposed_mode); return {output, inp_norm}; } @@ -912,11 +943,11 @@ void quantized_gemm(at::Tensor& output, weight.size(1), groups, merge_count, - Context::Instance().GetCurrentStream()); + InferenceContext::Instance().GetCurrentStream()); float alpha = (T)1.0; float gemm_beta = (T)0.0; - cublas_gemm_ex(Context::Instance().GetCublasHandle(), + cublas_gemm_ex(InferenceContext::Instance().GetCublasHandle(), CUBLAS_OP_T, CUBLAS_OP_N, weight.size(0), @@ -963,7 +994,7 @@ at::Tensor ds_qkv_gemm_int8(at::Tensor& input, (T*)bias.data_ptr(), weight.size(1), bsz, - Context::Instance().GetCurrentStream()); + InferenceContext::Instance().GetCurrentStream()); return output; } @@ -974,7 +1005,8 @@ at::Tensor ds_linear_layer(at::Tensor& input, at::Tensor& bias, bool add_bias, bool do_flash_attn, - int num_heads) + int num_heads, + bool transposed_mode) { auto input_cont = input.contiguous(); auto options = at::TensorOptions() @@ -985,17 +1017,18 @@ at::Tensor ds_linear_layer(at::Tensor& input, int head_size = input_cont.size(2) / num_heads; int bsz = input.size(0) * input.size(1); - T* workspace = (T*)Context::Instance().GetWorkSpace(); + T* workspace = (T*)InferenceContext::Instance().GetWorkSpace(); auto output = at::from_blob(workspace, {input.size(0), input.size(1), weight.size(1)}, options); float alpha = (T)1.0; float gemm_beta = (T)0.0; - cublasSetStream(Context::Instance().GetCublasHandle(), Context::Instance().GetCurrentStream()); + cublasSetStream(InferenceContext::Instance().GetCublasHandle(), + InferenceContext::Instance().GetCurrentStream()); - cublas_gemm_ex(Context::Instance().GetCublasHandle(), - CUBLAS_OP_N, + cublas_gemm_ex(InferenceContext::Instance().GetCublasHandle(), + (transposed_mode ? CUBLAS_OP_T : CUBLAS_OP_N), CUBLAS_OP_N, - weight.size(1), + weight.size(transposed_mode ? 
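// [Editor's note, not part of the original diff] A hedged sketch of the dimension logic
// that the new `transposed_mode` flag selects here and at the other GEMM call sites this
// patch touches; the concrete shapes are illustrative assumptions, not values from the source:
//
//   weight stored [in, out]  (transposed_mode == false): leading op = CUBLAS_OP_N, out_dim = weight.size(1)
//   weight stored [out, in]  (transposed_mode == true):  leading op = CUBLAS_OP_T, out_dim = weight.size(0)
//
// e.g. a hypothetical projection with in = 4096, out = 12288:
//   false: weight is [4096, 12288] -> out_dim = weight.size(1) = 12288
//   true:  weight is [12288, 4096] -> out_dim = weight.size(0) = 12288
// Either layout yields the same [bsz, out_dim] result; only the leading cuBLAS op and the
// size() index that supplies the output width (and, later, the bias-add width) change,
// which is what the recurring `(transposed_mode ? ... : ...)` edits in these hunks express.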
0 : 1), bsz, input_cont.size(2), &alpha, @@ -1011,9 +1044,9 @@ at::Tensor ds_linear_layer(at::Tensor& input, if (add_bias) launch_bias_add((T*)output.data_ptr(), (T*)bias.data_ptr(), - weight.size(1), + weight.size(transposed_mode ? 0 : 1), bsz, - Context::Instance().GetCurrentStream()); + InferenceContext::Instance().GetCurrentStream()); bool add_padding = (head_size % 32 != 0 && head_size < 64) || (head_size % 64 != 0); if (do_flash_attn) { if (add_padding) { @@ -1026,7 +1059,7 @@ at::Tensor ds_linear_layer(at::Tensor& input, 3 * bsz * num_heads, head_size, padded_head_size, - Context::Instance().GetCurrentStream()); + InferenceContext::Instance().GetCurrentStream()); launch_bias_add_transform_0213( final_output, @@ -1043,7 +1076,7 @@ at::Tensor ds_linear_layer(at::Tensor& input, -1, false, false, - Context::Instance().GetCurrentStream(), + InferenceContext::Instance().GetCurrentStream(), 3, input.size(1)); return at::from_blob(final_output, @@ -1068,7 +1101,7 @@ at::Tensor ds_linear_layer(at::Tensor& input, -1, false, false, - Context::Instance().GetCurrentStream(), + InferenceContext::Instance().GetCurrentStream(), 3, input.size(1)); return at::from_blob( @@ -1086,7 +1119,7 @@ std::vector add_padding(at::Tensor& query, at::Tensor& key, at::Tens { int head_size = query.size(3); int padded_head_size = head_size < 32 ? 32 : (head_size < 64 ? 64 : 128); - T* workspace = (T*)Context::Instance().GetWorkSpace(); + T* workspace = (T*)InferenceContext::Instance().GetWorkSpace(); T* key_pad_ptr = workspace + padded_head_size * query.size(0) * query.size(1) * query.size(2); T* value_pad_ptr = key_pad_ptr + padded_head_size * query.size(0) * query.size(1) * 128; pad_head_seq(workspace, @@ -1096,7 +1129,7 @@ std::vector add_padding(at::Tensor& query, at::Tensor& key, at::Tens query.size(2), head_size, padded_head_size, - Context::Instance().GetCurrentStream()); + InferenceContext::Instance().GetCurrentStream()); pad_head_seq(key_pad_ptr, (T*)key.data_ptr(), query.size(0) * query.size(1), @@ -1104,7 +1137,7 @@ std::vector add_padding(at::Tensor& query, at::Tensor& key, at::Tens 128, head_size, padded_head_size, - Context::Instance().GetCurrentStream()); + InferenceContext::Instance().GetCurrentStream()); pad_head_seq(value_pad_ptr, (T*)value.data_ptr(), query.size(0) * query.size(1), @@ -1112,7 +1145,7 @@ std::vector add_padding(at::Tensor& query, at::Tensor& key, at::Tens 128, head_size, padded_head_size, - Context::Instance().GetCurrentStream()); + InferenceContext::Instance().GetCurrentStream()); return { at::from_blob(workspace, {query.size(0), query.size(1), query.size(2), padded_head_size}, @@ -1134,7 +1167,7 @@ std::vector padd_add_transform(at::Tensor& query, int key_value_length = add_padding ? 128 : key.size(1); int padded_head_size = add_padding ? (head_size < 32 ? 32 : (head_size < 64 ? 
64 : 128)) : head_size; - T* workspace = (T*)Context::Instance().GetWorkSpace(); + T* workspace = (T*)InferenceContext::Instance().GetWorkSpace(); T* key_pad_ptr = workspace + padded_head_size * query.size(0) * heads * query.size(1); T* value_pad_ptr = key_pad_ptr + padded_head_size * query.size(0) * heads * key_value_length; launch_pad_add_transform_0213(workspace, @@ -1145,7 +1178,7 @@ std::vector padd_add_transform(at::Tensor& query, query.size(1), heads, padded_head_size, - Context::Instance().GetCurrentStream()); + InferenceContext::Instance().GetCurrentStream()); launch_pad_add_transform_0213(key_pad_ptr, (T*)key.data_ptr(), key.size(0), @@ -1154,7 +1187,7 @@ std::vector padd_add_transform(at::Tensor& query, key_value_length, heads, padded_head_size, - Context::Instance().GetCurrentStream()); + InferenceContext::Instance().GetCurrentStream()); launch_pad_add_transform_0213(value_pad_ptr, (T*)value.data_ptr(), value.size(0), @@ -1163,7 +1196,7 @@ std::vector padd_add_transform(at::Tensor& query, key_value_length, heads, padded_head_size, - Context::Instance().GetCurrentStream()); + InferenceContext::Instance().GetCurrentStream()); return { at::from_blob( workspace, {query.size(0), heads, query.size(1), padded_head_size}, query.options()), @@ -1196,7 +1229,7 @@ at::Tensor ds_linear_layer_int8(at::Tensor& input, (T*)bias.data_ptr(), weight.size(1), bsz, - Context::Instance().GetCurrentStream()); + InferenceContext::Instance().GetCurrentStream()); return output; } @@ -1205,7 +1238,8 @@ at::Tensor ds_vector_matmul(at::Tensor& input, at::Tensor& weight, bool async_op, at::Tensor& q_scale, - bool q_int8) + bool q_int8, + bool transposed_mode) { auto options = at::TensorOptions() .dtype(input.options().dtype()) @@ -1215,7 +1249,7 @@ at::Tensor ds_vector_matmul(at::Tensor& input, int out_size = q_int8 ? weight.size(0) : weight.size(1); int bsz = input.size(0) * input.size(1); - T* workspace = (T*)Context::Instance().GetWorkSpace(); + T* workspace = (T*)InferenceContext::Instance().GetWorkSpace(); auto output = at::from_blob(workspace, {input.size(0), input.size(1), out_size}, options); if (q_int8) { quantized_gemm(output.data_ptr(), @@ -1228,12 +1262,12 @@ at::Tensor ds_vector_matmul(at::Tensor& input, } else { float alpha = (T)1.0; float gemm_beta = (T)0.0; - cublasSetStream(Context::Instance().GetCublasHandle(), - Context::Instance().GetCurrentStream(async_op)); - cublas_gemm_ex(Context::Instance().GetCublasHandle(), - CUBLAS_OP_N, + cublasSetStream(InferenceContext::Instance().GetCublasHandle(), + InferenceContext::Instance().GetCurrentStream(async_op)); + cublas_gemm_ex(InferenceContext::Instance().GetCublasHandle(), + (transposed_mode ? CUBLAS_OP_T : CUBLAS_OP_N), CUBLAS_OP_N, - weight.size(1), + weight.size(transposed_mode ? 
0 : 1), bsz, input.size(2), &alpha, @@ -1286,11 +1320,12 @@ at::Tensor mlp_unfused_cublas(at::Tensor& output, at::Tensor& q_scale, at::Tensor& q_scale1, bool q_int8, - ActivationFuncType act_func_type) + ActivationFuncType act_func_type, + bool transposed_mode) { int bsz = input.size(0) * input.size(1); - T* inp_norm = - (T*)Context::Instance().GetWorkSpace() + torch::numel(input) + torch::numel(output); + T* inp_norm = (T*)InferenceContext::Instance().GetWorkSpace() + torch::numel(input) + + torch::numel(output); T* intermediate = inp_norm + torch::numel(input); if (mlp_after_attn) { @@ -1303,7 +1338,7 @@ at::Tensor mlp_unfused_cublas(at::Tensor& output, epsilon, bsz, input.size(2), - Context::Instance().GetCurrentStream()); + InferenceContext::Instance().GetCurrentStream()); } else { ds_layer_norm_internal(inp_norm, input, gamma, beta, epsilon); } @@ -1313,12 +1348,12 @@ at::Tensor mlp_unfused_cublas(at::Tensor& output, } else { float alpha = (T)1.0; float gemm_beta = (T)0.0; - cublasSetStream(Context::Instance().GetCublasHandle(), - Context::Instance().GetCurrentStream()); - cublas_gemm_ex(Context::Instance().GetCublasHandle(), + cublasSetStream(InferenceContext::Instance().GetCublasHandle(), + InferenceContext::Instance().GetCurrentStream()); + cublas_gemm_ex(InferenceContext::Instance().GetCublasHandle(), + (transposed_mode ? CUBLAS_OP_T : CUBLAS_OP_N), CUBLAS_OP_N, - CUBLAS_OP_N, - weight.size(1), + weight.size(transposed_mode ? 0 : 1), bsz, input.size(2), &alpha, @@ -1335,15 +1370,15 @@ at::Tensor mlp_unfused_cublas(at::Tensor& output, if (act_func_type == ActivationFuncType::GELU) { launch_bias_gelu(intermediate, (T*)bias.data_ptr(), - q_int8 ? weight.size(0) : weight.size(1), + (transposed_mode || q_int8) ? weight.size(0) : weight.size(1), bsz, - Context::Instance().GetCurrentStream()); + InferenceContext::Instance().GetCurrentStream()); } else if (act_func_type == ActivationFuncType::ReLU) { launch_bias_relu(intermediate, (T*)bias.data_ptr(), - q_int8 ? weight.size(0) : weight.size(1), + (transposed_mode || q_int8) ? weight.size(0) : weight.size(1), bsz, - Context::Instance().GetCurrentStream()); + InferenceContext::Instance().GetCurrentStream()); } if (q_int8) { @@ -1357,14 +1392,14 @@ at::Tensor mlp_unfused_cublas(at::Tensor& output, } else { float alpha = (T)1.0; float gemm_beta = (T)0.0; - cublasSetStream(Context::Instance().GetCublasHandle(), - Context::Instance().GetCurrentStream()); - cublas_gemm_ex(Context::Instance().GetCublasHandle(), - CUBLAS_OP_N, + cublasSetStream(InferenceContext::Instance().GetCublasHandle(), + InferenceContext::Instance().GetCurrentStream()); + cublas_gemm_ex(InferenceContext::Instance().GetCublasHandle(), + (transposed_mode ? CUBLAS_OP_T : CUBLAS_OP_N), CUBLAS_OP_N, - weight1.size(1), + weight1.size(transposed_mode ? 0 : 1), bsz, - weight1.size(0), + weight1.size(transposed_mode ? 1 : 0), &alpha, &gemm_beta, (T*)weight1.data_ptr(), @@ -1395,7 +1430,8 @@ std::vector ds_mlp_gemm(at::Tensor& input, at::Tensor& q_scale, at::Tensor& q_scale1, bool q_int8, - int activation_type) + int activation_type, + bool transposed_mode) { auto options = at::TensorOptions() .dtype(input.options().dtype()) @@ -1403,10 +1439,11 @@ std::vector ds_mlp_gemm(at::Tensor& input, .device(at::kCUDA) .requires_grad(false); - int out_size = q_int8 ? 
weight_out.size(0) : weight_out.size(1); - auto output = at::from_blob((T*)Context::Instance().GetWorkSpace() + torch::numel(input), - {input.size(0), input.size(1), out_size}, - options); + int out_size = (q_int8 || transposed_mode) ? weight_out.size(0) : weight_out.size(1); + auto output = + at::from_blob((T*)InferenceContext::Instance().GetWorkSpace() + torch::numel(input), + {input.size(0), input.size(1), out_size}, + options); int bsz = input.size(0) * input.size(1); auto act_func_type = static_cast(activation_type); @@ -1425,7 +1462,8 @@ std::vector ds_mlp_gemm(at::Tensor& input, q_scale, q_scale1, q_int8, - act_func_type); + act_func_type, + transposed_mode); return {output, res_add}; } @@ -1461,7 +1499,7 @@ std::vector ds_mlp_gemm_int8(at::Tensor& input, (T*)bias.data_ptr(), weight.size(1), bsz, - Context::Instance().GetCurrentStream()); + InferenceContext::Instance().GetCurrentStream()); return {output, residual_add}; } @@ -1476,7 +1514,8 @@ at::Tensor fused_gemm_gelu(at::Tensor& input, const float epsilon, bool preLayerNorm, bool q_int8, - bool async_op) + bool async_op, + bool transposed_mode) { auto options = at::TensorOptions() .dtype(input.options().dtype()) @@ -1484,9 +1523,10 @@ at::Tensor fused_gemm_gelu(at::Tensor& input, .device(at::kCUDA) .requires_grad(false); - int intm_dim = q_int8 ? weight.size(0) : weight.size(1); + int intm_dim = (transposed_mode || q_int8) ? weight.size(0) : weight.size(1); - // auto output = at::from_blob((T*)Context::Instance().GetWorkSpace() + torch::numel(input), + // auto output = at::from_blob((T*)InferenceContext::Instance().GetWorkSpace() + + // torch::numel(input), // {input.size(0), input.size(1), out_size}, // options); // T* intermediate = (T*)input.data_ptr() + torch::numel(input); @@ -1505,10 +1545,10 @@ at::Tensor fused_gemm_gelu(at::Tensor& input, bsz, input.size(2)); } else { - cublasSetStream(Context::Instance().GetCublasHandle(), - Context::Instance().GetCurrentStream()); - cublas_gemm_ex(Context::Instance().GetCublasHandle(), - CUBLAS_OP_N, + cublasSetStream(InferenceContext::Instance().GetCublasHandle(), + InferenceContext::Instance().GetCurrentStream()); + cublas_gemm_ex(InferenceContext::Instance().GetCublasHandle(), + (transposed_mode ? CUBLAS_OP_T : CUBLAS_OP_N), CUBLAS_OP_N, intm_dim, bsz, @@ -1528,9 +1568,9 @@ at::Tensor fused_gemm_gelu(at::Tensor& input, (T*)bias.data_ptr(), intm_dim, bsz, - Context::Instance().GetCurrentStream()); + InferenceContext::Instance().GetCurrentStream()); - int out_size = q_int8 ? weight_out.size(0) : weight_out.size(1); + int out_size = (transposed_mode || q_int8) ? weight_out.size(0) : weight_out.size(1); auto output = at::empty({input.size(0), input.size(1), out_size}, options); if (q_int8) { quantized_gemm(output.data_ptr(), @@ -1541,8 +1581,8 @@ at::Tensor fused_gemm_gelu(at::Tensor& input, bsz, input.size(2)); } else { - cublas_gemm_ex(Context::Instance().GetCublasHandle(), - CUBLAS_OP_N, + cublas_gemm_ex(InferenceContext::Instance().GetCublasHandle(), + (transposed_mode ? 
CUBLAS_OP_T : CUBLAS_OP_N), CUBLAS_OP_N, out_size, bsz, @@ -1558,8 +1598,8 @@ at::Tensor fused_gemm_gelu(at::Tensor& input, CUBLAS_GEMM_DEFAULT_TENSOR_OP); #endif } - // cudaEventRecord(Context::Instance().GetCompEvent(2), - // Context::Instance().GetCurrentStream(true)); + // cudaEventRecord(InferenceContext::Instance().GetCompEvent(2), + // InferenceContext::Instance().GetCurrentStream(true)); return output; } @@ -1586,7 +1626,7 @@ at::Tensor& residual_add_bias(at::Tensor& hidden_state, hidden_size, mp_size, preln, - Context::Instance().GetCurrentStream()); + InferenceContext::Instance().GetCurrentStream()); else launch_gptj_residual_add( static_cast(residual.data_ptr()), @@ -1597,7 +1637,7 @@ at::Tensor& residual_add_bias(at::Tensor& hidden_state, hidden_size, bsz, mp_size, - Context::Instance().GetCurrentStream()); + InferenceContext::Instance().GetCurrentStream()); return residual; } @@ -1627,8 +1667,8 @@ std::vector apply_rotary_pos_emb(at::Tensor& mixed_query, bsz, rotate_half, rotate_every_two, - Context::Instance().GetCurrentStream(), - Context::Instance().GetMaxTokenLenght()); + InferenceContext::Instance().GetCurrentStream(), + InferenceContext::Instance().GetMaxTokenLenght()); else launch_apply_rotary_pos_emb<__half>((__half*)query_cont.data_ptr(), (__half*)key_cont.data_ptr(), @@ -1640,8 +1680,8 @@ std::vector apply_rotary_pos_emb(at::Tensor& mixed_query, bsz, rotate_half, rotate_every_two, - Context::Instance().GetCurrentStream(), - Context::Instance().GetMaxTokenLenght()); + InferenceContext::Instance().GetCurrentStream(), + InferenceContext::Instance().GetMaxTokenLenght()); return {query_cont, key_cont}; } @@ -1670,7 +1710,7 @@ at::Tensor fused_gemm_gelu_int8(at::Tensor& input, (T*)bias.data_ptr(), weight.size(1), bsz, - Context::Instance().GetCurrentStream()); + InferenceContext::Instance().GetCurrentStream()); return output; } @@ -1679,7 +1719,7 @@ at::Tensor moe_res_matmul(at::Tensor& moe_res, at::Tensor& coef, at::Tensor& out { int M = moe_res.size(0) * moe_res.size(1); int N = moe_res.size(2); - Context::Instance().SynchComm(); + InferenceContext::Instance().SynchComm(); if (moe_res.scalar_type() == at::kFloat) { launch_moe_res_matmul((float*)moe_res.data_ptr(), (float*)coef.data_ptr(), @@ -1698,83 +1738,77 @@ at::Tensor moe_res_matmul(at::Tensor& moe_res, at::Tensor& coef, at::Tensor& out return output; } +void ds_release_workspace() { InferenceContext::Instance().release_workspace(); } + +bool ds_retake_workspace() { return InferenceContext::Instance().retake_workspace(); } + PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { - m.def("softmax_fp32", &ds_softmax, "DeepSpeed SoftMax with fp32 (CUDA)"); - m.def("softmax_fp16", &ds_softmax<__half>, "DeepSpeed SoftMax with fp16 (CUDA)"); - m.def( - "softmax_context_fp32", &ds_softmax_context, "DeepSpeed attention with fp32 (CUDA)"); - m.def("softmax_context_fp16", - &ds_softmax_context<__half>, - "DeepSpeed attention with fp16 (CUDA)"); m.def("softmax_context_int8", &ds_softmax_context1<__half>, "DeepSpeed attention with int8 (CUDA)"); - m.def("bias_gelu_fp32", &ds_bias_gelu, "DeepSpeed Gelu with fp32 (CUDA)"); - m.def("bias_gelu_fp16", &ds_bias_gelu<__half>, "DeepSpeed Gelu with fp16 (CUDA)"); m.def("bias_geglu", &ds_bias_geglu, "DeepSpeed Bias GEGLU (CUDA)"); - m.def("bias_add_fp32", &ds_bias_add, "DeepSpeed Bias Add with fp32 (CUDA)"); - m.def("bias_add_fp16", &ds_bias_add<__half>, "DeepSpeed Gelu with fp16 (CUDA)"); - m.def("bias_relu_fp32", &ds_bias_relu, "DeepSpeed ReLU with fp32 (CUDA)"); - m.def("bias_relu_fp16", 
&ds_bias_relu<__half>, "DeepSpeed ReLU with fp16 (CUDA)"); - m.def("bias_residual_fp32", - &ds_bias_residual, - "DeepSpeed residual-bias add with fp32 (CUDA)"); - m.def("bias_residual_fp16", - &ds_bias_residual<__half>, - "DeepSpeed residual-bias add with fp16 (CUDA)"); m.def("layer_norm", &ds_layer_norm, "DeepSpeed layer norm (CUDA)"); m.def( "_layer_norm_residual", &ds_layer_norm_residual, "DeepSpeed layer norm + residual (CUDA)"); m.def("layer_norm_residual_store_pre_ln_res", &ds_layer_norm_residual_store_pre_ln_res, "DeepSpeed layer norm + store pre Layernorm residual (CUDA)"); - m.def("qkv_gemm_fp32", &ds_qkv_gemm, "DeepSpeed qkv gemm with fp32 (CUDA)"); - m.def("qkv_gemm_fp16", &ds_qkv_gemm<__half>, "DeepSpeed qkv gemm with fp16 (CUDA)"); m.def("qkv_gemm_int8", &ds_qkv_gemm_int8<__half>, "DeepSpeed qkv gemm with int8 (CUDA)"); - m.def("mlp_gemm_fp32", &ds_mlp_gemm, "DeepSpeed mlp with fp32 (CUDA)"); - m.def("mlp_gemm_fp16", &ds_mlp_gemm<__half>, "DeepSpeed mlp with fp16 (CUDA)"); m.def("mlp_gemm_int8", &ds_mlp_gemm_int8<__half>, "DeepSpeed mlp with int8 (CUDA)"); - m.def("vector_matmul_fp32", &ds_vector_matmul, "DeepSpeed vector-MM with fp32 (CUDA)"); - m.def("vector_matmul_fp16", &ds_vector_matmul<__half>, "DeepSpeed vector-MM with fp16 (CUDA)"); m.def("vector_matmul_int8", &ds_vector_matmul_int8<__half>, "DeepSpeed vector-MM with int8 (CUDA)"); - m.def("linear_layer_fp32", &ds_linear_layer, "DeepSpeed linear_layer with fp32 (CUDA)"); - m.def("linear_layer_fp16", &ds_linear_layer<__half>, "DeepSpeed linear_layer with fp16 (CUDA)"); m.def("linear_layer_int8", &ds_linear_layer_int8<__half>, "DeepSpeed linear_layer with int8 (CUDA)"); - m.def("fused_gemm_gelu_fp32", &fused_gemm_gelu, "DeepSpeed mlp with fp32 (CUDA)"); - m.def("fused_gemm_gelu_fp16", &fused_gemm_gelu<__half>, "DeepSpeed mlp with fp16 (CUDA)"); - m.def("residual_add_bias_fp32", - &residual_add_bias, - "DeepSpeed residual add with fp32 (CUDA)"); - m.def("residual_add_bias_fp16", - &residual_add_bias<__half>, - "DeepSpeed residual add with fp16 (CUDA)"); m.def("apply_rotary_pos_emb", &apply_rotary_pos_emb, "DeepSpeed mlp with fp16 (CUDA)"); - m.def("einsum_sec_sm_ecm_fp32", - &einsum_sec_sm_ecm, - "DeepSpeed vector-MM with fp32 (CUDA)"); - - m.def("einsum_sec_sm_ecm_fp16", - &einsum_sec_sm_ecm<__half>, - "DeepSpeed vector-MM with fp16 (CUDA)"); m.def("moe_res_matmul", &moe_res_matmul, "DeepSpeed moe residual matmul (CUDA)"); - m.def("add_padding_fp32", &add_padding, "DeepSpeed residual add with fp32 (CUDA)"); - m.def("add_padding_fp16", &add_padding<__half>, "DeepSpeed residual add with fp16 (CUDA)"); - m.def("pad_transform_fp32", - &padd_add_transform, - "DeepSpeed residual add with fp32 (CUDA)"); - m.def("pad_transform_fp16", - &padd_add_transform<__half>, - "DeepSpeed residual add with fp16 (CUDA)"); - m.def("allocate_workspace_fp32", - &allocate_workspace, - "DeepSpeed memory allocation for GPT inference with fp32 (CUDA)"); - m.def("allocate_workspace_fp16", - &allocate_workspace<__half>, - "DeepSpeed memory allocation for GPT inference with fp16 (CUDA)"); m.def("reset_cache", &reset_cache, "Reset Cache for generation tasks"); + m.def("release_workspace", &ds_release_workspace, "DeepSpeed Release Workspace"); + m.def("retake_workspace", &ds_retake_workspace, "DeepSpeed Retake Workspace"); + +#define DEF_OPS(_name, _dtype) \ + m.def("softmax_" #_name, &ds_softmax<_dtype>, "DeepSpeed SoftMax with " #_name " (CUDA)"); \ + m.def("softmax_context_" #_name, \ + &ds_softmax_context<_dtype>, \ + "DeepSpeed attention with _name 
(CUDA)"); \ + m.def("bias_gelu_" #_name, &ds_bias_gelu<_dtype>, "DeepSpeed Gelu with " #_name " (CUDA)"); \ + m.def("bias_add_" #_name, &ds_bias_add<_dtype>, "DeepSpeed Bias Add with " #_name " (CUDA)"); \ + m.def("bias_relu_" #_name, &ds_bias_relu<_dtype>, "DeepSpeed ReLU with " #_name " (CUDA)"); \ + m.def("bias_residual_" #_name, \ + &ds_bias_residual<_dtype>, \ + "DeepSpeed residual-bias add with " #_name " (CUDA)"); \ + m.def("qkv_gemm_" #_name, &ds_qkv_gemm<_dtype>, "DeepSpeed qkv gemm with " #_name " (CUDA)"); \ + m.def("mlp_gemm_" #_name, &ds_mlp_gemm<_dtype>, "DeepSpeed mlp with " #_name " (CUDA)"); \ + m.def("vector_matmul_" #_name, \ + &ds_vector_matmul<_dtype>, \ + "DeepSpeed vector-MM with " #_name " (CUDA)"); \ + m.def("linear_layer_" #_name, \ + &ds_linear_layer<_dtype>, \ + "DeepSpeed linear_layer with " #_name " (CUDA)"); \ + m.def("fused_gemm_gelu_" #_name, \ + &fused_gemm_gelu<_dtype>, \ + "DeepSpeed mlp with " #_name " (CUDA)"); \ + m.def("residual_add_bias_" #_name, \ + &residual_add_bias<_dtype>, \ + "DeepSpeed residual add with " #_name " (CUDA)"); \ + m.def("einsum_sec_sm_ecm_" #_name, \ + &einsum_sec_sm_ecm<_dtype>, \ + "DeepSpeed vector-MM with " #_name " (CUDA)"); \ + m.def("add_padding_" #_name, \ + &add_padding<_dtype>, \ + "DeepSpeed residual add with " #_name " (CUDA)"); \ + m.def("pad_transform_" #_name, \ + &padd_add_transform<_dtype>, \ + "DeepSpeed residual add with " #_name " (CUDA)"); \ + m.def("allocate_workspace_" #_name, \ + &allocate_workspace<_dtype>, \ + "DeepSpeed memory allocation for GPT inference with " #_name " (CUDA)") + + DEF_OPS(fp32, float); + DEF_OPS(fp16, __half); +#ifdef BF16_AVAILABLE + DEF_OPS(bf16, __nv_bfloat16); +#endif } diff --git a/csrc/transformer/inference/csrc/relu.cu b/csrc/transformer/inference/csrc/relu.cu index 87e169a9194ff714fa15ecbef165944d95eca1e9..fd38d1a88d4b67ce4343d4e25ba8159a0eb280d1 100644 --- a/csrc/transformer/inference/csrc/relu.cu +++ b/csrc/transformer/inference/csrc/relu.cu @@ -1,6 +1,7 @@ -/* -Copyright 2022 The Microsoft DeepSpeed Team -*/ +// Copyright (c) Microsoft Corporation. +// SPDX-License-Identifier: Apache-2.0 + +// DeepSpeed Team #include "conversion_utils.h" #include "inference_cuda_layers.h" @@ -60,4 +61,11 @@ void launch_bias_relu(T* input, } template void launch_bias_relu(float*, const float*, int, int, cudaStream_t); +#ifdef BF16_AVAILABLE +template void launch_bias_relu<__nv_bfloat16>(__nv_bfloat16*, + const __nv_bfloat16*, + int, + int, + cudaStream_t); +#endif template void launch_bias_relu<__half>(__half*, const __half*, int, int, cudaStream_t); diff --git a/csrc/transformer/inference/csrc/softmax.cu b/csrc/transformer/inference/csrc/softmax.cu index c5f04176203b444c196bff72827850b1bb8cc6ad..7befdfd51497975b642f59286e0c7f0867d23b09 100644 --- a/csrc/transformer/inference/csrc/softmax.cu +++ b/csrc/transformer/inference/csrc/softmax.cu @@ -1,8 +1,10 @@ -/* -Copyright 2022 The Microsoft DeepSpeed Team -*/ +// Copyright (c) Microsoft Corporation. 
+// SPDX-License-Identifier: Apache-2.0 + +// DeepSpeed Team #include +#include "conversion_utils.h" #include "inference_cuda_layers.h" #ifndef __HIP_PLATFORM_HCC__ @@ -12,7 +14,6 @@ Copyright 2022 The Microsoft DeepSpeed Team #include #include -#define ATTN_THREADS 256 #define MAX_REG_SIZE 8 #define minus_infinity -10000.0 @@ -30,9 +31,10 @@ void CheckCudaErrorAux(const char* file, unsigned line) namespace cg = cooperative_groups; -__global__ void attn_softmax_v2(__half* vals, - __half* mask, - __half* alibi, +template +__global__ void attn_softmax_v2(T* vals, + T* mask, + T* alibi, float layer_scale, bool triangular, bool recompute, @@ -45,7 +47,6 @@ __global__ void attn_softmax_v2(__half* vals, int head_offset, int mask_stride, int mp_size, - int iterations, int reduceWidth) { cg::thread_block b = cg::this_thread_block(); @@ -53,7 +54,7 @@ __global__ void attn_softmax_v2(__half* vals, float2 low_data[MAX_REG_SIZE]; float2 high_data[MAX_REG_SIZE]; - const __half zero_h = __float2half(0.f); + const T zero_h = conversion::to(0.f); int wid = threadIdx.x >> 5; int lane = threadIdx.x & 0x1f; @@ -75,7 +76,6 @@ __global__ void attn_softmax_v2(__half* vals, alibi_offset = (alibi_offset + ((iter_offset / num_seq) % heads)) * sequence_length; mask_offset = mask_offset * sequence_length; int seq_id = iter_offset % num_seq; - int seq_id4 = seq_id >> 2; int real_seq_id = seq_id + (num_seq == sequence_length ? 0 : sequence_length); int window_stride4 = (local_attention && (real_seq_id >> 2) > (window_size >> 2)) @@ -87,83 +87,109 @@ __global__ void attn_softmax_v2(__half* vals, float max_val = minus_infinity; // if (lane == 0) printf("%d, %d: %d \n", wid, blockIdx.x, mask_offset); for (int i = 0; i < iterations; i++) { - int data_id = i * (reduceWidth << 2) + (seq_lane << 2); - if ((!triangular || ((data_id >> 2) <= seq_id4)) && (data_id >> 2) >= window_stride4 && - data_id < sequence_length) { - if ((sequence_length - data_id) >= 4) { - low_data[i].x = data_id > window_stride - ? __half2float(vals[data_id]) * layer_scale - : minus_infinity; - low_data[i].y = ((!triangular || ((data_id + 1) <= seq_id)) && - (data_id + 1) > window_stride) - ? __half2float(vals[data_id + 1]) * layer_scale - : minus_infinity; - high_data[i].x = ((!triangular || ((data_id + 2) <= seq_id)) && - (data_id + 2) > window_stride) - ? __half2float(vals[data_id + 2]) * layer_scale - : minus_infinity; - high_data[i].y = ((!triangular || ((data_id + 3) <= seq_id)) && - (data_id + 3) > window_stride) - ? __half2float(vals[data_id + 3]) * layer_scale - : minus_infinity; - if (alibi) { - low_data[i].x = low_data[i].x + __half2float(alibi[data_id + alibi_offset]); - low_data[i].y = - low_data[i].y + __half2float(alibi[data_id + alibi_offset + 1]); - high_data[i].x = - high_data[i].x + __half2float(alibi[data_id + alibi_offset + 2]); - high_data[i].y = - high_data[i].y + __half2float(alibi[data_id + alibi_offset + 3]); - } - if (mask) { - low_data[i].x += __half2float(mask[data_id + mask_offset]); - low_data[i].y += __half2float(mask[data_id + mask_offset + 1]); - high_data[i].x += __half2float(mask[data_id + mask_offset + 2]); - high_data[i].y += __half2float(mask[data_id + mask_offset + 3]); - } - } else { - low_data[i].x = data_id > window_stride - ? __half2float(vals[data_id]) * layer_scale - : minus_infinity; - low_data[i].y = (((!triangular || (data_id + 1) <= seq_id) && - (data_id + 1) > window_stride) && - (data_id + 1) < sequence_length) - ? 
__half2float(vals[data_id + 1]) * layer_scale - : minus_infinity; - high_data[i].x = (((!triangular || (data_id + 2) <= seq_id) && - (data_id + 2) > window_stride) && - (data_id + 2) < sequence_length) - ? __half2float(vals[data_id + 2]) * layer_scale - : minus_infinity; - if (alibi) { - low_data[i].x = low_data[i].x + __half2float(alibi[data_id + alibi_offset]); - if ((data_id + 1) < sequence_length) - low_data[i].y = - low_data[i].y + __half2float(alibi[data_id + alibi_offset + 1]); - if ((data_id + 2) < sequence_length) - high_data[i].x = - high_data[i].x + __half2float(alibi[data_id + alibi_offset + 2]); - } - high_data[i].y = minus_infinity; - if (mask) { - low_data[i].x += __half2float(mask[data_id + mask_offset]); - if ((data_id + 1) < sequence_length) - low_data[i].y += __half2float(mask[data_id + mask_offset + 1]); - if ((data_id + 2) < sequence_length) - high_data[i].x += __half2float(mask[data_id + mask_offset + 2]); - } - } - // if(lane == 0) printf("%f , %d, %d \n", low_data[i].x, data_id, seq_id); - max_val = (low_data[i].x > max_val ? low_data[i].x : max_val); - max_val = (low_data[i].y > max_val ? low_data[i].y : max_val); - max_val = (high_data[i].x > max_val ? high_data[i].x : max_val); - max_val = (high_data[i].y > max_val ? high_data[i].y : max_val); + int data_id = i * (reduceWidth << 2) + (seq_lane); + bool check = (data_id >> 2) >= window_stride4; + bool low_x_check = check && (data_id < sequence_length) && + (!triangular || (data_id <= seq_id)) && (data_id > window_stride); + bool low_y_check = check && ((data_id + reduceWidth) < sequence_length) && + (!triangular || ((data_id + reduceWidth) <= seq_id)) && + ((data_id + reduceWidth) > window_stride); + bool high_x_check = check && ((data_id + reduceWidth * 2) < sequence_length) && + (!triangular || ((data_id + reduceWidth * 2) <= seq_id)) && + ((data_id + reduceWidth * 2) > window_stride); + bool high_y_check = check && ((data_id + reduceWidth * 3) < sequence_length) && + (!triangular || ((data_id + reduceWidth * 3) <= seq_id)) && + ((data_id + reduceWidth * 3) > window_stride); + + if (mask && alibi) { + low_data[i].x = low_x_check + ? conversion::to(vals[data_id]) * layer_scale + + (conversion::to(alibi[data_id + alibi_offset])) + + (conversion::to(mask[data_id + mask_offset])) + : minus_infinity; + low_data[i].y = + low_y_check + ? conversion::to(vals[data_id + reduceWidth]) * layer_scale + + (conversion::to(alibi[data_id + alibi_offset + reduceWidth])) + + (conversion::to(mask[data_id + mask_offset + reduceWidth])) + : minus_infinity; + high_data[i].x = + high_x_check + ? conversion::to(vals[data_id + reduceWidth * 2]) * layer_scale + + (conversion::to( + alibi[data_id + alibi_offset + reduceWidth * 2])) + + (conversion::to(mask[data_id + mask_offset + reduceWidth * 2])) + : minus_infinity; + high_data[i].y = + high_y_check + ? conversion::to(vals[data_id + reduceWidth * 3]) * layer_scale + + (conversion::to( + alibi[data_id + alibi_offset + reduceWidth * 3])) + + (conversion::to(mask[data_id + mask_offset + reduceWidth * 3])) + : minus_infinity; + } else if (mask) { + low_data[i].x = low_x_check + ? conversion::to(vals[data_id]) * layer_scale + + (conversion::to(mask[data_id + mask_offset])) + : minus_infinity; + low_data[i].y = + low_y_check + ? conversion::to(vals[data_id + reduceWidth]) * layer_scale + + (conversion::to(mask[data_id + mask_offset + reduceWidth])) + : minus_infinity; + high_data[i].x = + high_x_check + ? 
conversion::to(vals[data_id + reduceWidth * 2]) * layer_scale + + (conversion::to(mask[data_id + mask_offset + reduceWidth * 2])) + : minus_infinity; + high_data[i].y = + high_y_check + ? conversion::to(vals[data_id + reduceWidth * 3]) * layer_scale + + (conversion::to(mask[data_id + mask_offset + reduceWidth * 3])) + : minus_infinity; + } else if (alibi) { + low_data[i].x = low_x_check + ? conversion::to(vals[data_id]) * layer_scale + + (conversion::to(alibi[data_id + alibi_offset])) + : minus_infinity; + low_data[i].y = + low_y_check + ? conversion::to(vals[data_id + reduceWidth]) * layer_scale + + (conversion::to(alibi[data_id + alibi_offset + reduceWidth])) + : minus_infinity; + high_data[i].x = + high_x_check + ? conversion::to(vals[data_id + reduceWidth * 2]) * layer_scale + + (conversion::to( + alibi[data_id + alibi_offset + reduceWidth * 2])) + : minus_infinity; + high_data[i].y = + high_y_check + ? conversion::to(vals[data_id + reduceWidth * 3]) * layer_scale + + (conversion::to( + alibi[data_id + alibi_offset + reduceWidth * 3])) + : minus_infinity; } else { - low_data[i].x = minus_infinity; - low_data[i].y = minus_infinity; - high_data[i].x = minus_infinity; - high_data[i].y = minus_infinity; + low_data[i].x = low_x_check ? conversion::to(vals[data_id]) * layer_scale + : minus_infinity; + low_data[i].y = + low_y_check ? conversion::to(vals[data_id + reduceWidth]) * layer_scale + : minus_infinity; + high_data[i].x = + high_x_check + ? conversion::to(vals[data_id + reduceWidth * 2]) * layer_scale + : minus_infinity; + high_data[i].y = + high_y_check + ? conversion::to(vals[data_id + reduceWidth * 3]) * layer_scale + : minus_infinity; } + + // if(lane == 0) printf("%f , %d, %d \n", low_data[i].x, data_id, seq_id); + max_val = (low_data[i].x > max_val ? low_data[i].x : max_val); + max_val = (low_data[i].y > max_val ? low_data[i].y : max_val); + max_val = (high_data[i].x > max_val ? high_data[i].x : max_val); + max_val = (high_data[i].y > max_val ? 
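// [Editor's note, not part of the original diff] Illustrative trace of the new access
// pattern, assuming reduceWidth = 32 and iterations = 1 (hypothetical values):
//   lane 0  reads columns 0, 32, 64, 96
//   lane 1  reads columns 1, 33, 65, 97
//   ...
//   lane 31 reads columns 31, 63, 95, 127
// whereas the removed code gave lane 0 the contiguous run 0..3, lane 1 the run 4..7, and
// so on. Each lane still owns four values per iteration; only the spacing between them
// changes (data_id, +reduceWidth, +2*reduceWidth, +3*reduceWidth), so the per-element
// low_x_check .. high_y_check predicates replace the old
// "(sequence_length - data_id) >= 4" fast path and its separate tail-handling branch.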
high_data[i].y : max_val); } for (int i = 1; i < WARP_SIZE; i *= 2) { @@ -212,26 +238,21 @@ __global__ void attn_softmax_v2(__half* vals, } sum += 1e-6; for (int i = 0; i < iterations; i++) { - int data_id = i * (reduceWidth << 2) + (seq_lane << 2); - + int data_id = i * (reduceWidth << 2) + (seq_lane); if (data_id < sequence_length) { - if ((sequence_length - data_id) >= 4) { - vals[data_id] = __float2half(low_data[i].x / sum); - vals[data_id + 1] = __float2half(low_data[i].y / sum); - vals[data_id + 2] = __float2half(high_data[i].x / sum); - vals[data_id + 3] = __float2half(high_data[i].y / sum); - } else { - vals[data_id] = __float2half(low_data[i].x / sum); - if ((data_id + 1) < sequence_length) - vals[data_id + 1] = __float2half(low_data[i].y / sum); - if ((data_id + 2) < sequence_length) - vals[data_id + 2] = __float2half(high_data[i].x / sum); - } + vals[data_id] = conversion::to(low_data[i].x / sum); + if ((data_id + reduceWidth) < sequence_length) + vals[data_id + reduceWidth] = conversion::to(low_data[i].y / sum); + if ((data_id + reduceWidth * 2) < sequence_length) + vals[data_id + reduceWidth * 2] = conversion::to(high_data[i].x / sum); + if ((data_id + reduceWidth * 3) < sequence_length) + vals[data_id + reduceWidth * 3] = conversion::to(high_data[i].y / sum); } } } } +template __global__ void attn_softmax_v2(float* vals, float* attn_mask, float* alibi, @@ -247,7 +268,6 @@ __global__ void attn_softmax_v2(float* vals, int head_offset, int mask_stride, int mp_size, - int iterations, int reduceWidth) { cg::thread_block b = cg::this_thread_block(); @@ -269,11 +289,9 @@ __global__ void attn_softmax_v2(float* vals, vals += (iter_offset * sequence_length); int batch_idx = iter_offset / (num_seq * heads); - int alibi_offset = batch_idx * heads * mp_size + head_offset; int mask_offset = batch_idx * mask_stride + (iter_offset % mask_stride); mask_offset = mask_offset * sequence_length; int seq_id = iter_offset % num_seq; - int seq_id4 = seq_id >> 2; int real_seq_id = seq_id + (num_seq == sequence_length ? 0 : sequence_length); int window_stride4 = (local_attention && (real_seq_id >> 2) > (window_size >> 2)) @@ -285,58 +303,43 @@ __global__ void attn_softmax_v2(float* vals, float max_val = minus_infinity; for (int i = 0; i < iterations; i++) { - int data_id = i * (reduceWidth << 2) + (seq_lane << 2); - if ((!triangular || ((data_id >> 2) <= seq_id4)) && (data_id >> 2) >= window_stride4 && - data_id < sequence_length) { - if ((sequence_length - data_id) >= 4) { - data[i].x = (data_id > window_stride ? vals[data_id] : minus_infinity); - data[i].y = ((!triangular || ((data_id + 1) <= seq_id)) && - (data_id + 1) > window_stride) - ? vals[data_id + 1] + int data_id = i * (reduceWidth << 2) + (seq_lane); + bool check = (data_id >> 2) >= window_stride4; + bool x_check = check && (data_id < sequence_length) && + (!triangular || (data_id <= seq_id)) && (data_id > window_stride); + bool y_check = check && ((data_id + reduceWidth) < sequence_length) && + (!triangular || ((data_id + reduceWidth) <= seq_id)) && + ((data_id + reduceWidth) > window_stride); + bool z_check = check && ((data_id + reduceWidth * 2) < sequence_length) && + (!triangular || ((data_id + reduceWidth * 2) <= seq_id)) && + ((data_id + reduceWidth * 2) > window_stride); + bool w_check = check && ((data_id + reduceWidth * 3) < sequence_length) && + (!triangular || ((data_id + reduceWidth * 3) <= seq_id)) && + ((data_id + reduceWidth * 3) > window_stride); + + if (attn_mask) { + data[i].x = x_check ? 
vals[data_id] + attn_mask[data_id + mask_offset] : minus_infinity; - data[i].z = ((!triangular || ((data_id + 2) <= seq_id)) && - (data_id + 2) > window_stride) - ? vals[data_id + 2] + data[i].y = y_check ? vals[data_id + reduceWidth] + + attn_mask[data_id + mask_offset + reduceWidth] : minus_infinity; - data[i].w = ((!triangular || ((data_id + 3) <= seq_id)) && - (data_id + 3) > window_stride) - ? vals[data_id + 3] + data[i].z = z_check ? vals[data_id + reduceWidth * 2] + + attn_mask[data_id + mask_offset + reduceWidth * 2] : minus_infinity; - if (attn_mask) { - data[i].x += attn_mask[data_id + mask_offset]; - data[i].y += attn_mask[data_id + mask_offset + 1]; - data[i].z += attn_mask[data_id + mask_offset + 2]; - data[i].w += attn_mask[data_id + mask_offset + 3]; - } - } else { - data[i].x = data_id > window_stride ? vals[data_id] : minus_infinity; - data[i].y = (((!triangular || (data_id + 1) <= seq_id)) && - (data_id + 1) > window_stride && (data_id + 1) < sequence_length) - ? (vals[data_id + 1]) + data[i].w = w_check ? vals[data_id + reduceWidth * 3] + + attn_mask[data_id + mask_offset + reduceWidth * 3] : minus_infinity; - data[i].z = (((!triangular || (data_id + 2) <= seq_id)) && - (data_id + 2) > window_stride && (data_id + 2) < sequence_length) - ? (vals[data_id + 2]) - : minus_infinity; - data[i].w = minus_infinity; - if (attn_mask) { - data[i].x += attn_mask[data_id + mask_offset]; - if ((data_id + 1) < sequence_length) - data[i].y += attn_mask[data_id + mask_offset + 1]; - if ((data_id + 2) < sequence_length) - data[i].z += attn_mask[data_id + mask_offset + 2]; - } - } - max_val = (data[i].x > max_val ? data[i].x : max_val); - max_val = (data[i].y > max_val ? data[i].y : max_val); - max_val = (data[i].z > max_val ? data[i].z : max_val); - max_val = (data[i].w > max_val ? data[i].w : max_val); } else { - data[i].x = minus_infinity; - data[i].y = minus_infinity; - data[i].z = minus_infinity; - data[i].w = minus_infinity; + data[i].x = x_check ? vals[data_id] : minus_infinity; + data[i].y = y_check ? vals[data_id + reduceWidth] : minus_infinity; + data[i].z = z_check ? vals[data_id + reduceWidth * 2] : minus_infinity; + data[i].w = w_check ? vals[data_id + reduceWidth * 3] : minus_infinity; } + + max_val = (data[i].x > max_val ? data[i].x : max_val); + max_val = (data[i].y > max_val ? data[i].y : max_val); + max_val = (data[i].z > max_val ? data[i].z : max_val); + max_val = (data[i].w > max_val ? 
data[i].w : max_val); } for (int i = 1; i < WARP_SIZE; i *= 2) { @@ -387,24 +390,38 @@ __global__ void attn_softmax_v2(float* vals, sum += 1e-6; for (int i = 0; i < iterations; i++) { - int data_id = i * (reduceWidth << 2) + (seq_lane << 2); - + int data_id = i * (reduceWidth << 2) + (seq_lane); if (data_id < sequence_length) { - if ((sequence_length - data_id) >= 4) { - vals[data_id] = data[i].x / sum; - vals[data_id + 1] = data[i].y / sum; - vals[data_id + 2] = data[i].z / sum; - vals[data_id + 3] = data[i].w / sum; - } else { - vals[data_id] = data[i].x / sum; - if ((data_id + 1) < sequence_length) vals[data_id + 1] = data[i].y / sum; - if ((data_id + 2) < sequence_length) vals[data_id + 2] = data[i].z / sum; - } + vals[data_id] = data[i].x / sum; + if ((data_id + reduceWidth) < sequence_length) + vals[data_id + reduceWidth] = data[i].y / sum; + if ((data_id + reduceWidth * 2) < sequence_length) + vals[data_id + reduceWidth * 2] = data[i].z / sum; + if ((data_id + reduceWidth * 3) < sequence_length) + vals[data_id + reduceWidth * 3] = data[i].w / sum; } } } } +#define LAUNCH_ATTN_SOFTMAX_V2(iterations) \ + attn_softmax_v2<<>>(vals, \ + mask, \ + alibi, \ + layer_scale, \ + triangular, \ + recompute, \ + local_attention, \ + window_size, \ + total_count, \ + heads, \ + sequence_length, \ + num_seq, \ + head_offset, \ + mask_stride, \ + mp_size, \ + reduce_width); + template void launch_attn_softmax_v2(T* vals, T* mask, @@ -423,34 +440,50 @@ void launch_attn_softmax_v2(T* vals, int mp_size, cudaStream_t stream) { - int total_count = batch_size * heads * num_seq; - int warp_num = ATTN_THREADS / WARP_SIZE; - int reduce_width = ((sequence_length - 1) / ATTN_THREADS + 1); - reduce_width = (int)pow(2.0, floor(log2((float)(reduce_width)))) * WARP_SIZE; - dim3 grid_dim((total_count - 1) / (ATTN_THREADS / reduce_width) + 1); - dim3 block_dim(ATTN_THREADS); - - const int iterations = (sequence_length - 1) / (reduce_width << 2) + 1; - - if (sequence_length <= 32768) - attn_softmax_v2<<>>(vals, - mask, - alibi, - layer_scale, - triangular, - recompute, - local_attention, - window_size, - total_count, - heads, - sequence_length, - num_seq, - head_offset, - mask_stride, - mp_size, - iterations, - reduce_width); - else + const int total_count = batch_size * heads * num_seq; + + // Scheduling Overview + // 4 element unroll with power of 2 `reduce_width` threads to a ceiling of `attn_threads` + // Each block should be partitioned into as many `reduce_width` blocks + // as can be fit. + constexpr int attn_threads = 256; + constexpr int min_reduce_width = hw_warp_size; + constexpr int internal_unroll = 4; + + // Handle internal unroll then round to next power of 2. Bump up to minimum granularity. + const int thread_steps_rounded = + next_pow2((sequence_length + internal_unroll - 1) / internal_unroll); + const int thread_steps_schedule = + (thread_steps_rounded < min_reduce_width) ? min_reduce_width : thread_steps_rounded; + // Bound reduce width to the number of threads + const int reduce_width = (thread_steps_schedule < attn_threads) ? 
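// [Editor's note, not part of the original diff] Worked example of this schedule,
// assuming hw_warp_size = 32 and taking next_pow2() to round up to the nearest power
// of two (both assumptions, not stated in this hunk):
//   sequence_length = 2048:
//     thread_steps_rounded  = next_pow2((2048 + 3) / 4) = next_pow2(512) = 512
//     thread_steps_schedule = max(512, 32)  = 512
//     reduce_width          = min(512, 256) = 256   (capped at attn_threads)
//     iterations            = 512 / 256     = 2
//     partitions            = 256 / 256     = 1     -> one softmax row per block
//   sequence_length = 80:
//     thread_steps_rounded = next_pow2(20) = 32, reduce_width = 32,
//     iterations = 1, partitions = 256 / 32 = 8     -> eight rows per block
// Because every quantity above is a power of two, `iterations` is always one of
// 1, 2, 4, ..., 64 for the supported sequence lengths, which is what the explicit
// dispatch below (LAUNCH_ATTN_SOFTMAX_V2(1) ... LAUNCH_ATTN_SOFTMAX_V2(64)) covers.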
thread_steps_schedule + : attn_threads; + // Scale for the excess + const int iterations = thread_steps_schedule / reduce_width; + // Should be safe since reduce_width is capped to attn_threads + const int partitions = attn_threads / reduce_width; + + // Launch params + dim3 grid((total_count + partitions - 1) / partitions); + dim3 block(attn_threads); + + if (sequence_length <= 32768) { + if (iterations == 1) { + LAUNCH_ATTN_SOFTMAX_V2(1); + } else if (iterations == 2) { + LAUNCH_ATTN_SOFTMAX_V2(2); + } else if (iterations == 4) { + LAUNCH_ATTN_SOFTMAX_V2(4); + } else if (iterations == 8) { + LAUNCH_ATTN_SOFTMAX_V2(8); + } else if (iterations == 16) { + LAUNCH_ATTN_SOFTMAX_V2(16); + } else if (iterations == 32) { + LAUNCH_ATTN_SOFTMAX_V2(32); + } else if (iterations == 64) { + LAUNCH_ATTN_SOFTMAX_V2(64); + } + } else throw std::runtime_error("Unsupport Seq_Length!"); } @@ -470,6 +503,26 @@ template void launch_attn_softmax_v2(float* vals, int mask_stride, int mp_size, cudaStream_t stream); + +#ifdef BF16_AVAILABLE +template void launch_attn_softmax_v2(__nv_bfloat16* vals, + __nv_bfloat16* mask, + __nv_bfloat16* alibi, + float layer_scale, + bool triangular, + bool recompute, + bool local_attention, + int window_size, + int batch_size, + int heads, + int num_seq, + int sequence_length, + int head_offset, + int mask_stride, + int mp_size, + cudaStream_t stream); +#endif + template void launch_attn_softmax_v2(__half* vals, __half* mask, __half* alibi, @@ -486,3 +539,53 @@ template void launch_attn_softmax_v2(__half* vals, int mask_stride, int mp_size, cudaStream_t stream); + +#define DEF_ATTN_SOFTMAX_V2_HALF(_iter) \ + template __global__ void attn_softmax_v2<__half, _iter>(__half * vals, \ + __half * mask, \ + __half * alibi, \ + float layer_scale, \ + bool triangular, \ + bool recompute, \ + bool local_attention, \ + int window_size, \ + int total_count, \ + int heads, \ + int sequence_length, \ + int num_seq, \ + int head_offset, \ + int mask_stride, \ + int mp_size, \ + int reduceWidth) + +#define DEF_ATTN_SOFTMAX_V2_BF16(_iter) \ + template __global__ void attn_softmax_v2<__nv_bfloat16, _iter>(__nv_bfloat16 * vals, \ + __nv_bfloat16 * mask, \ + __nv_bfloat16 * alibi, \ + float layer_scale, \ + bool triangular, \ + bool recompute, \ + bool local_attention, \ + int window_size, \ + int total_count, \ + int heads, \ + int sequence_length, \ + int num_seq, \ + int head_offset, \ + int mask_stride, \ + int mp_size, \ + int reduceWidth) + +#define FOREACH_ITERATIONS(cb) \ + cb(1); \ + cb(2); \ + cb(4); \ + cb(8); \ + cb(16); \ + cb(32); \ + cb(64) + +FOREACH_ITERATIONS(DEF_ATTN_SOFTMAX_V2_HALF); +#ifdef BF16_AVAILABLE +FOREACH_ITERATIONS(DEF_ATTN_SOFTMAX_V2_BF16); +#endif diff --git a/csrc/transformer/inference/csrc/transform.cu b/csrc/transformer/inference/csrc/transform.cu index 023e02fe1c5271fba7100d66bb1a802b8759b3b3..00d919039b5d74f30702920f1e89e1e9b8cac2cc 100644 --- a/csrc/transformer/inference/csrc/transform.cu +++ b/csrc/transformer/inference/csrc/transform.cu @@ -1,13 +1,20 @@ -/* -Copyright 2022 The Microsoft DeepSpeed Team -*/ +// Copyright (c) Microsoft Corporation. +// SPDX-License-Identifier: Apache-2.0 + +// DeepSpeed Team #ifndef __HIP_PLATFORM_HCC__ #include #endif +#include "conversion_utils.h" #include "inference_cuda_layers.h" namespace cg = cooperative_groups; +// only used to avoid compilation error due to lack of definition. 
+#ifndef BF16_AVAILABLE +using __nv_bfloat162 = __half2; +#endif + // Bias add __global__ void bias_add_transform_0213(float* output, @@ -74,11 +81,12 @@ __global__ void bias_add_transform_0213(float* output, #define ATTN_H 3 #define MAX_SEQ_LINE 10 -__global__ void bias_add_transform_0213(__half* output, // q - __half* k_cache, - __half* v_cache, - const __half* vals, // qkv - const __half* bias, +template <typename T> +__global__ void bias_add_transform_0213(T* output, // q + T* k_cache, + T* v_cache, + const T* vals, // qkv + const T* bias, int hidden_dim, int seq_length, unsigned seq_offset, @@ -90,6 +98,8 @@ __global__ void bias_add_transform_0213(__half* output, // q int head_ext, int max_out_tokens) { + using T2 = + typename std::conditional<std::is_same<T, __half>::value, __half2, __nv_bfloat162>::type; unsigned half_dim = (rotary_dim << 3) >> 1; int d0_stride = hidden_dim * seq_length; int d1_stride = hidden_dim; @@ -107,8 +117,8 @@ __global__ void bias_add_transform_0213(__half* output, // q float4 vals_arr; float4 output_arr; - __half2* vals_half = reinterpret_cast<__half2*>(&vals_arr); - __half2* output_half = reinterpret_cast<__half2*>(&output_arr); + T2* vals_half = reinterpret_cast<T2*>(&vals_arr); + T2* output_half = reinterpret_cast<T2*>(&output_arr); const float4* vals_vec = reinterpret_cast<const float4*>(vals); float4* output_vec = @@ -128,17 +138,19 @@ __global__ void bias_add_transform_0213(__half* output, // q int lane = d3 & 0x1f; if (cnt < 2 && rotary_dim > 0 && d3 < rotary_dim) { float4 q = vals_vec[d3]; - __half2* q_h = reinterpret_cast<__half2*>(&q); + T2* q_h = reinterpret_cast<T2*>(&q); if (rotate_every_two) { #pragma unroll for (int o = 0; o < 4; o++) { float inv_freq = (float)(((d3 << 2) + o) * 2) / (float)(rotary_dim << 3); inv_freq = 1.0 / powf(10000.0, inv_freq) * (float)seq_id; float q_data[2]; - q_data[0] = (float)q_h[o].x; - q_data[1] = (float)q_h[o].y; - q_h[o].x = (__half)(-1.0 * q_data[1] * sinf(inv_freq) + q_data[0] * cosf(inv_freq)); - q_h[o].y = (__half)(q_data[0] * sinf(inv_freq) + q_data[1] * cosf(inv_freq)); + q_data[0] = conversion::to<float>(q_h[o].x); + q_data[1] = conversion::to<float>(q_h[o].y); + q_h[o].x = conversion::to<T>(-1.0 * q_data[1] * sinf(inv_freq) + + q_data[0] * cosf(inv_freq)); + q_h[o].y = + conversion::to<T>(q_data[0] * sinf(inv_freq) + q_data[1] * cosf(inv_freq)); } } output_vec[d3] = q; @@ -187,16 +199,17 @@ void launch_bias_add_transform_0213(float* output, head_ext, max_out_tokens); } + template <typename T> -void launch_bias_add_transform_0213(T* outputs, - T* vals, - T* vals1, - const T* vals2, +void launch_bias_add_transform_0213(T* output, + T* k_cache, + T* v_cache, + const T* vals, const T* bias, int batch_size, int seq_length, unsigned seq_offset, - int seq_length1, + int all_tokens, int hidden_dim, int heads, int rotary_dim, @@ -204,25 +217,7 @@ void launch_bias_add_transform_0213(T* outputs, bool rotate_every_two, cudaStream_t stream, int trans_count, - int max_out_tokens); -template <> -void launch_bias_add_transform_0213<__half>(__half* output, - __half* k_cache, - __half* v_cache, - const __half* vals, - const __half* bias, - int batch_size, - int seq_length, - unsigned seq_offset, - int all_tokens, - int hidden_dim, - int heads, - int rotary_dim, - bool rotate_half, - bool rotate_every_two, - cudaStream_t stream, - int trans_count, - int max_out_tokens) + int max_out_tokens) { hidden_dim >>= 3; int head_ext = 1; // (hidden_dim - 1) / MAX_THREADS + 1; @@ -245,6 +240,44 @@ void launch_bias_add_transform_0213<__half>(__half* output, max_out_tokens); } +#ifdef BF16_AVAILABLE +template void
launch_bias_add_transform_0213(__nv_bfloat16* output, + __nv_bfloat16* k_cache, + __nv_bfloat16* v_cache, + const __nv_bfloat16* vals, + const __nv_bfloat16* bias, + int batch_size, + int seq_length, + unsigned seq_offset, + int all_tokens, + int hidden_dim, + int heads, + int rotary_dim, + bool rotate_half, + bool rotate_every_two, + cudaStream_t stream, + int trans_count, + int max_out_tokens); +#endif + +template void launch_bias_add_transform_0213(__half* output, + __half* k_cache, + __half* v_cache, + const __half* vals, + const __half* bias, + int batch_size, + int seq_length, + unsigned seq_offset, + int all_tokens, + int hidden_dim, + int heads, + int rotary_dim, + bool rotate_half, + bool rotate_every_two, + cudaStream_t stream, + int trans_count, + int max_out_tokens); + // Bias add __global__ void pad_add_transform_0213(float* output, @@ -257,17 +290,20 @@ __global__ void pad_add_transform_0213(float* output, { } -__global__ void pad_add_transform_0213(__half* output, - const __half* vals, +template +__global__ void pad_add_transform_0213(T* output, + const T* vals, int hidden_dim, int seq_length, int padded_seq_len, int heads, int padded_head_size) { + using T2 = + typename std::conditional::value, __half2, __nv_bfloat162>::type; float4 ZERO; - const __half2 zero_h = __float2half2_rn(0.f); - __half2* ZERO_h = reinterpret_cast<__half2*>(&ZERO); + const T2 zero_h = conversion::to(0.f); + T2* ZERO_h = reinterpret_cast(&ZERO); #pragma unroll for (int i = 0; i < 4; i++) ZERO_h[i] = zero_h; @@ -300,17 +336,6 @@ __global__ void pad_add_transform_0213(__half* output, output_vec[d3] = ZERO; } -template -void launch_pad_add_transform_0213(T* output, - const T* vals, - int batch_size, - int hidden_dim, - int seq_length, - int padded_seq_len, - int heads, - int padded_head_size, - cudaStream_t stream); - // [B S C*H] - > C * [B A S N] template <> void launch_pad_add_transform_0213(float* output, @@ -324,16 +349,17 @@ void launch_pad_add_transform_0213(float* output, cudaStream_t stream) { } -template <> -void launch_pad_add_transform_0213<__half>(__half* output, - const __half* vals, - int batch_size, - int hidden_dim, - int seq_length, - int padded_seq_len, - int heads, - int padded_head_size, - cudaStream_t stream) + +template +void launch_pad_add_transform_0213(T* output, + const T* vals, + int batch_size, + int hidden_dim, + int seq_length, + int padded_seq_len, + int heads, + int padded_head_size, + cudaStream_t stream) { hidden_dim >>= 3; dim3 block_dim((padded_head_size >> 3), heads, 2); @@ -342,6 +368,28 @@ void launch_pad_add_transform_0213<__half>(__half* output, output, vals, hidden_dim, seq_length, padded_seq_len, heads, padded_head_size >> 3); } +#ifdef BF16_AVAILABLE +template void launch_pad_add_transform_0213(__nv_bfloat16* output, + const __nv_bfloat16* vals, + int batch_size, + int hidden_dim, + int seq_length, + int padded_seq_len, + int heads, + int padded_head_size, + cudaStream_t stream); +#endif + +template void launch_pad_add_transform_0213(__half* output, + const __half* vals, + int batch_size, + int hidden_dim, + int seq_length, + int padded_seq_len, + int heads, + int padded_head_size, + cudaStream_t stream); + // Bias add template __global__ void bias_add_transform_0213(T* output, @@ -393,15 +441,17 @@ __global__ void bias_add_transform_0213(float* output, d2 * d2_out_stride + d3] = outputs; } -template <> -__global__ void bias_add_transform_0213<__half>(__half* output, - const __half* vals, - const __half* bias, - int hidden_dim, - int seq_length, - int heads, 
- int head_ext) +template +__global__ void bias_add_transform_0213(T* output, + const T* vals, + const T* bias, + int hidden_dim, + int seq_length, + int heads, + int head_ext) { + using T2 = + typename std::conditional::value, __half2, __nv_bfloat162>::type; int d0_stride = hidden_dim * seq_length; int d1_stride = hidden_dim; int d2_stride = hidden_dim / heads; @@ -417,9 +467,9 @@ __global__ void bias_add_transform_0213<__half>(__half* output, float4 vals_arr; float4 bias_arr; float4 output_arr; - __half2* vals_half = reinterpret_cast<__half2*>(&vals_arr); - __half2* bias_half = reinterpret_cast<__half2*>(&bias_arr); - __half2* output_half = reinterpret_cast<__half2*>(&output_arr); + T2* vals_half = reinterpret_cast(&vals_arr); + T2* bias_half = reinterpret_cast(&bias_arr); + T2* output_half = reinterpret_cast(&output_arr); const float4* vals_vec = reinterpret_cast(vals); const float4* bias_vec = reinterpret_cast(bias); @@ -448,13 +498,16 @@ __global__ void bias_add_transform_0213<__half>(__half* output, output_vec[d3] = output_arr; } -__global__ void bias_add_transform_0213_v2(__half* output, - const __half* vals, - const __half* bias, +template +__global__ void bias_add_transform_0213_v2(T* output, + const T* vals, + const T* bias, int hidden_dim, int seq_length, int heads) { + using T2 = + typename std::conditional::value, __half2, __nv_bfloat162>::type; __shared__ float4 in_data[3072]; int d0_stride = hidden_dim * seq_length; @@ -476,9 +529,9 @@ __global__ void bias_add_transform_0213_v2(__half* output, float4 vals_arr[1]; float4 bias_arr[1]; float4 output_arr[1]; - __half2* vals_half = reinterpret_cast<__half2*>(vals_arr); - __half2* bias_half = reinterpret_cast<__half2*>(bias_arr); - __half2* output_half = reinterpret_cast<__half2*>(output_arr); + T2* vals_half = reinterpret_cast(vals_arr); + T2* bias_half = reinterpret_cast(bias_arr); + T2* output_half = reinterpret_cast(output_arr); const float4* vals_vec = reinterpret_cast(vals); const float4* bias_vec = reinterpret_cast(bias); @@ -518,6 +571,22 @@ __global__ void bias_add_transform_0213_v2(__half* output, } } +template __global__ void bias_add_transform_0213_v2(__half* output, + const __half* vals, + const __half* bias, + int hidden_dim, + int seq_length, + int heads); + +#ifdef BF16_AVAILABLE +template __global__ void bias_add_transform_0213_v2(__nv_bfloat16* output, + const __nv_bfloat16* vals, + const __nv_bfloat16* bias, + int hidden_dim, + int seq_length, + int heads); +#endif + template __global__ void transform4d_0213(T* out, const T* in, @@ -559,13 +628,13 @@ __global__ void transform4d_0213(float* out, } } -template <> -__global__ void transform4d_0213<__half>(__half* out, - const __half* in, - int heads, - int seq_length, - int hidden_dim, - int head_ext) +template +__global__ void transform4d_0213(T* out, + const T* in, + int heads, + int seq_length, + int hidden_dim, + int head_ext) { int d0_stride = hidden_dim * (seq_length / head_ext); int d1_stride = hidden_dim; @@ -593,11 +662,8 @@ __global__ void transform4d_0213<__half>(__half* out, out_vec[d3] = in_vec[d3]; } -__global__ void transform4d_0213_v2(__half* out, - const __half* in, - int heads, - int seq_length, - int hidden_dim) +template +__global__ void transform4d_0213_v2(T* out, const T* in, int heads, int seq_length, int hidden_dim) { __shared__ float4 in_data[3072]; @@ -641,6 +707,20 @@ __global__ void transform4d_0213_v2(__half* out, } } +#ifdef BF16_AVAILABLE +template __global__ void transform4d_0213_v2(__nv_bfloat16* out, + const __nv_bfloat16* in, + 
int heads, + int seq_length, + int hidden_dim); +#endif + +template __global__ void transform4d_0213_v2(__half* out, + const __half* in, + int heads, + int seq_length, + int hidden_dim); + // 3 * [B A S N] - > [B S C*H] template <> void launch_transform4d_0213(float* out, @@ -659,20 +739,40 @@ void launch_transform4d_0213(float* out, <<>>(out, in, heads, seq_length, hidden_dim, 1); } -template <> -void launch_transform4d_0213<__half>(__half* out, - const __half* in, - int batch_size, - int heads, - int seq_length, - int hidden_dim, - cudaStream_t stream, - int trans_count) +template +void launch_transform4d_0213(T* out, + const T* in, + int batch_size, + int heads, + int seq_length, + int hidden_dim, + cudaStream_t stream, + int trans_count) { hidden_dim >>= 3; int head_ext = (hidden_dim - 1) / MAX_THREADS + 1; dim3 grid_dims(batch_size, trans_count, (seq_length * head_ext)); dim3 block_dims(hidden_dim / heads, (heads / head_ext)); - transform4d_0213<__half> - <<>>(out, in, heads, seq_length, hidden_dim, head_ext); + transform4d_0213<<>>( + out, in, heads, seq_length, hidden_dim, head_ext); } + +#ifdef BF16_AVAILABLE +template void launch_transform4d_0213(__nv_bfloat16* out, + const __nv_bfloat16* in, + int batch_size, + int heads, + int seq_length, + int hidden_dim, + cudaStream_t stream, + int trans_count); +#endif + +template void launch_transform4d_0213(__half* out, + const __half* in, + int batch_size, + int heads, + int seq_length, + int hidden_dim, + cudaStream_t stream, + int trans_count); diff --git a/csrc/transformer/inference/includes/context.h b/csrc/transformer/inference/includes/context.h deleted file mode 100644 index 21f0b3cfe07b3f5f519af7a1b3a4daa4f7b88424..0000000000000000000000000000000000000000 --- a/csrc/transformer/inference/includes/context.h +++ /dev/null @@ -1,184 +0,0 @@ -#pragma once - -#include -#include -#include -#include -#include -#include "cublas_v2.h" -#include "cuda.h" -#include "curand.h" - -#define WARP_SIZE 32 - -#define CUDA_CHECK(callstr) \ - { \ - cudaError_t error_code = callstr; \ - if (error_code != cudaSuccess) { \ - std::cerr << "CUDA error " << error_code << " at " << __FILE__ << ":" << __LINE__; \ - assert(0); \ - } \ - } - -#define CUDA_1D_KERNEL_LOOP(i, n) \ - for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); i += blockDim.x * gridDim.x) - -#define CUDA_2D_KERNEL_LOOP(i, n, j, m) \ - for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); i += blockDim.x * gridDim.x) \ - for (size_t j = blockIdx.y * blockDim.y + threadIdx.y; j < (m); j += blockDim.y * gridDim.y) - -#define DS_CUDA_NUM_THREADS 512 -#define DS_MAXIMUM_NUM_BLOCKS 262144 - -inline int DS_GET_BLOCKS(const int N) -{ - return std::max( - std::min((N + DS_CUDA_NUM_THREADS - 1) / DS_CUDA_NUM_THREADS, DS_MAXIMUM_NUM_BLOCKS), - // Use at least 1 block, since CUDA does not allow empty block - 1); -} - -class Context { -public: - Context() : _workspace(nullptr), _seed(42), _curr_offset(0), _stream(0) - { - curandCreateGenerator(&_gen, CURAND_RNG_PSEUDO_DEFAULT); - curandSetPseudoRandomGeneratorSeed(_gen, 123); - if (cublasCreate(&_cublasHandle) != CUBLAS_STATUS_SUCCESS) { - auto message = std::string("Fail to create cublas handle."); - std::cerr << message << std::endl; - throw std::runtime_error(message); - } -#ifndef __HIP_PLATFORM_HCC__ - cublasSetMathMode(_cublasHandle, CUBLAS_TENSOR_OP_MATH); - cudaEventCreate(&_comp1_event, (cudaEventDisableTiming | cudaEventBlockingSync)); - cudaEventCreate(&_comp2_event, (cudaEventDisableTiming | cudaEventBlockingSync)); - 
cudaEventCreate(&_comp_event, (cudaEventDisableTiming | cudaEventBlockingSync)); - cudaEventCreate(&_comm_event, (cudaEventDisableTiming | cudaEventBlockingSync)); -#else - cudaEventCreate(&_comp1_event); - cudaEventCreate(&_comp2_event); - cudaEventCreate(&_comp_event); - cudaEventCreate(&_comm_event); -#endif - } - - virtual ~Context() - { - cublasDestroy(_cublasHandle); - cudaFree(_workspace); - cudaEventDestroy(_comp1_event); - cudaEventDestroy(_comp2_event); - cudaEventDestroy(_comp_event); - cudaEventDestroy(_comm_event); - } - - static Context& Instance() - { - static Context _ctx; - return _ctx; - } - - void GenWorkSpace(size_t size) - { - if (!_workspace) { - assert(_workspace == nullptr); - cudaMalloc(&_workspace, size); - } else if (_workSpaceSize < size) { - cudaFree(_workspace); - cudaMalloc(&_workspace, size); - } - - _workSpaceSize = size; - } - - cudaEvent_t GetCompEvent(int id) { return id == 1 ? _comp1_event : _comp2_event; } - - size_t get_workspace_size() const { return _workSpaceSize; } - void* GetWorkSpace() { return _workspace; } - - inline unsigned new_token(unsigned layer_id) - { - if (layer_id == 0) _token_length++; - return _token_length; - } - - inline void reset_tokens(unsigned initial_tokens = 0) - { - _num_tokens = initial_tokens; - } //_token_length = 0; } - - inline unsigned current_tokens() const { return _num_tokens; } - - inline void advance_tokens() { _num_tokens++; } - - curandGenerator_t& GetRandGenerator() { return _gen; } - - cudaStream_t GetCommStream(bool async_op = false) - { - if (!_comm_stream) - _comm_stream = async_op ? at::cuda::getStreamFromPool(true) - : at::cuda::getCurrentCUDAStream(); - return _comm_stream; - } - cudaStream_t GetCurrentStream(bool other_stream = false) - { - // get current pytorch stream. 
- if (other_stream) { - if (!_stream) _stream = at::cuda::getStreamFromPool(true); - return _stream; - } - cudaStream_t stream = at::cuda::getCurrentCUDAStream(); - return stream; - } - - cublasHandle_t GetCublasHandle() { return _cublasHandle; } - - std::pair IncrementOffset(uint64_t offset_inc) - { - uint64_t offset = _curr_offset; - _curr_offset += offset_inc; - return std::pair(_seed, offset); - } - - void SetSeed(uint64_t new_seed) { _seed = new_seed; } - - const std::vector>& GetGemmAlgos() const { return _gemm_algos; } - - inline void SynchComp() - { - cudaEventRecord(_comp_event, _comp_stream); - cudaStreamWaitEvent(_comm_stream, _comp_event, 0); - } - inline void SynchComm() - { - cudaEventRecord(_comm_event, _comm_stream); - cudaStreamWaitEvent(_comp_stream, _comm_event, 0); - } - -private: - curandGenerator_t _gen; - cublasHandle_t _cublasHandle; - - cudaEvent_t _comp_event; - cudaEvent_t _comm_event; - - void* _workspace; - uint64_t _seed; - uint64_t _curr_offset; - size_t _workSpaceSize; - - cudaEvent_t _comp1_event; - cudaEvent_t _comp2_event; - - cudaStream_t _stream; - - unsigned _token_length; - unsigned _num_tokens; - std::vector> _gemm_algos; - - cudaStream_t _comp_stream; - cudaStream_t _comm_stream; - - std::unordered_map _world_sizes; -}; diff --git a/csrc/transformer/inference/includes/cublas_wrappers.h b/csrc/transformer/inference/includes/cublas_wrappers.h deleted file mode 100644 index 75d18a40fc8e468c3ddcc5b1ae8bbdfc421c7072..0000000000000000000000000000000000000000 --- a/csrc/transformer/inference/includes/cublas_wrappers.h +++ /dev/null @@ -1,413 +0,0 @@ -#pragma once - -#include -#include -#include -#include -#include -#ifndef __HIP_PLATFORM_HCC__ -#include -#endif -#include - -#ifdef __HIP_PLATFORM_HCC__ -int cublas_gemm_ex(rocblas_handle handle, - rocblas_operation transa, - rocblas_operation transb, - int m, - int n, - int k, - const float* alpha, - const float* beta, - const float* A, - const float* B, - float* C, - rocblas_gemm_algo algo) -#else -int cublas_gemm_ex(cublasHandle_t handle, - cublasOperation_t transa, - cublasOperation_t transb, - int m, - int n, - int k, - const float* alpha, - const float* beta, - const float* A, - const float* B, - float* C, - cublasGemmAlgo_t algo) -#endif -{ -#ifdef __HIP_PLATFORM_HCC__ - rocblas_status status = rocblas_gemm_ex(handle, - transa, - transb, - m, - n, - k, - (const void*)alpha, - (const void*)A, - rocblas_datatype_f32_r, - (transa == rocblas_operation_none) ? m : k, - (const void*)B, - rocblas_datatype_f32_r, - (transb == rocblas_operation_none) ? k : n, - (const void*)beta, - C, - rocblas_datatype_f32_r, - m, - C, - rocblas_datatype_f32_r, - m, - rocblas_datatype_f32_r, - algo, - 0, - 0); -#else - cublasStatus_t status = cublasGemmEx(handle, - transa, - transb, - m, - n, - k, - (const void*)alpha, - (const void*)A, - CUDA_R_32F, - (transa == CUBLAS_OP_N) ? m : k, - (const void*)B, - CUDA_R_32F, - (transb == CUBLAS_OP_N) ? k : n, - (const void*)beta, - C, - CUDA_R_32F, - m, - CUDA_R_32F, - algo); -#endif - -#ifdef __HIP_PLATFORM_HCC__ - if (status != rocblas_status_success) { -#else - if (status != CUBLAS_STATUS_SUCCESS) { -#endif - fprintf(stderr, - "!!!! kernel execution error. 
(m: %d, n: %d, k: %d, error: %d) \n", - m, - n, - k, - (int)status); - return EXIT_FAILURE; - } - return 0; -} - -#ifdef __HIP_PLATFORM_HCC__ -int cublas_gemm_ex(rocblas_handle handle, - rocblas_operation transa, - rocblas_operation transb, - int m, - int n, - int k, - const float* alpha, - const float* beta, - const __half* A, - const __half* B, - __half* C, - rocblas_gemm_algo algo) -#else -int cublas_gemm_ex(cublasHandle_t handle, - cublasOperation_t transa, - cublasOperation_t transb, - int m, - int n, - int k, - const float* alpha, - const float* beta, - const __half* A, - const __half* B, - __half* C, - cublasGemmAlgo_t algo) -#endif -{ -#ifdef __HIP_PLATFORM_HCC__ - rocblas_status status = rocblas_gemm_ex(handle, - transa, - transb, - m, - n, - k, - (const void*)alpha, - (const void*)A, - rocblas_datatype_f16_r, - (transa == rocblas_operation_none) ? m : k, - (const void*)B, - rocblas_datatype_f16_r, - (transb == rocblas_operation_none) ? k : n, - (const void*)beta, - (void*)C, - rocblas_datatype_f16_r, - m, - (void*)C, - rocblas_datatype_f16_r, - m, - rocblas_datatype_f32_r, - algo, - 0, - 0); -#else - cublasStatus_t status = cublasGemmEx(handle, - transa, - transb, - m, - n, - k, - (const void*)alpha, - (const void*)A, - CUDA_R_16F, - (transa == CUBLAS_OP_N) ? m : k, - (const void*)B, - CUDA_R_16F, - (transb == CUBLAS_OP_N) ? k : n, - (const void*)beta, - (void*)C, - CUDA_R_16F, - m, - CUDA_R_32F, - algo); -#endif - -#ifdef __HIP_PLATFORM_HCC__ - if (status != rocblas_status_success) { -#else - if (status != CUBLAS_STATUS_SUCCESS) { -#endif - fprintf(stderr, - "!!!! kernel execution error. (m: %d, n: %d, k: %d, error: %d) \n", - m, - n, - k, - (int)status); - return EXIT_FAILURE; - } - return 0; -} - -#ifdef __HIP_PLATFORM_HCC__ -int cublas_strided_batched_gemm(rocblas_handle handle, - int m, - int n, - int k, - const float* alpha, - const float* beta, - const float* A, - const float* B, - float* C, - rocblas_operation op_A, - rocblas_operation op_B, - int stride_A, - int stride_B, - int stride_C, - int batch, - rocblas_gemm_algo algo) -#else -int cublas_strided_batched_gemm(cublasHandle_t handle, - int m, - int n, - int k, - const float* alpha, - const float* beta, - const float* A, - const float* B, - float* C, - cublasOperation_t op_A, - cublasOperation_t op_B, - int stride_A, - int stride_B, - int stride_C, - int batch, - cublasGemmAlgo_t algo) -#endif -{ -#ifdef __HIP_PLATFORM_HCC__ - rocblas_status status = - rocblas_gemm_strided_batched_ex(handle, - op_A, - op_B, - m, - n, - k, - alpha, - A, - rocblas_datatype_f32_r, - (op_A == rocblas_operation_none) ? m : k, - stride_A, - B, - rocblas_datatype_f32_r, - (op_B == rocblas_operation_none) ? k : n, - stride_B, - beta, - C, - rocblas_datatype_f32_r, - m, - stride_C, - C, - rocblas_datatype_f32_r, - m, - stride_C, - batch, - rocblas_datatype_f32_r, - algo, - 0, - 0); -#else - cublasStatus_t status = cublasGemmStridedBatchedEx(handle, - op_A, - op_B, - m, - n, - k, - alpha, - A, - CUDA_R_32F, - (op_A == CUBLAS_OP_N) ? m : k, - stride_A, - B, - CUDA_R_32F, - (op_B == CUBLAS_OP_N) ? k : n, - stride_B, - beta, - C, - CUDA_R_32F, - m, - stride_C, - batch, - CUDA_R_32F, - algo); -#endif - -#ifdef __HIP_PLATFORM_HCC__ - if (status != rocblas_status_success) { -#else - if (status != CUBLAS_STATUS_SUCCESS) { -#endif - fprintf(stderr, - "!!!! kernel execution error. 
(batch: %d, m: %d, n: %d, k: %d, error: %d) \n", - batch, - m, - n, - k, - (int)status); - return EXIT_FAILURE; - } - return 0; -} - -#ifdef __HIP_PLATFORM_HCC__ -int cublas_strided_batched_gemm(rocblas_handle handle, - int m, - int n, - int k, - const float* alpha, - const float* beta, - const __half* A, - const __half* B, - __half* C, - rocblas_operation op_A, - rocblas_operation op_B, - int stride_A, - int stride_B, - int stride_C, - int batch, - rocblas_gemm_algo algo) -#else -int cublas_strided_batched_gemm(cublasHandle_t handle, - int m, - int n, - int k, - const float* alpha, - const float* beta, - const __half* A, - const __half* B, - __half* C, - cublasOperation_t op_A, - cublasOperation_t op_B, - int stride_A, - int stride_B, - int stride_C, - int batch, - cublasGemmAlgo_t algo) -#endif -{ -#ifdef __HIP_PLATFORM_HCC__ - rocblas_status status = - rocblas_gemm_strided_batched_ex(handle, - op_A, - op_B, - m, - n, - k, - alpha, - A, - rocblas_datatype_f16_r, - (op_A == rocblas_operation_none) ? m : k, - stride_A, - B, - rocblas_datatype_f16_r, - (op_B == rocblas_operation_none) ? k : n, - stride_B, - beta, - C, - rocblas_datatype_f16_r, - m, - stride_C, - C, - rocblas_datatype_f16_r, - m, - stride_C, - batch, - rocblas_datatype_f32_r, - algo, - 0, - 0); -#else - cublasStatus_t status = cublasGemmStridedBatchedEx(handle, - op_A, - op_B, - m, - n, - k, - alpha, - A, - CUDA_R_16F, - (op_A == CUBLAS_OP_N) ? m : k, - stride_A, - B, - CUDA_R_16F, - (op_B == CUBLAS_OP_N) ? k : n, - stride_B, - beta, - C, - CUDA_R_16F, - m, - stride_C, - batch, - CUDA_R_32F, - algo); -#endif - -#ifdef __HIP_PLATFORM_HCC__ - if (status != rocblas_status_success) { -#else - if (status != CUBLAS_STATUS_SUCCESS) { -#endif - fprintf(stderr, - "!!!! kernel execution error. 
(m: %d, n: %d, k: %d, error: %d) \n", - m, - n, - k, - (int)status); - return EXIT_FAILURE; - } - - return 0; -} diff --git a/csrc/transformer/inference/includes/custom_cuda_layers.h b/csrc/transformer/inference/includes/custom_cuda_layers.h deleted file mode 100644 index 06b4340061c98c65b4b301c7349d2da03185f715..0000000000000000000000000000000000000000 --- a/csrc/transformer/inference/includes/custom_cuda_layers.h +++ /dev/null @@ -1,124 +0,0 @@ -#pragma once - -#ifdef __HIP_PLATFORM_HCC__ -#define HALF_PRECISION_AVAILABLE = 1 -#include -#else -#if __CUDA_ARCH__ >= 700 -#define HALF_PRECISION_AVAILABLE = 1 -#endif -#include -#endif - -#include -#include -#include -#include -#include -#include - -#define MAX_WARP_NUM 32 -#define WARP_SIZE 32 -#define SMs 80 - -#define MAX_REGISTERS 256 -template -void launch_attn_softmax_v2(T* vals, - T* mask, - bool triangular, - bool recompute, - bool local_attention, - int window_size, - int batch_size, - int heads, - int num_seq, - int sequence_length, - float scale, - cudaStream_t stream); - -// Fused bias add with gelu activation -template -void launch_bias_gelu(T* input, - const T* bias, - int intermediate_size, - int batch_size, - cudaStream_t stream); -template -void launch_bias_add(T* input, const T* bias, int hidden_size, int batch_size, cudaStream_t stream); - -template -void launch_bias_residual(T* input, - T* output, - T* attn, - T* bias, - T* attn_bias, - int batch, - int hidden_dim, - int mp_size, - cudaStream_t stream); - -template -void launch_layer_norm(T* out, - T* vals, - const T* gamma, - const T* beta, - float epsilon, - int batch_size, - int hidden_dim, - cudaStream_t stream); - -template -void launch_residual_layer_norm(T* norm, - T* res_add, - T* vals, - T* residual, - const T* bias, - const T* gamma, - const T* beta, - float epsilon, - int batch_size, - int hidden_dim, - bool preLN, - bool mlp_after_attn, - cudaStream_t stream); -template -void launch_dequantize(T* output, - const int8_t* input, - const float* qscale, - unsigned output_size, - unsigned hidden_dim, - unsigned groups, - unsigned merge_count, - cudaStream_t stream); - -template -void launch_gptj_residual_add(T* input, - T* output, - T* attn, - T* bias, - T* attn_bias, - int batch, - int head_size, - int mp_size, - cudaStream_t stream); - -template -void launch_apply_rotary_pos_emb(T* mixed_query, - T* key_layer, - unsigned head_size, - unsigned seq_len, - unsigned rotary_dim, - unsigned offset, - unsigned num_heads, - unsigned batch, - bool rotate_half, - bool rotate_every_two, - cudaStream_t stream); - -template -void launch_moe_res_matmul(T* residual, - T* coef, - T* mlp_out, - int seq_len, - int hidden_dim, - cudaStream_t stream); diff --git a/csrc/transformer/inference/includes/inference_context.h b/csrc/transformer/inference/includes/inference_context.h index b3851ca43b725dda388de7adc86048282904a176..f7bbcad91e2afb1c5439f0a8b32dbbcf06474df0 100644 --- a/csrc/transformer/inference/includes/inference_context.h +++ b/csrc/transformer/inference/includes/inference_context.h @@ -1,6 +1,7 @@ -/* -Copyright 2022 The Microsoft DeepSpeed Team -*/ +// Copyright (c) Microsoft Corporation. 
+// SPDX-License-Identifier: Apache-2.0 + +// DeepSpeed Team #pragma once @@ -45,17 +46,20 @@ inline int DS_GET_BLOCKS(const int N) 1); } -class Context { +class InferenceContext { public: - Context() + InferenceContext() : _workspace(nullptr), _seed(42), _curr_offset(0), _stream(0), _free_memory_size(0), _num_tokens(1), - _attention_unfused_workspace_offset(0) + _attention_unfused_workspace_offset(0), + _workSpaceSize(0) { + _workSpaceSize = 0; + _workspace = 0; if (cublasCreate(&_cublasHandle) != CUBLAS_STATUS_SUCCESS) { auto message = std::string("Fail to create cublas handle."); std::cerr << message << std::endl; @@ -70,7 +74,7 @@ public: cudaEventCreate(&_comm_event); } - virtual ~Context() + virtual ~InferenceContext() { cublasDestroy(_cublasHandle); cudaFree(_workspace); @@ -80,9 +84,9 @@ public: cudaEventDestroy(_comm_event); } - static Context& Instance() + static InferenceContext& Instance() { - static Context _ctx; + static InferenceContext _ctx; return _ctx; } @@ -95,7 +99,8 @@ public: const bool& external_cache, const size_t& elem_size, const unsigned& rank, - unsigned max_out_tokens) + unsigned max_out_tokens, + unsigned min_out_tokens) { size_t total_size; if (!_free_memory_size) { cudaMemGetInfo(&_free_memory_size, &total_size); } @@ -106,9 +111,9 @@ public: const int padded_head_size = head_size <= 32 ? 32 : (head_size <= 64 ? 64 : 128); const int effective_head_size = (head_size > 128) ? head_size : padded_head_size; - size_t activation_size = 16 * (num_heads * effective_head_size) * batch_size; + size_t activation_size = 10 * (num_heads * effective_head_size) * batch_size; // Other sequence length dimension is added when the final workSpaceSize is calculated - size_t temp_size = batch_size * num_heads * max_out_tokens * 2; + size_t temp_size = batch_size * (num_heads / mp_size) * max_out_tokens; size_t cache_size = num_layers * batch_size * ((num_heads * effective_head_size) / mp_size) * 2; size_t minimal_requirements = @@ -128,25 +133,37 @@ public: : (activation_size + temp_size + cache_size))) * _max_seq_len * elem_size; temp_size *= _max_seq_len * elem_size; - if (rank == 0 && !_workspace) + + if (_max_seq_len < min_out_tokens) { + printf( + "Allocatable workspace available (%d tokens) is less than minimum requested " + "workspace (%d tokens)\n", + _max_seq_len, + min_out_tokens); + throw std::runtime_error("Workspace can't be allocated, not enough memory"); + } + + if (!_workspace) { + assert(_workspace == nullptr); + cudaMalloc(&_workspace, workSpaceSize); + } else if (_workSpaceSize < workSpaceSize) { + cudaFree(_workspace); + cudaMalloc(&_workspace, workSpaceSize); + } + if (rank == 0 && (!_workspace || _workSpaceSize < workSpaceSize)) printf( "------------------------------------------------------\n" "Free memory : %f (GigaBytes) \n" "Total memory: %f (GigaBytes) \n" "Requested memory: %f (GigaBytes) \n" "Setting maximum total tokens (input + output) to %lu \n" + "WorkSpace: %p \n" "------------------------------------------------------\n", (float)_free_memory_size / GIGABYTE, (float)total_size / GIGABYTE, (float)workSpaceSize / GIGABYTE, - _max_seq_len); - if (!_workspace) { - assert(_workspace == nullptr); - cudaMalloc(&_workspace, workSpaceSize); - } else if (_workSpaceSize < workSpaceSize) { - cudaFree(_workspace); - cudaMalloc(&_workspace, workSpaceSize); - } + _max_seq_len, + _workspace); if (!_workspace) { printf("Requested:\t%lu\nFree:\t%lu\nTotal:\t%lu\n", @@ -202,6 +219,17 @@ public: return stream; } + void release_workspace() + { + 
cudaFree(_workspace); + _workspace = nullptr; + } + bool retake_workspace() + { + if (_workspace != nullptr || _workSpaceSize == 0) return true; + cudaMalloc(&_workspace, _workSpaceSize); + return _workspace != nullptr; + } cublasHandle_t GetCublasHandle() { return _cublasHandle; } std::pair IncrementOffset(uint64_t offset_inc) diff --git a/csrc/transformer/inference/includes/inference_cublas_wrappers.h b/csrc/transformer/inference/includes/inference_cublas_wrappers.h index 9e55cc1c7423ade02b681ec900e107aeb7a50a8c..38a0f7ab66d20e1f0753d3c27a61d88e3b7b6dbd 100644 --- a/csrc/transformer/inference/includes/inference_cublas_wrappers.h +++ b/csrc/transformer/inference/includes/inference_cublas_wrappers.h @@ -1,12 +1,14 @@ -/* -Copyright 2022 The Microsoft DeepSpeed Team -*/ +// Copyright (c) Microsoft Corporation. +// SPDX-License-Identifier: Apache-2.0 + +// DeepSpeed Team #pragma once #include #include #include +#include #include #include #ifndef __HIP_PLATFORM_HCC__ @@ -105,6 +107,7 @@ int cublas_gemm_ex(cublasHandle_t handle, return 0; } +template #ifdef __HIP_PLATFORM_HCC__ int cublas_gemm_ex(rocblas_handle handle, rocblas_operation transa, @@ -114,9 +117,9 @@ int cublas_gemm_ex(rocblas_handle handle, int k, const float* alpha, const float* beta, - const __half* A, - const __half* B, - __half* C, + const T* A, + const T* B, + T* C, rocblas_gemm_algo algo) #else int cublas_gemm_ex(cublasHandle_t handle, @@ -127,13 +130,15 @@ int cublas_gemm_ex(cublasHandle_t handle, int k, const float* alpha, const float* beta, - const __half* A, - const __half* B, - __half* C, + const T* A, + const T* B, + T* C, cublasGemmAlgo_t algo) #endif { #ifdef __HIP_PLATFORM_HCC__ + constexpr auto rocblas_dtype_16 = std::is_same::value ? rocblas_datatype_f16_r + : rocblas_datatype_bf16_r; rocblas_status status = rocblas_gemm_ex(handle, transa, transb, @@ -142,23 +147,24 @@ int cublas_gemm_ex(cublasHandle_t handle, k, (const void*)alpha, (const void*)A, - rocblas_datatype_f16_r, + rocblas_dtype_16, (transa == rocblas_operation_none) ? m : k, (const void*)B, - rocblas_datatype_f16_r, + rocblas_dtype_16, (transb == rocblas_operation_none) ? k : n, (const void*)beta, (void*)C, - rocblas_datatype_f16_r, + rocblas_dtype_16, m, (void*)C, - rocblas_datatype_f16_r, + rocblas_dtype_16, m, rocblas_datatype_f32_r, algo, 0, 0); #else + constexpr auto cublas_dtype_16 = std::is_same::value ? CUDA_R_16F : CUDA_R_16BF; cublasStatus_t status = cublasGemmEx(handle, transa, transb, @@ -167,14 +173,14 @@ int cublas_gemm_ex(cublasHandle_t handle, k, (const void*)alpha, (const void*)A, - CUDA_R_16F, + cublas_dtype_16, (transa == CUBLAS_OP_N) ? m : k, (const void*)B, - CUDA_R_16F, + cublas_dtype_16, (transb == CUBLAS_OP_N) ? 
k : n, (const void*)beta, (void*)C, - CUDA_R_16F, + cublas_dtype_16, m, CUDA_R_32F, algo); @@ -306,6 +312,7 @@ int cublas_strided_batched_gemm(cublasHandle_t handle, return 0; } +template #ifdef __HIP_PLATFORM_HCC__ int cublas_strided_batched_gemm(rocblas_handle handle, int m, @@ -313,9 +320,9 @@ int cublas_strided_batched_gemm(rocblas_handle handle, int k, const float* alpha, const float* beta, - const __half* A, - const __half* B, - __half* C, + const T* A, + const T* B, + T* C, rocblas_operation op_A, rocblas_operation op_B, int stride_A, @@ -330,9 +337,9 @@ int cublas_strided_batched_gemm(cublasHandle_t handle, int k, const float* alpha, const float* beta, - const __half* A, - const __half* B, - __half* C, + const T* A, + const T* B, + T* C, cublasOperation_t op_A, cublasOperation_t op_B, int stride_A, @@ -343,6 +350,8 @@ int cublas_strided_batched_gemm(cublasHandle_t handle, #endif { #ifdef __HIP_PLATFORM_HCC__ + constexpr auto rocblas_dtype_16 = std::is_same::value ? rocblas_datatype_f16_r + : rocblas_datatype_bf16_r; rocblas_status status = rocblas_gemm_strided_batched_ex(handle, op_A, @@ -352,20 +361,20 @@ int cublas_strided_batched_gemm(cublasHandle_t handle, k, alpha, A, - rocblas_datatype_f16_r, + rocblas_dtype_16, (op_A == rocblas_operation_none) ? m : k, stride_A, B, - rocblas_datatype_f16_r, + rocblas_dtype_16, (op_B == rocblas_operation_none) ? k : n, stride_B, beta, C, - rocblas_datatype_f16_r, + rocblas_dtype_16, m, stride_C, C, - rocblas_datatype_f16_r, + rocblas_dtype_16, m, stride_C, batch, @@ -374,6 +383,7 @@ int cublas_strided_batched_gemm(cublasHandle_t handle, 0, 0); #else + constexpr auto cublas_dtype_16 = std::is_same::value ? CUDA_R_16F : CUDA_R_16BF; cublasStatus_t status = cublasGemmStridedBatchedEx(handle, op_A, op_B, @@ -382,16 +392,16 @@ int cublas_strided_batched_gemm(cublasHandle_t handle, k, alpha, A, - CUDA_R_16F, + cublas_dtype_16, (op_A == CUBLAS_OP_N) ? m : k, stride_A, B, - CUDA_R_16F, + cublas_dtype_16, (op_B == CUBLAS_OP_N) ? k : n, stride_B, beta, C, - CUDA_R_16F, + cublas_dtype_16, m, stride_C, batch, diff --git a/csrc/transformer/inference/includes/inference_cuda_layers.h b/csrc/transformer/inference/includes/inference_cuda_layers.h index 211f56bc8a76b68e2f58a005a2523a7d2d1dfa81..e7e835a2dddf68f5b81b9edb33a639a92d54c2a0 100644 --- a/csrc/transformer/inference/includes/inference_cuda_layers.h +++ b/csrc/transformer/inference/includes/inference_cuda_layers.h @@ -1,12 +1,16 @@ -/* -Copyright 2022 The Microsoft DeepSpeed Team -*/ +// Copyright (c) Microsoft Corporation. +// SPDX-License-Identifier: Apache-2.0 + +// DeepSpeed Team #pragma once #include "ds_kernel_utils.h" #include +#ifdef BF16_AVAILABLE +#include +#endif #include #include #include diff --git a/csrc/transformer/normalize_kernels.cu b/csrc/transformer/normalize_kernels.cu index 84f27b2c43af9eeb44b2fa353411d62f74e6830b..8b9dbaea6c86d98ed7e431073684b89b8d1499ee 100644 --- a/csrc/transformer/normalize_kernels.cu +++ b/csrc/transformer/normalize_kernels.cu @@ -1,6 +1,7 @@ -/* -Copyright The Microsoft DeepSpeed Team -*/ +// Copyright (c) Microsoft Corporation. 
+// SPDX-License-Identifier: Apache-2.0 + +// DeepSpeed Team #include "custom_cuda_layers.h" diff --git a/csrc/transformer/softmax_kernels.cu b/csrc/transformer/softmax_kernels.cu index bad814b8629d236753e134731e461cfaca1d88ff..d9efed286462f1be971922c2345efb8d2049e227 100644 --- a/csrc/transformer/softmax_kernels.cu +++ b/csrc/transformer/softmax_kernels.cu @@ -1,6 +1,7 @@ -/* -Copyright The Microsoft DeepSpeed Team -*/ +// Copyright (c) Microsoft Corporation. +// SPDX-License-Identifier: Apache-2.0 + +// DeepSpeed Team #include #include "custom_cuda_layers.h" diff --git a/csrc/transformer/transform_kernels.cu b/csrc/transformer/transform_kernels.cu old mode 100755 new mode 100644 index 37a6e79bb54556ffb3b5bb81c881e6ffe44f1438..dbecc31536c6c9355c4a3e243f2a0219bc476e6e --- a/csrc/transformer/transform_kernels.cu +++ b/csrc/transformer/transform_kernels.cu @@ -1,6 +1,7 @@ -/* -Copyright The Microsoft DeepSpeed Team -*/ +// Copyright (c) Microsoft Corporation. +// SPDX-License-Identifier: Apache-2.0 + +// DeepSpeed Team #include "custom_cuda_layers.h" diff --git a/csrc/utils/flatten_unflatten.cpp b/csrc/utils/flatten_unflatten.cpp index 7d16c5c00a5f285a5320aa09bdf74de4aa5d2611..ab95ee1914647c85e3e0ad5ebf625877d57a5ad6 100644 --- a/csrc/utils/flatten_unflatten.cpp +++ b/csrc/utils/flatten_unflatten.cpp @@ -1,7 +1,11 @@ +// Copyright (c) Microsoft Corporation. +// SPDX-License-Identifier: Apache-2.0 + +// DeepSpeed Team + /* - Copyright 2020 The Microsoft DeepSpeed Team - Copyright NVIDIA/apex - This file is adapted from fused adam in NVIDIA/apex, commit a109f85 +Copyright NVIDIA/apex +This file is adapted from fused adam in NVIDIA/apex, commit a109f85 */ #include diff --git a/deepspeed.git/HEAD b/deepspeed.git/HEAD deleted file mode 100644 index 402fa7c3ef324eb73bfee45604a88c85802e3980..0000000000000000000000000000000000000000 --- a/deepspeed.git/HEAD +++ /dev/null @@ -1 +0,0 @@ -ref: refs/heads/ds-v0.8.2-rocm diff --git a/deepspeed.git/config b/deepspeed.git/config deleted file mode 100644 index b30034b6e5d26d425d1eb023f6d8fe1ec60de31c..0000000000000000000000000000000000000000 --- a/deepspeed.git/config +++ /dev/null @@ -1,8 +0,0 @@ -[core] - repositoryformatversion = 0 - filemode = true - bare = true -[remote "origin"] - url = http://developer.hpccube.com/codes/aicomponent/deepspeed.git - fetch = +refs/*:refs/* - mirror = true diff --git a/deepspeed.git/description b/deepspeed.git/description deleted file mode 100644 index 498b267a8c7812490d6479839c5577eaaec79d62..0000000000000000000000000000000000000000 --- a/deepspeed.git/description +++ /dev/null @@ -1 +0,0 @@ -Unnamed repository; edit this file 'description' to name the repository. diff --git a/deepspeed.git/hooks/applypatch-msg.sample b/deepspeed.git/hooks/applypatch-msg.sample deleted file mode 100755 index 8b2a2fe84feaeaba56953d6d4d0d649b3cf755eb..0000000000000000000000000000000000000000 --- a/deepspeed.git/hooks/applypatch-msg.sample +++ /dev/null @@ -1,15 +0,0 @@ -#!/bin/sh -# -# An example hook script to check the commit log message taken by -# applypatch from an e-mail message. -# -# The hook should exit with non-zero status after issuing an -# appropriate message if it wants to stop the commit. The hook is -# allowed to edit the commit message file. -# -# To enable this hook, rename this file to "applypatch-msg". - -. 
git-sh-setup -test -x "$GIT_DIR/hooks/commit-msg" && - exec "$GIT_DIR/hooks/commit-msg" ${1+"$@"} -: diff --git a/deepspeed.git/hooks/commit-msg.sample b/deepspeed.git/hooks/commit-msg.sample deleted file mode 100755 index b58d1184a9d43a39c0d95f32453efc78581877d6..0000000000000000000000000000000000000000 --- a/deepspeed.git/hooks/commit-msg.sample +++ /dev/null @@ -1,24 +0,0 @@ -#!/bin/sh -# -# An example hook script to check the commit log message. -# Called by "git commit" with one argument, the name of the file -# that has the commit message. The hook should exit with non-zero -# status after issuing an appropriate message if it wants to stop the -# commit. The hook is allowed to edit the commit message file. -# -# To enable this hook, rename this file to "commit-msg". - -# Uncomment the below to add a Signed-off-by line to the message. -# Doing this in a hook is a bad idea in general, but the prepare-commit-msg -# hook is more suited to it. -# -# SOB=$(git var GIT_AUTHOR_IDENT | sed -n 's/^\(.*>\).*$/Signed-off-by: \1/p') -# grep -qs "^$SOB" "$1" || echo "$SOB" >> "$1" - -# This example catches duplicate Signed-off-by lines. - -test "" = "$(grep '^Signed-off-by: ' "$1" | - sort | uniq -c | sed -e '/^[ ]*1[ ]/d')" || { - echo >&2 Duplicate Signed-off-by lines. - exit 1 -} diff --git a/deepspeed.git/hooks/post-update.sample b/deepspeed.git/hooks/post-update.sample deleted file mode 100755 index ec17ec1939b7c3e86b7cb6c0c4de6b0818a7e75e..0000000000000000000000000000000000000000 --- a/deepspeed.git/hooks/post-update.sample +++ /dev/null @@ -1,8 +0,0 @@ -#!/bin/sh -# -# An example hook script to prepare a packed repository for use over -# dumb transports. -# -# To enable this hook, rename this file to "post-update". - -exec git update-server-info diff --git a/deepspeed.git/hooks/pre-applypatch.sample b/deepspeed.git/hooks/pre-applypatch.sample deleted file mode 100755 index b1f187c2e9acaba942639bca90a63c5b4f058967..0000000000000000000000000000000000000000 --- a/deepspeed.git/hooks/pre-applypatch.sample +++ /dev/null @@ -1,14 +0,0 @@ -#!/bin/sh -# -# An example hook script to verify what is about to be committed -# by applypatch from an e-mail message. -# -# The hook should exit with non-zero status after issuing an -# appropriate message if it wants to stop the commit. -# -# To enable this hook, rename this file to "pre-applypatch". - -. git-sh-setup -test -x "$GIT_DIR/hooks/pre-commit" && - exec "$GIT_DIR/hooks/pre-commit" ${1+"$@"} -: diff --git a/deepspeed.git/hooks/pre-commit.sample b/deepspeed.git/hooks/pre-commit.sample deleted file mode 100755 index 18c48297652174ffae65b877dd131711a5746181..0000000000000000000000000000000000000000 --- a/deepspeed.git/hooks/pre-commit.sample +++ /dev/null @@ -1,50 +0,0 @@ -#!/bin/sh -# -# An example hook script to verify what is about to be committed. -# Called by "git commit" with no arguments. The hook should -# exit with non-zero status after issuing an appropriate message if -# it wants to stop the commit. -# -# To enable this hook, rename this file to "pre-commit". - -if git rev-parse --verify HEAD >/dev/null 2>&1 -then - against=HEAD -else - # Initial commit: diff against an empty tree object - against=4b825dc642cb6eb9a060e54bf8d69288fbee4904 -fi - -# If you want to allow non-ascii filenames set this variable to true. -allownonascii=$(git config hooks.allownonascii) - -# Redirect output to stderr. -exec 1>&2 - -# Cross platform projects tend to avoid non-ascii filenames; prevent -# them from being added to the repository. 
We exploit the fact that the -# printable range starts at the space character and ends with tilde. -if [ "$allownonascii" != "true" ] && - # Note that the use of brackets around a tr range is ok here, (it's - # even required, for portability to Solaris 10's /usr/bin/tr), since - # the square bracket bytes happen to fall in the designated range. - test $(git diff --cached --name-only --diff-filter=A -z $against | - LC_ALL=C tr -d '[ -~]\0' | wc -c) != 0 -then - echo "Error: Attempt to add a non-ascii file name." - echo - echo "This can cause problems if you want to work" - echo "with people on other platforms." - echo - echo "To be portable it is advisable to rename the file ..." - echo - echo "If you know what you are doing you can disable this" - echo "check using:" - echo - echo " git config hooks.allownonascii true" - echo - exit 1 -fi - -# If there are whitespace errors, print the offending file names and fail. -exec git diff-index --check --cached $against -- diff --git a/deepspeed.git/hooks/pre-push.sample b/deepspeed.git/hooks/pre-push.sample deleted file mode 100644 index 15ab6d8e7ea05ca54f0b755828e166fafe9bc28c..0000000000000000000000000000000000000000 --- a/deepspeed.git/hooks/pre-push.sample +++ /dev/null @@ -1,53 +0,0 @@ -#!/bin/sh - -# An example hook script to verify what is about to be pushed. Called by "git -# push" after it has checked the remote status, but before anything has been -# pushed. If this script exits with a non-zero status nothing will be pushed. -# -# This hook is called with the following parameters: -# -# $1 -- Name of the remote to which the push is being done -# $2 -- URL to which the push is being done -# -# If pushing without using a named remote those arguments will be equal. -# -# Information about the commits which are being pushed is supplied as lines to -# the standard input in the form: -# -# -# -# This sample shows how to prevent push of commits where the log message starts -# with "WIP" (work in progress). - -remote="$1" -url="$2" - -z40=0000000000000000000000000000000000000000 - -IFS=' ' -while read local_ref local_sha remote_ref remote_sha -do - if [ "$local_sha" = $z40 ] - then - # Handle delete - else - if [ "$remote_sha" = $z40 ] - then - # New branch, examine all commits - range="$local_sha" - else - # Update to existing branch, examine new commits - range="$remote_sha..$local_sha" - fi - - # Check for WIP commit - commit=`git rev-list -n 1 --grep '^WIP' "$range"` - if [ -n "$commit" ] - then - echo "Found WIP commit in $local_ref, not pushing" - exit 1 - fi - fi -done - -exit 0 diff --git a/deepspeed.git/hooks/pre-rebase.sample b/deepspeed.git/hooks/pre-rebase.sample deleted file mode 100755 index 9773ed4cb298f6883adcd1269de5c9f97fc6d8fe..0000000000000000000000000000000000000000 --- a/deepspeed.git/hooks/pre-rebase.sample +++ /dev/null @@ -1,169 +0,0 @@ -#!/bin/sh -# -# Copyright (c) 2006, 2008 Junio C Hamano -# -# The "pre-rebase" hook is run just before "git rebase" starts doing -# its job, and can prevent the command from running by exiting with -# non-zero status. -# -# The hook is called with the following parameters: -# -# $1 -- the upstream the series was forked from. -# $2 -- the branch being rebased (or empty when rebasing the current branch). -# -# This sample shows how to prevent topic branches that are already -# merged to 'next' branch from getting rebased, because allowing it -# would result in rebasing already published history. 
- -publish=next -basebranch="$1" -if test "$#" = 2 -then - topic="refs/heads/$2" -else - topic=`git symbolic-ref HEAD` || - exit 0 ;# we do not interrupt rebasing detached HEAD -fi - -case "$topic" in -refs/heads/??/*) - ;; -*) - exit 0 ;# we do not interrupt others. - ;; -esac - -# Now we are dealing with a topic branch being rebased -# on top of master. Is it OK to rebase it? - -# Does the topic really exist? -git show-ref -q "$topic" || { - echo >&2 "No such branch $topic" - exit 1 -} - -# Is topic fully merged to master? -not_in_master=`git rev-list --pretty=oneline ^master "$topic"` -if test -z "$not_in_master" -then - echo >&2 "$topic is fully merged to master; better remove it." - exit 1 ;# we could allow it, but there is no point. -fi - -# Is topic ever merged to next? If so you should not be rebasing it. -only_next_1=`git rev-list ^master "^$topic" ${publish} | sort` -only_next_2=`git rev-list ^master ${publish} | sort` -if test "$only_next_1" = "$only_next_2" -then - not_in_topic=`git rev-list "^$topic" master` - if test -z "$not_in_topic" - then - echo >&2 "$topic is already up-to-date with master" - exit 1 ;# we could allow it, but there is no point. - else - exit 0 - fi -else - not_in_next=`git rev-list --pretty=oneline ^${publish} "$topic"` - /usr/bin/perl -e ' - my $topic = $ARGV[0]; - my $msg = "* $topic has commits already merged to public branch:\n"; - my (%not_in_next) = map { - /^([0-9a-f]+) /; - ($1 => 1); - } split(/\n/, $ARGV[1]); - for my $elem (map { - /^([0-9a-f]+) (.*)$/; - [$1 => $2]; - } split(/\n/, $ARGV[2])) { - if (!exists $not_in_next{$elem->[0]}) { - if ($msg) { - print STDERR $msg; - undef $msg; - } - print STDERR " $elem->[1]\n"; - } - } - ' "$topic" "$not_in_next" "$not_in_master" - exit 1 -fi - -exit 0 - -################################################################ - -This sample hook safeguards topic branches that have been -published from being rewound. - -The workflow assumed here is: - - * Once a topic branch forks from "master", "master" is never - merged into it again (either directly or indirectly). - - * Once a topic branch is fully cooked and merged into "master", - it is deleted. If you need to build on top of it to correct - earlier mistakes, a new topic branch is created by forking at - the tip of the "master". This is not strictly necessary, but - it makes it easier to keep your history simple. - - * Whenever you need to test or publish your changes to topic - branches, merge them into "next" branch. - -The script, being an example, hardcodes the publish branch name -to be "next", but it is trivial to make it configurable via -$GIT_DIR/config mechanism. - -With this workflow, you would want to know: - -(1) ... if a topic branch has ever been merged to "next". Young - topic branches can have stupid mistakes you would rather - clean up before publishing, and things that have not been - merged into other branches can be easily rebased without - affecting other people. But once it is published, you would - not want to rewind it. - -(2) ... if a topic branch has been fully merged to "master". - Then you can delete it. More importantly, you should not - build on top of it -- other people may already want to - change things related to the topic as patches against your - "master", so if you need further changes, it is better to - fork the topic (perhaps with the same name) afresh from the - tip of "master". 
- -Let's look at this example: - - o---o---o---o---o---o---o---o---o---o "next" - / / / / - / a---a---b A / / - / / / / - / / c---c---c---c B / - / / / \ / - / / / b---b C \ / - / / / / \ / - ---o---o---o---o---o---o---o---o---o---o---o "master" - - -A, B and C are topic branches. - - * A has one fix since it was merged up to "next". - - * B has finished. It has been fully merged up to "master" and "next", - and is ready to be deleted. - - * C has not merged to "next" at all. - -We would want to allow C to be rebased, refuse A, and encourage -B to be deleted. - -To compute (1): - - git rev-list ^master ^topic next - git rev-list ^master next - - if these match, topic has not merged in next at all. - -To compute (2): - - git rev-list master..topic - - if this is empty, it is fully merged to "master". diff --git a/deepspeed.git/hooks/prepare-commit-msg.sample b/deepspeed.git/hooks/prepare-commit-msg.sample deleted file mode 100755 index f093a02ec49918ab15e920f455979fd5ed732cf6..0000000000000000000000000000000000000000 --- a/deepspeed.git/hooks/prepare-commit-msg.sample +++ /dev/null @@ -1,36 +0,0 @@ -#!/bin/sh -# -# An example hook script to prepare the commit log message. -# Called by "git commit" with the name of the file that has the -# commit message, followed by the description of the commit -# message's source. The hook's purpose is to edit the commit -# message file. If the hook fails with a non-zero status, -# the commit is aborted. -# -# To enable this hook, rename this file to "prepare-commit-msg". - -# This hook includes three examples. The first comments out the -# "Conflicts:" part of a merge commit. -# -# The second includes the output of "git diff --name-status -r" -# into the message, just before the "git status" output. It is -# commented because it doesn't cope with --amend or with squashed -# commits. -# -# The third example adds a Signed-off-by line to the message, that can -# still be edited. This is rarely a good idea. - -case "$2,$3" in - merge,) - /usr/bin/perl -i.bak -ne 's/^/# /, s/^# #/#/ if /^Conflicts/ .. /#/; print' "$1" ;; - -# ,|template,) -# /usr/bin/perl -i.bak -pe ' -# print "\n" . `git diff --cached --name-status -r` -# if /^#/ && $first++ == 0' "$1" ;; - - *) ;; -esac - -# SOB=$(git var GIT_AUTHOR_IDENT | sed -n 's/^\(.*>\).*$/Signed-off-by: \1/p') -# grep -qs "^$SOB" "$1" || echo "$SOB" >> "$1" diff --git a/deepspeed.git/hooks/update.sample b/deepspeed.git/hooks/update.sample deleted file mode 100755 index d84758373da1d3121ec907435ae5d598c5c14b26..0000000000000000000000000000000000000000 --- a/deepspeed.git/hooks/update.sample +++ /dev/null @@ -1,128 +0,0 @@ -#!/bin/sh -# -# An example hook script to blocks unannotated tags from entering. -# Called by "git receive-pack" with arguments: refname sha1-old sha1-new -# -# To enable this hook, rename this file to "update". -# -# Config -# ------ -# hooks.allowunannotated -# This boolean sets whether unannotated tags will be allowed into the -# repository. By default they won't be. -# hooks.allowdeletetag -# This boolean sets whether deleting tags will be allowed in the -# repository. By default they won't be. -# hooks.allowmodifytag -# This boolean sets whether a tag may be modified after creation. By default -# it won't be. -# hooks.allowdeletebranch -# This boolean sets whether deleting branches will be allowed in the -# repository. By default they won't be. -# hooks.denycreatebranch -# This boolean sets whether remotely creating branches will be denied -# in the repository. By default this is allowed. 
-# - -# --- Command line -refname="$1" -oldrev="$2" -newrev="$3" - -# --- Safety check -if [ -z "$GIT_DIR" ]; then - echo "Don't run this script from the command line." >&2 - echo " (if you want, you could supply GIT_DIR then run" >&2 - echo " $0 )" >&2 - exit 1 -fi - -if [ -z "$refname" -o -z "$oldrev" -o -z "$newrev" ]; then - echo "usage: $0 " >&2 - exit 1 -fi - -# --- Config -allowunannotated=$(git config --bool hooks.allowunannotated) -allowdeletebranch=$(git config --bool hooks.allowdeletebranch) -denycreatebranch=$(git config --bool hooks.denycreatebranch) -allowdeletetag=$(git config --bool hooks.allowdeletetag) -allowmodifytag=$(git config --bool hooks.allowmodifytag) - -# check for no description -projectdesc=$(sed -e '1q' "$GIT_DIR/description") -case "$projectdesc" in -"Unnamed repository"* | "") - echo "*** Project description file hasn't been set" >&2 - exit 1 - ;; -esac - -# --- Check types -# if $newrev is 0000...0000, it's a commit to delete a ref. -zero="0000000000000000000000000000000000000000" -if [ "$newrev" = "$zero" ]; then - newrev_type=delete -else - newrev_type=$(git cat-file -t $newrev) -fi - -case "$refname","$newrev_type" in - refs/tags/*,commit) - # un-annotated tag - short_refname=${refname##refs/tags/} - if [ "$allowunannotated" != "true" ]; then - echo "*** The un-annotated tag, $short_refname, is not allowed in this repository" >&2 - echo "*** Use 'git tag [ -a | -s ]' for tags you want to propagate." >&2 - exit 1 - fi - ;; - refs/tags/*,delete) - # delete tag - if [ "$allowdeletetag" != "true" ]; then - echo "*** Deleting a tag is not allowed in this repository" >&2 - exit 1 - fi - ;; - refs/tags/*,tag) - # annotated tag - if [ "$allowmodifytag" != "true" ] && git rev-parse $refname > /dev/null 2>&1 - then - echo "*** Tag '$refname' already exists." >&2 - echo "*** Modifying a tag is not allowed in this repository." >&2 - exit 1 - fi - ;; - refs/heads/*,commit) - # branch - if [ "$oldrev" = "$zero" -a "$denycreatebranch" = "true" ]; then - echo "*** Creating a branch is not allowed in this repository" >&2 - exit 1 - fi - ;; - refs/heads/*,delete) - # delete branch - if [ "$allowdeletebranch" != "true" ]; then - echo "*** Deleting a branch is not allowed in this repository" >&2 - exit 1 - fi - ;; - refs/remotes/*,commit) - # tracking branch - ;; - refs/remotes/*,delete) - # delete tracking branch - if [ "$allowdeletebranch" != "true" ]; then - echo "*** Deleting a tracking branch is not allowed in this repository" >&2 - exit 1 - fi - ;; - *) - # Anything else (is there anything else?) - echo "*** Update hook: unknown type of update to ref $refname of type $newrev_type" >&2 - exit 1 - ;; -esac - -# --- Finished -exit 0 diff --git a/deepspeed.git/info/exclude b/deepspeed.git/info/exclude deleted file mode 100644 index a5196d1be8fb59edf8062bef36d3a602e0812139..0000000000000000000000000000000000000000 --- a/deepspeed.git/info/exclude +++ /dev/null @@ -1,6 +0,0 @@ -# git ls-files --others --exclude-from=.git/info/exclude -# Lines that start with '#' are comments. 
-# For a project mostly in C, the following would be a good set of -# exclude patterns (uncomment them if you want to use them): -# *.[oa] -# *~ diff --git a/deepspeed.git/objects/pack/pack-4e2c4f6aff789cbda1628c6e8d010508e6ff39f2.idx b/deepspeed.git/objects/pack/pack-4e2c4f6aff789cbda1628c6e8d010508e6ff39f2.idx deleted file mode 100644 index 9e60fd2214c9c7ba84fb92cd4801b890e883470a..0000000000000000000000000000000000000000 Binary files a/deepspeed.git/objects/pack/pack-4e2c4f6aff789cbda1628c6e8d010508e6ff39f2.idx and /dev/null differ diff --git a/deepspeed.git/objects/pack/pack-4e2c4f6aff789cbda1628c6e8d010508e6ff39f2.pack b/deepspeed.git/objects/pack/pack-4e2c4f6aff789cbda1628c6e8d010508e6ff39f2.pack deleted file mode 100644 index 697f41f6467b584adcfcb562df71a6316a5963b2..0000000000000000000000000000000000000000 Binary files a/deepspeed.git/objects/pack/pack-4e2c4f6aff789cbda1628c6e8d010508e6ff39f2.pack and /dev/null differ diff --git a/deepspeed.git/packed-refs b/deepspeed.git/packed-refs deleted file mode 100644 index 051d902cf92250877a064ed99369b08d087c3816..0000000000000000000000000000000000000000 --- a/deepspeed.git/packed-refs +++ /dev/null @@ -1,22 +0,0 @@ -# pack-refs with: peeled fully-peeled -1b2721adcd96656bb1f27d1f2f60947567b2d505 refs/heads/deepspeed-0.6.3-rocm -cd3feaaa6aef8e868eea954841294ab3a2b16f84 refs/heads/ds-0.3.13-rocm -67ea635fe037707924417893674e94275e849d7e refs/heads/ds-v0.8.2-rocm -67ea635fe037707924417893674e94275e849d7e refs/heads/main -87833e1f85e006c5c5d618dbe4de700885a2f571 refs/tags/grad-norm-test -c61e23b4b108df2af0dda7939ee59d4ae9090415 refs/tags/v0.1.0 -96c4daabc162c3c05fe602152ee2ab2d780c0e23 refs/tags/v0.2.0 -4b1df25ae96e3732213877e7729c5e15548188fd refs/tags/v0.3.0 -31f46feee2d491d58a13404e354440551de9d5bf refs/tags/v0.3.1 -c14b839d9898f4c84e372e896e3ce8fa2e169a79 refs/tags/v0.3.10 -72b23ea32282c52c53a81a097dfc26c653d3a731 refs/tags/v0.3.11 -35fd7ccd862adcb93febd546cb5b9fa7cb883d8f refs/tags/v0.3.12 -12a53b43833b7bea279a205e313f2bd3f0cdfd99 refs/tags/v0.3.13 -9941ce75225868ef9222a0360683a563d05d87ad refs/tags/v0.3.2 -9de21b72b5e8adb6c1fe4ae96cbddaa929178cc1 refs/tags/v0.3.3 -6b28bc5db58fa95628b9cf69e350dcacc2f33478 refs/tags/v0.3.4 -16313a962bce9df567597ffc2380250a1535e27a refs/tags/v0.3.5 -73c3262df63e85c2b2f0d48bf9217c58508e44f3 refs/tags/v0.3.6 -c51fa65de847ba44f0a1bcfc9957cb4e5fae3ab6 refs/tags/v0.3.7 -cb7c7da6f7696e27591610db3c2c906f9c2c8070 refs/tags/v0.3.8 -81aeea361da3936b875a678b9cb44596800510b5 refs/tags/v0.3.9 diff --git a/deepspeed/__init__.py b/deepspeed/__init__.py index 9e2e25513a723a0133ac45e1468d55e567a33872..255dacdccf6ef718ff7f40a6970c1feada6e2425 100755 --- a/deepspeed/__init__.py +++ b/deepspeed/__init__.py @@ -1,6 +1,7 @@ -''' -Copyright 2020 The Microsoft DeepSpeed Team -''' +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team import sys import types @@ -14,8 +15,10 @@ from packaging import version as pkg_version from . import ops from . 
import module_inject +from .accelerator import get_accelerator from .runtime.engine import DeepSpeedEngine, DeepSpeedOptimizerCallable, DeepSpeedSchedulerCallable from .runtime.engine import ADAM_OPTIMIZER, LAMB_OPTIMIZER +from .runtime.hybrid_engine import DeepSpeedHybridEngine from .runtime.pipe.engine import PipelineEngine from .inference.engine import InferenceEngine from .inference.config import DeepSpeedInferenceConfig @@ -25,7 +28,7 @@ from .runtime.activation_checkpointing import checkpointing from .ops.transformer import DeepSpeedTransformerLayer, DeepSpeedTransformerConfig from .module_inject import replace_transformer_layer, revert_transformer_layer -from .utils import log_dist, OnDevice +from .utils import log_dist, OnDevice, logger from .comm.comm import init_distributed from .runtime import zero @@ -48,15 +51,16 @@ __version_major__, __version_minor__, __version_patch__ = _parse_version(__versi __git_hash__ = git_hash __git_branch__ = git_branch +# Set to torch's distributed package or deepspeed.comm based inside DeepSpeedEngine init +dist = None + def initialize(args=None, model: torch.nn.Module = None, - optimizer: Optional[Union[Optimizer, - DeepSpeedOptimizerCallable]] = None, + optimizer: Optional[Union[Optimizer, DeepSpeedOptimizerCallable]] = None, model_parameters: Optional[torch.nn.Module] = None, training_data: Optional[torch.utils.data.Dataset] = None, - lr_scheduler: Optional[Union[_LRScheduler, - DeepSpeedSchedulerCallable]] = None, + lr_scheduler: Optional[Union[_LRScheduler, DeepSpeedSchedulerCallable]] = None, mpu=None, dist_init_required: Optional[bool] = None, collate_fn=None, @@ -110,10 +114,8 @@ def initialize(args=None, * ``lr_scheduler``: Wrapped lr scheduler if user ``lr_scheduler`` is passed, or if ``lr_scheduler`` specified in JSON configuration. Otherwise ``None``. 
""" - log_dist("DeepSpeed info: version={}, git-hash={}, git-branch={}".format( - __version__, - __git_hash__, - __git_branch__), + log_dist("DeepSpeed info: version={}, git-hash={}, git-branch={}".format(__version__, __git_hash__, + __git_branch__), ranks=[0]) # Disable zero.Init context if it's currently enabled @@ -121,38 +123,73 @@ def initialize(args=None, assert model is not None, "deepspeed.initialize requires a model" + global dist + from deepspeed import comm as dist + dist_backend = get_accelerator().communication_backend_name() + dist.init_distributed(dist_backend=dist_backend, dist_init_required=dist_init_required) + + # Set config using config_params for backwards compat + if config is None and config_params is not None: + config = config_params + + # Check for deepscale_config for backwards compat + if hasattr(args, "deepscale_config") and args.deepscale_config is not None: + logger.warning("************ --deepscale_config is deprecated, please use --deepspeed_config ************") + if hasattr(args, "deepspeed_config"): + assert (args.deepspeed_config is + None), "Not sure how to proceed, we were given both a deepscale_config and deepspeed_config" + args.deepspeed_config = args.deepscale_config + args.deepscale_config = None + + # Check that we have only one config passed + if hasattr(args, "deepspeed_config") and args.deepspeed_config is not None: + assert config is None, "Not sure how to proceed, we were given deepspeed configs in the deepspeed arguments and deepspeed.initialize() function call" + config = args.deepspeed_config + assert config != None, "DeepSpeed requires --deepspeed_config to specify configuration file" + if not isinstance(model, PipelineModule): - engine = DeepSpeedEngine(args=args, - model=model, - optimizer=optimizer, - model_parameters=model_parameters, - training_data=training_data, - lr_scheduler=lr_scheduler, - mpu=mpu, - dist_init_required=dist_init_required, - collate_fn=collate_fn, - config=config, - config_params=config_params) + config_class = DeepSpeedConfig(config, mpu) + if config_class.hybrid_engine.enabled: + engine = DeepSpeedHybridEngine(args=args, + model=model, + optimizer=optimizer, + model_parameters=model_parameters, + training_data=training_data, + lr_scheduler=lr_scheduler, + mpu=mpu, + dist_init_required=dist_init_required, + collate_fn=collate_fn, + config=config, + config_class=config_class) + else: + engine = DeepSpeedEngine(args=args, + model=model, + optimizer=optimizer, + model_parameters=model_parameters, + training_data=training_data, + lr_scheduler=lr_scheduler, + mpu=mpu, + dist_init_required=dist_init_required, + collate_fn=collate_fn, + config=config, + config_class=config_class) else: assert mpu is None, "mpu must be None with pipeline parallelism" + mpu = model.mpu() + config_class = DeepSpeedConfig(config, mpu) engine = PipelineEngine(args=args, model=model, optimizer=optimizer, model_parameters=model_parameters, training_data=training_data, lr_scheduler=lr_scheduler, - mpu=model.mpu(), + mpu=mpu, dist_init_required=dist_init_required, collate_fn=collate_fn, config=config, - config_params=config_params) - - return_items = [ - engine, - engine.optimizer, - engine.training_dataloader, - engine.lr_scheduler - ] + config_class=config_class) + + return_items = [engine, engine.optimizer, engine.training_dataloader, engine.lr_scheduler] return tuple(return_items) @@ -171,38 +208,28 @@ def _add_core_arguments(parser): """ group = parser.add_argument_group('DeepSpeed', 'DeepSpeed configurations') - group.add_argument( - 
'--deepspeed', - default=False, - action='store_true', - help= - 'Enable DeepSpeed (helper flag for user code, no impact on DeepSpeed backend)') + group.add_argument('--deepspeed', + default=False, + action='store_true', + help='Enable DeepSpeed (helper flag for user code, no impact on DeepSpeed backend)') - group.add_argument('--deepspeed_config', - default=None, - type=str, - help='DeepSpeed json configuration file.') + group.add_argument('--deepspeed_config', default=None, type=str, help='DeepSpeed json configuration file.') - group.add_argument( - '--deepscale', - default=False, - action='store_true', - help= - 'Deprecated enable DeepSpeed (helper flag for user code, no impact on DeepSpeed backend)' - ) + group.add_argument('--deepscale', + default=False, + action='store_true', + help='Deprecated enable DeepSpeed (helper flag for user code, no impact on DeepSpeed backend)') group.add_argument('--deepscale_config', default=None, type=str, help='Deprecated DeepSpeed json configuration file.') - group.add_argument( - '--deepspeed_mpi', - default=False, - action='store_true', - help= - "Run via MPI, this will attempt to discover the necessary variables to initialize torch " - "distributed from the MPI environment") + group.add_argument('--deepspeed_mpi', + default=False, + action='store_true', + help="Run via MPI, this will attempt to discover the necessary variables to initialize torch " + "distributed from the MPI environment") return parser @@ -278,10 +305,8 @@ def init_inference(model, config=None, **kwargs): Returns: A deepspeed.InferenceEngine wrapped model. """ - log_dist("DeepSpeed info: version={}, git-hash={}, git-branch={}".format( - __version__, - __git_hash__, - __git_branch__), + log_dist("DeepSpeed info: version={}, git-hash={}, git-branch={}".format(__version__, __git_hash__, + __git_branch__), ranks=[0]) # Load config_dict from config first @@ -293,17 +318,14 @@ def init_inference(model, config=None, **kwargs): elif isinstance(config, dict): config_dict = config else: - raise ValueError( - f"'config' argument expected string or dictionary, got {type(config)}") + raise ValueError(f"'config' argument expected string or dictionary, got {type(config)}") # Update with values from kwargs, ensuring no conflicting overlap between config and kwargs overlap_keys = set(config_dict.keys()).intersection(kwargs.keys()) # If there is overlap, error out if values are different for key in overlap_keys: if config_dict[key] != kwargs[key]: - raise ValueError( - f"Conflicting argument '{key}' in 'config':{config_dict[key]} and kwargs:{kwargs[key]}" - ) + raise ValueError(f"Conflicting argument '{key}' in 'config':{config_dict[key]} and kwargs:{kwargs[key]}") config_dict.update(kwargs) ds_inference_config = DeepSpeedInferenceConfig(**config_dict) diff --git a/deepspeed/autotuning/__init__.py b/deepspeed/autotuning/__init__.py index 11ad4700017aea6ee82f65d5e70d490b5e60f81f..73d8153bab920a8e6865a7fd4e59a068e9a2993b 100644 --- a/deepspeed/autotuning/__init__.py +++ b/deepspeed/autotuning/__init__.py @@ -1,3 +1,6 @@ -'''Copyright The Microsoft DeepSpeed Team''' +# Copyright (c) Microsoft Corporation. 
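
For orientation, the deepspeed/__init__.py hunks above change initialize() to resolve its configuration from either the `config` argument or the (deprecated) `--deepspeed_config`/`--deepscale_config` arguments, to set up the communication backend itself, and to dispatch to DeepSpeedHybridEngine when `hybrid_engine.enabled` is set in the config; init_inference() now merges `config` with keyword arguments and rejects conflicting values. A minimal usage sketch under those changes (the toy model and config values below are illustrative only, not part of this patch, and a distributed launch via the deepspeed launcher is assumed):

import torch
import deepspeed

model = torch.nn.Linear(16, 16)  # any torch.nn.Module stands in here

# Training entry point: `config` may be a dict or a path to a JSON file
# (equivalent to passing --deepspeed_config on the command line).
ds_config = {
    "train_micro_batch_size_per_gpu": 1,
    "optimizer": {"type": "Adam", "params": {"lr": 1e-3}},
}
engine, optimizer, _, lr_scheduler = deepspeed.initialize(
    model=model,
    model_parameters=model.parameters(),
    config=ds_config,
)

# Inference entry point: values from `config` and **kwargs are merged, and
# conflicting values for the same key raise ValueError (see the hunk above).
inference_engine = deepspeed.init_inference(model, config={"dtype": torch.float16})
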
+# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team from .autotuner import Autotuner diff --git a/deepspeed/autotuning/autotuner.py b/deepspeed/autotuning/autotuner.py index 569b62666fc0986a78354e5a50184a7a71dc2ed6..b27440294a825187d54a7b16a1bb212c2a655881 100755 --- a/deepspeed/autotuning/autotuner.py +++ b/deepspeed/autotuning/autotuner.py @@ -1,4 +1,7 @@ -'''Copyright The Microsoft DeepSpeed Team''' +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team import shutil import subprocess @@ -40,6 +43,7 @@ class Autotuner: """The DeepSpeed Autotuner automatically discovers the optimal DeepSpeed configuration that delivers good training speed. The Autotuner uses model information, system information, and heuristics to efficiently tune system knobs that affect compute and memory efficiencies, such as ZeRO optimization stages, micro-batch sizes, and many other ZeRO optimization configurations. It not only reduces the time and resources user spend on tuning, but also can discover configurations better than hand-tuned methods. Autotuning with DeepSpeed requires no code change from DeepSpeed users. Please refer to the README for usage details. """ + def __init__(self, args, active_resources): self.args = args self.selected_exp_dir = None @@ -77,7 +81,7 @@ class Autotuner: if not os.path.exists(self.results_dir): try: os.makedirs(self.results_dir, exist_ok=True) - logger.info(f"Created autotuning resutls directory: {self.exps_dir}") + logger.info(f"Created autotuning results directory: {self.exps_dir}") except: logger.error( f"Failed to create {self.results_dir}, please check `results_dir` in the autotuning config file is accessible by all the nodes in the job." @@ -92,7 +96,8 @@ class Autotuner: assert self.exp_num_gpus <= self.rm.num_gpus_per_node, "num_gpus in the autotuning configuration must not be less than the --num_gpus value in the train script if any" assert self.exp_num_nodes <= len( - self.rm.nodes), "num_nodes in the autotuning configuration must not be less than the --num_nodes value in the train script if any" + self.rm.nodes + ), "num_nodes in the autotuning configuration must not be less than the --num_nodes value in the train script if any" self.records = {} self.optimal_cmd = None @@ -125,18 +130,10 @@ class Autotuner: row.append(val[0]['name']) tab.append(row) summary = tabulate(tab, - headers=[ - "tuning_space", - "num_experiments", - "best_metric_val", - "best_exp_name" - ], + headers=["tuning_space", "num_experiments", "best_metric_val", "best_exp_name"], tablefmt="pipe") print(summary) - with open(os.path.join(self.results_dir, - 'summary.txt'), - 'w', - buffering=BUFSIZE) as fd: + with open(os.path.join(self.results_dir, 'summary.txt'), 'w', buffering=BUFSIZE) as fd: fd.write(summary) fd.flush() os.fsync(fd) @@ -148,9 +145,7 @@ class Autotuner: f"{best_exp['name']} is the optimal setup after tuning. The exp result is at {best_exp['result_dir']}." ) else: - logger.info( - f"No optimal setup is found. Please check that experiments were run successfully." - ) + logger.info(f"No optimal setup is found. 
Please check that experiments were run successfully.") tuning_duration = datetime.timedelta(seconds=(time.time() - self.start_time)) logger.info(f"Tuning completed in {tuning_duration}") @@ -172,8 +167,8 @@ class Autotuner: user_config_file = None if "--deepspeed_config" in user_args: idx = user_args.index("--deepspeed_config") - assert ".json" in user_args[idx + - 1], "DeepSpeed --deepspeed_config requires a json file to specify the configuration" + assert ".json" in user_args[ + idx + 1], "DeepSpeed --deepspeed_config requires a json file to specify the configuration" user_config_file = user_args[idx + 1] elif "--deepspeed" in user_args: @@ -183,15 +178,10 @@ class Autotuner: logger.debug(f"user_config_file = {user_config_file}") if user_config_file is not None: - assert os.path.isfile( - user_config_file - ), "DeepSpeed configuration file: {} is not an existing file".format( - user_config_file - ) + assert os.path.isfile(user_config_file), "DeepSpeed configuration file: {} is not an existing file".format( + user_config_file) if os.path.exists(user_config_file): - return json.load(open(user_config_file, - "r"), - object_pairs_hook=dict_raise_error_on_duplicate_keys) + return json.load(open(user_config_file, "r"), object_pairs_hook=dict_raise_error_on_duplicate_keys) return None @@ -258,13 +248,11 @@ class Autotuner: return self.autotuning_config.mp_size def max_train_micro_batch_size_per_gpu(self): - if self.max_train_batch_size() and self.max_train_batch_size( - ) > 0: # if the user specifies a max_train_batch_size - max_train_micro_batch_size = self.max_train_batch_size() * self.mp_size( - ) // (self.exp_num_gpus * self.exp_num_nodes - ) # gradient accumulation steps >=1 - return min(self.autotuning_config.max_train_micro_batch_size_per_gpu, - max_train_micro_batch_size) + if self.max_train_batch_size( + ) and self.max_train_batch_size() > 0: # if the user specifies a max_train_batch_size + max_train_micro_batch_size = self.max_train_batch_size() * self.mp_size() // ( + self.exp_num_gpus * self.exp_num_nodes) # gradient accumulation steps >=1 + return min(self.autotuning_config.max_train_micro_batch_size_per_gpu, max_train_micro_batch_size) else: return self.autotuning_config.max_train_micro_batch_size_per_gpu @@ -361,19 +349,14 @@ class Autotuner: if model_info and "hidden_size" in model_info: hs = model_info["hidden_size"] template_config[ZERO_OPTIMIZATION]['reduce_bucket_size'] = hs * hs - template_config[ZERO_OPTIMIZATION][ - 'stage3_prefetch_bucket_size'] = 0.9 * hs * hs - template_config[ZERO_OPTIMIZATION][ - 'stage3_param_persistence_threshold'] = 10 * hs + template_config[ZERO_OPTIMIZATION]['stage3_prefetch_bucket_size'] = 0.9 * hs * hs + template_config[ZERO_OPTIMIZATION]['stage3_param_persistence_threshold'] = 10 * hs prefix = "z3_" else: return exps # replace the corresponding parameter values if the user specifies them in the DeepSpeed configuration file - replace_dict(tuning_space, - self.user_config, - [ZERO_OPTIMIZATION, - TRAIN_MICRO_BATCH_SIZE_PER_GPU]) + replace_dict(tuning_space, self.user_config, [ZERO_OPTIMIZATION, TRAIN_MICRO_BATCH_SIZE_PER_GPU]) logger.debug(f"tuning_space = {json.dumps(tuning_space)}") @@ -397,11 +380,9 @@ class Autotuner: # if the config does not use offloading, remove the offloading section config_zero = config.get(ZERO_OPTIMIZATION, None) if config_zero: - if OFFLOAD_OPTIMIZER not in config_zero and OFFLOAD_OPTIMIZER in exp_config[ - ZERO_OPTIMIZATION]: + if OFFLOAD_OPTIMIZER not in config_zero and OFFLOAD_OPTIMIZER in 
exp_config[ZERO_OPTIMIZATION]: del exp_config[ZERO_OPTIMIZATION][OFFLOAD_OPTIMIZER] - if OFFLOAD_PARAM not in config_zero and OFFLOAD_PARAM in exp_config[ - ZERO_OPTIMIZATION]: + if OFFLOAD_PARAM not in config_zero and OFFLOAD_PARAM in exp_config[ZERO_OPTIMIZATION]: del exp_config[ZERO_OPTIMIZATION][OFFLOAD_PARAM] # set gradient accumulation steps according to max_train_batch_size_per_gpu mbs = exp_config[TRAIN_MICRO_BATCH_SIZE_PER_GPU] @@ -438,13 +419,10 @@ class Autotuner: else: return - logger.info( - f"The model has {number_to_string(self.get_model_num_params())} parameters.") + logger.info(f"The model has {number_to_string(self.get_model_num_params())} parameters.") self.gpu_mem = self.get_gpu_memory_info() - logger.info( - f"Memory per GPU in the system is {memory_to_string(self.gpu_mem, postfix='B')}." - ) + logger.info(f"Memory per GPU in the system is {memory_to_string(self.gpu_mem, postfix='B')}.") self.activation_mem = self.get_activation_memory_per_gpu() logger.info( @@ -452,9 +430,7 @@ class Autotuner: ) #TODO: FIX THIS - stage = self.user_config.get(ZERO_OPTIMIZATION, - {}).get(ZERO_OPTIMIZATION_STAGE, - "all") + stage = self.user_config.get(ZERO_OPTIMIZATION, {}).get(ZERO_OPTIMIZATION_STAGE, "all") stage = "all" user_zero_stages = [stage] if not isinstance(stage, list) else stage logger.info(f"User-defined zero stages are {stage}.") @@ -463,15 +439,13 @@ class Autotuner: max_mbs = 0 metric_val = 0 - required_gpu_mem = self.get_instantiation_memory_required_per_gpu( - ZeroStageEnum.disabled) + self.activation_mem + required_gpu_mem = self.get_instantiation_memory_required_per_gpu(ZeroStageEnum.disabled) + self.activation_mem if self.gpu_mem > required_gpu_mem: if "all" in user_zero_stages or ZeroStageEnum.disabled in user_zero_stages: logger.info( f"The model might be runable with ZERO 0 (which requires at least {memory_to_string(required_gpu_mem, postfix='B')} memory with mbs = 1), adding DEFAULT_TUNING_SPACE_ZERO_0 to the global tuning space" ) - next_max_mbs, next_mbs, next_metric_val = self.tune_space( - DEFAULT_TUNING_SPACE_ZERO_0) + next_max_mbs, next_mbs, next_metric_val = self.tune_space(DEFAULT_TUNING_SPACE_ZERO_0) if next_mbs > mbs: mbs = next_mbs max_mbs = next_max_mbs @@ -490,8 +464,10 @@ class Autotuner: logger.info( f"The model might be runable with ZERO 1 (which requires at least {memory_to_string(required_gpu_mem, postfix='B')} memory), adding DEFAULT_TUNING_SPACE_ZERO_1 to the global tuning space" ) - next_max_mbs, next_mbs, next_metric_val = self.tune_space( - DEFAULT_TUNING_SPACE_ZERO_1, prev_max_mbs = max_mbs, prev_best_mbs=mbs, prev_best_metric_val=metric_val) + next_max_mbs, next_mbs, next_metric_val = self.tune_space(DEFAULT_TUNING_SPACE_ZERO_1, + prev_max_mbs=max_mbs, + prev_best_mbs=mbs, + prev_best_metric_val=metric_val) if next_mbs > mbs: mbs = next_mbs max_mbs = next_max_mbs @@ -510,8 +486,10 @@ class Autotuner: logger.info( f"The model might be runable with ZERO 2 (which requires at least {memory_to_string(required_gpu_mem, postfix='B')} memory), adding DEFAULT_TUNING_SPACE_ZERO_2 to the global tuning space" ) - next_max_mbs, next_mbs, next_metric_val = self.tune_space( - DEFAULT_TUNING_SPACE_ZERO_2, prev_max_mbs = max_mbs, prev_best_mbs=mbs, prev_best_metric_val=metric_val) + next_max_mbs, next_mbs, next_metric_val = self.tune_space(DEFAULT_TUNING_SPACE_ZERO_2, + prev_max_mbs=max_mbs, + prev_best_mbs=mbs, + prev_best_metric_val=metric_val) if next_mbs > mbs: mbs = next_mbs max_mbs = next_max_mbs @@ -523,15 +501,16 @@ class Autotuner: f"The model 
is not runable with ZERO stage {ZeroStageEnum.gradients} (which requires at least {memory_to_string(required_gpu_mem, postfix='B')} memory with mbs = 1)" ) - required_gpu_mem = self.get_instantiation_memory_required_per_gpu( - ZeroStageEnum.weights) + self.activation_mem + required_gpu_mem = self.get_instantiation_memory_required_per_gpu(ZeroStageEnum.weights) + self.activation_mem if self.gpu_mem > required_gpu_mem: if "all" in user_zero_stages or ZeroStageEnum.weights in user_zero_stages: logger.info( f"The model might be runable with ZERO 3 (which requires at least {memory_to_string(required_gpu_mem, postfix='B')} memory), adding DEFAULT_TUNING_SPACE_ZERO_3 to the global tuning space" ) - _, _, next_metric_val = self.tune_space( - DEFAULT_TUNING_SPACE_ZERO_3, prev_max_mbs = max_mbs, prev_best_mbs=mbs, prev_best_metric_val=metric_val) + _, _, next_metric_val = self.tune_space(DEFAULT_TUNING_SPACE_ZERO_3, + prev_max_mbs=max_mbs, + prev_best_mbs=mbs, + prev_best_metric_val=metric_val) if has_mlflow: mlflow.log_metric(f"z3{self.metric()}", next_metric_val) else: @@ -542,11 +521,7 @@ class Autotuner: if has_mlflow: mlflow.end_run() - def tune_space(self, - tuning_space, - prev_max_mbs=0, - prev_best_mbs=0, - prev_best_metric_val=0): + def tune_space(self, tuning_space, prev_max_mbs=0, prev_best_mbs=0, prev_best_metric_val=0): config_zero = tuning_space.get(ZERO_OPTIMIZATION, {}) stage = config_zero.get(ZERO_OPTIMIZATION_STAGE, None) tuning_space_name = TUNING_MICRO_BATCH_SIZE_PREFIX + str(stage) @@ -557,26 +532,20 @@ class Autotuner: # calculate max micro batch size using gpu memory, model instantiation memory and activation memory # calculated_max_micro_batch_size = (memory_per_gpu - instantiation_memory) // activation_memory_micro_batch_size_1 calculated_max_micro_batch_size = int( - self.gpu_mem - - self.get_instantiation_memory_required_per_gpu(stage)) // self.activation_mem + self.gpu_mem - self.get_instantiation_memory_required_per_gpu(stage)) // self.activation_mem logger.info( f"Start tuning for space {tuning_space_name}, calculated_max_micro_batch_size = {calculated_max_micro_batch_size}" ) if calculated_max_micro_batch_size < prev_max_mbs: - logger.info( - f"No need to tune Zero stage {stage}. End tuning for space {tuning_space_name}" - ) + logger.info(f"No need to tune Zero stage {stage}. End tuning for space {tuning_space_name}") return 0, 0, 0 if TRAIN_MICRO_BATCH_SIZE_PER_GPU in self.user_config and isinstance( - self.user_config[TRAIN_MICRO_BATCH_SIZE_PER_GPU], - list): + self.user_config[TRAIN_MICRO_BATCH_SIZE_PER_GPU], list): # user-specified micro batch size per gpu is a list which overwrites the default tuning behavior tuning_micro_batch_sizes = [ - s for s in self.user_config[TRAIN_MICRO_BATCH_SIZE_PER_GPU] - if isinstance(s, - int) + s for s in self.user_config[TRAIN_MICRO_BATCH_SIZE_PER_GPU] if isinstance(s, int) ] gas = self.get_gas_from_user_config() min_micro_batch_size = min(tuning_micro_batch_sizes) @@ -589,9 +558,7 @@ class Autotuner: stage, prev_max_mbs, calculated_max_micro_batch_size) if max_micro_batch_size < prev_max_mbs: - logger.info( - f"No need to tune Zero stage {stage}. End tuning for space {tuning_space_name}" - ) + logger.info(f"No need to tune Zero stage {stage}. 
End tuning for space {tuning_space_name}") return 0, 0, 0 tuning_micro_batch_sizes, max_train_batch_size_per_gpu = self.get_tuning_micro_batch_size_list( @@ -609,19 +576,15 @@ class Autotuner: return 0, 0, 0 # tune micro batch sizes and gradient accumulation steps given max_train_batch_size_per_gpu - tuning_micro_batch_sizes = self.run_tuning_micro_batch_sizes( - tuning_micro_batch_sizes, - max_train_batch_size_per_gpu, - min_micro_batch_size, - stage, - tuning_micro_batch_sizes_overwritten) + tuning_micro_batch_sizes = self.run_tuning_micro_batch_sizes(tuning_micro_batch_sizes, + max_train_batch_size_per_gpu, + min_micro_batch_size, stage, + tuning_micro_batch_sizes_overwritten) fast_best_record = self.get_best_space_record(tuning_space_name) fast_best_metric_val = fast_best_record[1] if fast_best_record else 0 - fast_best_mbs = fast_best_record[0][DS_CONFIG][ - TRAIN_MICRO_BATCH_SIZE_PER_GPU] if fast_best_record else 0 - logger.info( - f"fast_best_mbs = {fast_best_mbs}, name = {fast_best_record[0]['name']}") + fast_best_mbs = fast_best_record[0][DS_CONFIG][TRAIN_MICRO_BATCH_SIZE_PER_GPU] if fast_best_record else 0 + logger.info(f"fast_best_mbs = {fast_best_mbs}, name = {fast_best_record[0]['name']}") if self.fast_enabled() or stage == 0: logger.info(f"End tuning for space: {tuning_space_name}") @@ -631,8 +594,7 @@ class Autotuner: if stage > 0: if fast_best_mbs <= prev_best_mbs or fast_best_metric_val < prev_best_metric_val: logger.info( - f"End tuning for space: {tuning_space_name}. No need to tune other Zero configuration parameters." - ) + f"End tuning for space: {tuning_space_name}. No need to tune other Zero configuration parameters.") return max_micro_batch_size, fast_best_mbs, fast_best_metric_val tuning_space[TRAIN_MICRO_BATCH_SIZE_PER_GPU] = tuning_micro_batch_sizes @@ -654,8 +616,7 @@ class Autotuner: else: t = GridSearchTuner(exps, self.rm, self.metric()) - sample_size = len(self.rm.nodes) * self.rm.num_gpus_per_node // ( - self.exp_num_gpus * self.exp_num_nodes) + sample_size = len(self.rm.nodes) * self.rm.num_gpus_per_node // (self.exp_num_gpus * self.exp_num_nodes) num_exps = t.tune(sample_size=sample_size, n_trials=self.autotuning_config.tuner_num_trials, early_stopping=self.autotuning_config.tuner_early_stopping) @@ -669,8 +630,7 @@ class Autotuner: if full_best_metric_val > fast_best_metric_val: best_metric_val = full_best_metric_val - best_mbs = full_best_record[0][DS_CONFIG][ - TRAIN_MICRO_BATCH_SIZE_PER_GPU] if full_best_record else -1 + best_mbs = full_best_record[0][DS_CONFIG][TRAIN_MICRO_BATCH_SIZE_PER_GPU] if full_best_record else -1 else: best_metric_val = fast_best_metric_val best_mbs = fast_best_mbs @@ -682,9 +642,7 @@ class Autotuner: if tuning_space_name not in self.records: return 0 space_records = self.records[tuning_space_name] - sorted_space_records = sorted( - space_records, - key=lambda x: x[0][DS_CONFIG][TRAIN_MICRO_BATCH_SIZE_PER_GPU]) + sorted_space_records = sorted(space_records, key=lambda x: x[0][DS_CONFIG][TRAIN_MICRO_BATCH_SIZE_PER_GPU]) prev_metric_val = None prev_micro_batch_size = 0 for (exp, metric_val, _) in sorted_space_records: @@ -692,8 +650,7 @@ class Autotuner: if metric_val < prev_metric_val: break if (metric_val >= prev_metric_val - and (metric_val - prev_metric_val) / prev_metric_val < - METRIC_PERCENT_DIFF_CONST): + and (metric_val - prev_metric_val) / prev_metric_val < METRIC_PERCENT_DIFF_CONST): break prev_metric_val = metric_val prev_micro_batch_size = exp[DS_CONFIG][TRAIN_MICRO_BATCH_SIZE_PER_GPU] @@ -718,16 +675,8 @@ class 
Autotuner: ds_config = copy.deepcopy(self.user_config) replace_dict(ds_config, DEFAULT_MIN_MEM_CONFIG) - model_info_path = os.path.join(self.results_dir, - "profile_model_info", - "model_info.json") - ds_config[AUTOTUNING] = { - "enabled": True, - "model_info_path": model_info_path, - "model_info": { - "profile": True - } - } + model_info_path = os.path.join(self.results_dir, "profile_model_info", "model_info.json") + ds_config[AUTOTUNING] = {"enabled": True, "model_info_path": model_info_path, "model_info": {"profile": True}} exp_config = {} exp_name = "profile_model_info" @@ -748,8 +697,7 @@ class Autotuner: for exp_id, (exp_json, err) in self.rm.finished_experiments.items(): self.rm.clear() if err: - logger.error( - f"The model is not runnable with DeepSpeed with error = {err}") + logger.error(f"The model is not runnable with DeepSpeed with error = {err}") return None if os.path.exists(model_info_path): @@ -790,12 +738,8 @@ class Autotuner: best_space_records[GLOBAL_TUNING_SPACE] = global_best_record return best_space_records - def run_tuning_micro_batch_sizes(self, - tuning_micro_batch_sizes, - max_train_batch_size_per_gpu, - min_micro_batch_size, - stage, - tuning_micro_batch_sizes_overwritten): + def run_tuning_micro_batch_sizes(self, tuning_micro_batch_sizes, max_train_batch_size_per_gpu, + min_micro_batch_size, stage, tuning_micro_batch_sizes_overwritten): assert tuning_micro_batch_sizes, "the tuning micro batch size list is empty" tuning_micro_batch_sizes.sort() max_micro_batch_size = tuning_micro_batch_sizes[-1] @@ -838,8 +782,7 @@ class Autotuner: results = hjson.load(f) metric_val = results[self.metric()] self.update_records(tuning_space_name, exp, metric_val, 1) - if max_micro_batch_size == exp[DS_CONFIG][ - TRAIN_MICRO_BATCH_SIZE_PER_GPU]: + if max_micro_batch_size == exp[DS_CONFIG][TRAIN_MICRO_BATCH_SIZE_PER_GPU]: max_micro_batch_size_metric_val = metric_val if has_mlflow: os.environ.pop('MLFLOW_RUN_ID') @@ -862,9 +805,8 @@ class Autotuner: # in a auto-detected tuning_micro_batch_sizs list, max_micro_batch_size might not be performant as the memory consumption is close to max # try smaller values while gas stays the same # if finding a more performant mbs value, use it to replace max_micro_batch_size in the list - min_micro_batch_size_with_same_gas = ( - tuning_micro_batch_sizes[-2] + - 1) if len(tuning_micro_batch_sizes) > 1 else min_micro_batch_size + min_micro_batch_size_with_same_gas = (tuning_micro_batch_sizes[-2] + + 1) if len(tuning_micro_batch_sizes) > 1 else min_micro_batch_size prev_best_metric_val = max_micro_batch_size_metric_val prev_best_mbs = max_micro_batch_size @@ -872,10 +814,7 @@ class Autotuner: stride = (max_micro_batch_size - min_micro_batch_size_with_same_gas) // 3 if stride == 0: stride = 1 - for mbs in reversed( - range(min_micro_batch_size_with_same_gas, - max_micro_batch_size, - stride)): + for mbs in reversed(range(min_micro_batch_size_with_same_gas, max_micro_batch_size, stride)): ds_config[TRAIN_MICRO_BATCH_SIZE_PER_GPU] = mbs gas = max_train_batch_size_per_gpu // mbs ds_config[GRADIENT_ACCUMULATION_STEPS] = gas @@ -908,10 +847,7 @@ class Autotuner: tuning_micro_batch_sizes[-1] = prev_best_mbs return tuning_micro_batch_sizes - def get_min_max_micro_batch_size(self, - stage, - min_micro_batch_size, - calculated_max_micro_batch_size): + def get_min_max_micro_batch_size(self, stage, min_micro_batch_size, calculated_max_micro_batch_size): # get min and max micro batch size with gradient accumulation steps = 1 if min_micro_batch_size > 
calculated_max_micro_batch_size: return -1, -1 @@ -927,8 +863,7 @@ class Autotuner: # search for the min micro batch size if min_micro_batch_size < 1: if TRAIN_MICRO_BATCH_SIZE_PER_GPU in self.user_config and isinstance( - self.user_config[TRAIN_MICRO_BATCH_SIZE_PER_GPU], - int): + self.user_config[TRAIN_MICRO_BATCH_SIZE_PER_GPU], int): # user specifies train_micro_batch_size_per_gpu as an int mbs = int(self.user_config[TRAIN_MICRO_BATCH_SIZE_PER_GPU]) else: @@ -951,8 +886,7 @@ class Autotuner: min_micro_batch_size = mbs else: self.update_records(tuning_space_name, exp, 0, 1) - logger.info( - f"User-specified micro batch size per GPU {mbs} does not run") + logger.info(f"User-specified micro batch size per GPU {mbs} does not run") if self.min_train_micro_batch_size_per_gpu() == mbs: return -1, -1 mbs = self.min_train_micro_batch_size_per_gpu() @@ -964,8 +898,7 @@ class Autotuner: exp, metric_val = self.run_ds_config(ds_config, exp_name) if not metric_val: self.update_records(tuning_space_name, exp, 0, 1) - logger.info( - f"min_train_micro_batch_size_per_gpu {mbs} is not runnable.") + logger.info(f"min_train_micro_batch_size_per_gpu {mbs} is not runnable.") return -1, -1 self.update_records(tuning_space_name, exp, metric_val, 1) min_micro_batch_size = mbs @@ -975,8 +908,7 @@ class Autotuner: ds_config[GRADIENT_ACCUMULATION_STEPS] = gas ds_config[TRAIN_BATCH_SIZE] = min_micro_batch_size * gas * \ self.exp_num_gpus * self.exp_num_nodes // self.mp_size() - exp_name = tuning_space_name + "_gas" + str(gas) + "_tmbspg" + str( - min_micro_batch_size) + exp_name = tuning_space_name + "_gas" + str(gas) + "_tmbspg" + str(min_micro_batch_size) exp, metric_val = self.run_ds_config(ds_config, exp_name) if metric_val: self.update_records(tuning_space_name, exp, metric_val, 1) @@ -986,13 +918,8 @@ class Autotuner: return -1, -1 # search for the max micro batch size - max_micro_batch_size = min(calculated_max_micro_batch_size, - self.max_train_micro_batch_size_per_gpu()) - for mbs in [ - math.ceil(1.05 * max_micro_batch_size), - max_micro_batch_size, - int(0.95 * max_micro_batch_size) - ]: + max_micro_batch_size = min(calculated_max_micro_batch_size, self.max_train_micro_batch_size_per_gpu()) + for mbs in [math.ceil(1.05 * max_micro_batch_size), max_micro_batch_size, int(0.95 * max_micro_batch_size)]: if mbs > self.max_train_micro_batch_size_per_gpu(): continue if mbs in used_micro_batch_sizes: @@ -1011,12 +938,11 @@ class Autotuner: else: self.update_records(tuning_space_name, exp, 0, 1) - space_records = self.records[ - tuning_space_name] if tuning_space_name in self.records else [] + space_records = self.records[tuning_space_name] if tuning_space_name in self.records else [] if space_records: prev_idx = min(range(len(space_records)), - key=lambda i: abs(space_records[i][0][DS_CONFIG][ - TRAIN_MICRO_BATCH_SIZE_PER_GPU] - min_micro_batch_size)) + key=lambda i: abs(space_records[i][0][DS_CONFIG][TRAIN_MICRO_BATCH_SIZE_PER_GPU] - + min_micro_batch_size)) prev_metric_val = space_records[prev_idx][1] else: prev_metric_val = None @@ -1037,8 +963,8 @@ class Autotuner: low = mid + 1 self.update_records(tuning_space_name, exp, metric_val, 1) used_micro_batch_sizes.append(mid) - if prev_metric_val and ((metric_val - prev_metric_val) / - prev_metric_val) < METRIC_PERCENT_DIFF_CONST: + if prev_metric_val and ( + (metric_val - prev_metric_val) / prev_metric_val) < METRIC_PERCENT_DIFF_CONST: logger.info(f"performance plateaus at mbs = {low}") break prev_metric_val = metric_val @@ -1049,9 +975,7 @@ class Autotuner: low 
= mid + 1 max_micro_batch_size = low - 1 - logger.info( - f"min_micro_batch_size = {min_micro_batch_size}, max_micro_batch_size = {max_micro_batch_size}." - ) + logger.info(f"min_micro_batch_size = {min_micro_batch_size}, max_micro_batch_size = {max_micro_batch_size}.") return min_micro_batch_size, max_micro_batch_size @@ -1067,8 +991,7 @@ class Autotuner: gas = int(val) elif isinstance(gas_in_config, list): logger.info( - f"Specifying a list of {GRADIENT_ACCUMULATION_STEPS} to tune is not supported. 1 would be used." - ) + f"Specifying a list of {GRADIENT_ACCUMULATION_STEPS} to tune is not supported. 1 would be used.") assert gas > 0, "Gradient accumulation steps must be positive." return gas @@ -1083,9 +1006,7 @@ class Autotuner: return (user_args[idx + 1]) return None - def get_tuning_micro_batch_size_list(self, - min_micro_batch_size, - max_micro_batch_size, + def get_tuning_micro_batch_size_list(self, min_micro_batch_size, max_micro_batch_size, num_tuning_micro_batch_sizes): """Get a list of micro batch sizes to tune based on min and max values, as well as the size of the list. Args: @@ -1098,17 +1019,16 @@ class Autotuner: """ if min_micro_batch_size <= 0 or max_micro_batch_size <= 0: logger.info( - f"min_micro_batch_size = {min_micro_batch_size}, max_micro_batch_size = {max_micro_batch_size}" - ) + f"min_micro_batch_size = {min_micro_batch_size}, max_micro_batch_size = {max_micro_batch_size}") return [], 0 # NUM_GPUS=$(( ${NUM_WORKERS} * ${NUM_GPUS_PER_WORKER} )) # DP_SIZE=$(( ${NUM_GPUS} / (${PP_SIZE} * ${MP_SIZE}) )) # GRAD_ACC_STEPS=$(( ${TARGET_GLOBAL_BATCH_SIZE} / (${BATCH_SIZE} * ${DP_SIZE}) )) - if self.max_train_batch_size() and self.max_train_batch_size( - ) > 0: # if the user specifies a max_train_batch_size - max_train_batch_size_per_gpu = self.max_train_batch_size() * self.mp_size( - ) // (self.exp_num_gpus * self.exp_num_nodes) + if self.max_train_batch_size( + ) and self.max_train_batch_size() > 0: # if the user specifies a max_train_batch_size + max_train_batch_size_per_gpu = self.max_train_batch_size() * self.mp_size() // (self.exp_num_gpus * + self.exp_num_nodes) else: gas = self.get_gas_from_user_config() max_train_batch_size_per_gpu = max_micro_batch_size * gas // self.mp_size() @@ -1117,15 +1037,14 @@ class Autotuner: min_micro_batch_size = max_micro_batch_size // 2 # constant stride - stride = (max_micro_batch_size - - min_micro_batch_size) // num_tuning_micro_batch_sizes + stride = (max_micro_batch_size - min_micro_batch_size) // num_tuning_micro_batch_sizes if stride == 0: stride = 1 ls = [] min_gas = max_train_batch_size_per_gpu // max_micro_batch_size # if gas is the same as min_gas, do not add mbs to the tuning list for mbs in range(min_micro_batch_size, max_micro_batch_size, stride): - if max_micro_batch_size // mbs != min_gas: + if max_train_batch_size_per_gpu // mbs != min_gas: ls.append(mbs) ls.append(max_micro_batch_size) @@ -1187,8 +1106,6 @@ class Autotuner: result = subprocess.Popen(self.optimal_cmd) result.wait() - logger.info( - f"Done running with the optimal DeepSpeed configuration using {self.optimal_cmd}" - ) + logger.info(f"Done running with the optimal DeepSpeed configuration using {self.optimal_cmd}") else: logger.info(f"No optimal DeepSpeed configuration found by autotuning.") diff --git a/deepspeed/autotuning/config.py b/deepspeed/autotuning/config.py index 6f6b6903efc5a2f06030276979b680fd289e3f1c..6f58fb4e42965be30820809a1c8d6f424b209bd8 100644 --- a/deepspeed/autotuning/config.py +++ b/deepspeed/autotuning/config.py @@ -1,14 +1,14 @@ 
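
The autotuner.py hunk above also corrects the gradient-accumulation filter in get_tuning_micro_batch_size_list(): candidates are spaced by a constant stride between the min and max micro batch sizes, and a candidate is dropped when its implied gradient accumulation steps equal min_gas, with the check now computed as max_train_batch_size_per_gpu // mbs rather than max_micro_batch_size // mbs. A simplified, self-contained sketch of that selection logic (illustrative numbers, not the actual Autotuner code):

def tuning_micro_batch_size_list(min_mbs, max_mbs, max_train_batch_size_per_gpu, num_sizes):
    # Constant stride between candidate micro batch sizes.
    stride = max(1, (max_mbs - min_mbs) // num_sizes)
    # Gradient accumulation steps implied by the largest micro batch size.
    min_gas = max_train_batch_size_per_gpu // max_mbs
    candidates = []
    for mbs in range(min_mbs, max_mbs, stride):
        # Skip sizes whose gradient accumulation steps would equal min_gas.
        if max_train_batch_size_per_gpu // mbs != min_gas:
            candidates.append(mbs)
    candidates.append(max_mbs)
    return candidates

print(tuning_micro_batch_size_list(2, 8, 16, 3))
# -> [2, 4, 8]; mbs = 6 is dropped because 16 // 6 == 16 // 8 == 2
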
-'''Copyright The Microsoft DeepSpeed Team''' -""" -Copyright (c) Microsoft Corporation -Licensed under the MIT license. -""" +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team from deepspeed.runtime.config_utils import get_scalar_param, get_dict_param, DeepSpeedConfigObject from deepspeed.autotuning.constants import * class DeepSpeedAutotuningConfig(DeepSpeedConfigObject): + def __init__(self, param_dict): super(DeepSpeedAutotuningConfig, self).__init__() @@ -31,102 +31,65 @@ class DeepSpeedAutotuningConfig(DeepSpeedConfigObject): self._initialize(autotuning_dict) def _initialize(self, autotuning_dict): - self.enabled = get_scalar_param(autotuning_dict, - AUTOTUNING_ENABLED, - AUTOTUNING_ENABLED_DEFAULT) + self.enabled = get_scalar_param(autotuning_dict, AUTOTUNING_ENABLED, AUTOTUNING_ENABLED_DEFAULT) - self.fast = get_scalar_param(autotuning_dict, - AUTOTUNING_FAST, - AUTOTUNING_FAST_DEFAULT) + self.fast = get_scalar_param(autotuning_dict, AUTOTUNING_FAST, AUTOTUNING_FAST_DEFAULT) - self.results_dir = get_scalar_param(autotuning_dict, - AUTOTUNING_RESULTS_DIR, - AUTOTUNING_RESULTS_DIR_DEFAULT) + self.results_dir = get_scalar_param(autotuning_dict, AUTOTUNING_RESULTS_DIR, AUTOTUNING_RESULTS_DIR_DEFAULT) assert self.results_dir, "results_dir cannot be empty" - self.exps_dir = get_scalar_param(autotuning_dict, - AUTOTUNING_EXPS_DIR, - AUTOTUNING_EXPS_DIR_DEFAULT) + self.exps_dir = get_scalar_param(autotuning_dict, AUTOTUNING_EXPS_DIR, AUTOTUNING_EXPS_DIR_DEFAULT) assert self.exps_dir, "exps_dir cannot be empty" - self.overwrite = get_scalar_param(autotuning_dict, - AUTOTUNING_OVERWRITE, - AUTOTUNING_OVERWRITE_DEFAULT) + self.overwrite = get_scalar_param(autotuning_dict, AUTOTUNING_OVERWRITE, AUTOTUNING_OVERWRITE_DEFAULT) - self.start_profile_step = get_scalar_param( - autotuning_dict, - AUTOTUNING_START_PROFILE_STEP, - AUTOTUNING_START_PROFILE_STEP_DEFAULT) + self.start_profile_step = get_scalar_param(autotuning_dict, AUTOTUNING_START_PROFILE_STEP, + AUTOTUNING_START_PROFILE_STEP_DEFAULT) - self.end_profile_step = get_scalar_param(autotuning_dict, - AUTOTUNING_END_PROFILE_STEP, + self.end_profile_step = get_scalar_param(autotuning_dict, AUTOTUNING_END_PROFILE_STEP, AUTOTUNING_END_PROFILE_STEP_DEFAULT) - self.metric = get_scalar_param(autotuning_dict, - AUTOTUNING_METRIC, - AUTOTUNING_METRIC_DEFAULT) + self.metric = get_scalar_param(autotuning_dict, AUTOTUNING_METRIC, AUTOTUNING_METRIC_DEFAULT) - self.metric_path = get_scalar_param(autotuning_dict, - AUTOTUNING_METRIC_PATH, - AUTOTUNING_METRIC_PATH_DEFAULT) + self.metric_path = get_scalar_param(autotuning_dict, AUTOTUNING_METRIC_PATH, AUTOTUNING_METRIC_PATH_DEFAULT) - self.tuner_type = get_scalar_param(autotuning_dict, - AUTOTUNING_TUNER_TYPE, - AUTOTUNING_TUNER_TYPE_DEFAULT) + self.tuner_type = get_scalar_param(autotuning_dict, AUTOTUNING_TUNER_TYPE, AUTOTUNING_TUNER_TYPE_DEFAULT) - self.tuner_early_stopping = get_scalar_param( - autotuning_dict, - AUTOTUNING_TUNER_EARLY_STOPPING, - AUTOTUNING_TUNER_EARLY_STOPPING_DEFAULT) + self.tuner_early_stopping = get_scalar_param(autotuning_dict, AUTOTUNING_TUNER_EARLY_STOPPING, + AUTOTUNING_TUNER_EARLY_STOPPING_DEFAULT) - self.tuner_num_trials = get_scalar_param(autotuning_dict, - AUTOTUNING_TUNER_NUM_TRIALS, + self.tuner_num_trials = get_scalar_param(autotuning_dict, AUTOTUNING_TUNER_NUM_TRIALS, AUTOTUNING_TUNER_NUM_TRIALS_DEFAULT) - self.arg_mappings = get_dict_param(autotuning_dict, - AUTOTUNING_ARG_MAPPINGS, - AUTOTUNING_ARG_MAPPINGS_DEFAULT) + 
self.arg_mappings = get_dict_param(autotuning_dict, AUTOTUNING_ARG_MAPPINGS, AUTOTUNING_ARG_MAPPINGS_DEFAULT) self.model_info = get_model_info_config(autotuning_dict) - self.model_info_path = get_scalar_param(autotuning_dict, - AUTOTUNING_MODEL_INFO_PATH, + self.model_info_path = get_scalar_param(autotuning_dict, AUTOTUNING_MODEL_INFO_PATH, AUTOTUNING_MODEL_INFO_PATH_DEFAULT) - self.mp_size = get_scalar_param(autotuning_dict, - AUTOTUNING_MP_SIZE, - AUTOTUNING_MP_SIZE_DEFAULT) + self.mp_size = get_scalar_param(autotuning_dict, AUTOTUNING_MP_SIZE, AUTOTUNING_MP_SIZE_DEFAULT) - self.max_train_batch_size = get_dict_param( - autotuning_dict, - AUTOTUNING_MAX_TRAIN_BATCH_SIZE, - AUTOTUNING_MAX_TRAIN_BATCH_SIZE_DEFAULT) + self.max_train_batch_size = get_dict_param(autotuning_dict, AUTOTUNING_MAX_TRAIN_BATCH_SIZE, + AUTOTUNING_MAX_TRAIN_BATCH_SIZE_DEFAULT) - self.min_train_batch_size = get_dict_param( - autotuning_dict, - AUTOTUNING_MIN_TRAIN_BATCH_SIZE, - AUTOTUNING_MIN_TRAIN_BATCH_SIZE_DEFAULT) + self.min_train_batch_size = get_dict_param(autotuning_dict, AUTOTUNING_MIN_TRAIN_BATCH_SIZE, + AUTOTUNING_MIN_TRAIN_BATCH_SIZE_DEFAULT) self.max_train_micro_batch_size_per_gpu = get_dict_param( - autotuning_dict, - AUTOTUNING_MAX_TRAIN_MICRO_BATCH_SIZE_PER_GPU, + autotuning_dict, AUTOTUNING_MAX_TRAIN_MICRO_BATCH_SIZE_PER_GPU, AUTOTUNING_MAX_TRAIN_MICRO_BATCH_SIZE_PER_GPU_DEFAULT) self.min_train_micro_batch_size_per_gpu = get_dict_param( - autotuning_dict, - AUTOTUNING_MIN_TRAIN_MICRO_BATCH_SIZE_PER_GPU, + autotuning_dict, AUTOTUNING_MIN_TRAIN_MICRO_BATCH_SIZE_PER_GPU, AUTOTUNING_MIN_TRAIN_MICRO_BATCH_SIZE_PER_GPU_DEFAULT) - self.num_tuning_micro_batch_sizes = get_dict_param( - autotuning_dict, - AUTOTUNING_NUM_TUNING_MICRO_BATCH_SIZES, - AUTOTUNING_NUM_TUNING_MICRO_BATCH_SIZES_DEFAULT) + self.num_tuning_micro_batch_sizes = get_dict_param(autotuning_dict, AUTOTUNING_NUM_TUNING_MICRO_BATCH_SIZES, + AUTOTUNING_NUM_TUNING_MICRO_BATCH_SIZES_DEFAULT) def get_model_info_config(param_dict): if MODEL_INFO in param_dict and param_dict[MODEL_INFO] is not None: model_info_config = {} for key, default_value in MODEL_INFO_KEY_DEFAULT_DICT.items(): - model_info_config[key] = get_scalar_param(param_dict[MODEL_INFO], - key, - default_value) + model_info_config[key] = get_scalar_param(param_dict[MODEL_INFO], key, default_value) return model_info_config return None diff --git a/deepspeed/autotuning/constants.py b/deepspeed/autotuning/constants.py index d0306bb09bb91e45e28da008e8cbb9ba50532a3b..e6a62e32fde3e5c9a3f01e4d1ffcb32a6ef2ee89 100644 --- a/deepspeed/autotuning/constants.py +++ b/deepspeed/autotuning/constants.py @@ -1,8 +1,7 @@ -'''Copyright The Microsoft DeepSpeed Team''' -""" -Copyright (c) Microsoft Corporation -Licensed under the MIT license. -""" +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team ######################################### # autotunner implementation constants @@ -10,17 +9,13 @@ Licensed under the MIT license. 
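
DeepSpeedAutotuningConfig above reads its fields out of the "autotuning" section of the DeepSpeed config via get_scalar_param/get_dict_param. As a rough sketch of what such a section looks like (key names are assumed to mirror the AUTOTUNING_* constants referenced in the hunk and DeepSpeed's autotuning documentation; values are illustrative only):

# Assumed shape of the "autotuning" section consumed by DeepSpeedAutotuningConfig.
ds_config = {
    "train_micro_batch_size_per_gpu": "auto",   # typically left for the autotuner to fill in
    "autotuning": {
        "enabled": True,                        # AUTOTUNING_ENABLED
        "fast": True,                           # AUTOTUNING_FAST
        "results_dir": "autotuning_results",    # AUTOTUNING_RESULTS_DIR (must not be empty)
        "exps_dir": "autotuning_exps",          # AUTOTUNING_EXPS_DIR (must not be empty)
        "metric": "throughput",                 # AUTOTUNING_METRIC
        "tuner_type": "model_based",            # AUTOTUNING_TUNER_TYPE
        "tuner_num_trials": 50,                 # AUTOTUNING_TUNER_NUM_TRIALS
        "mp_size": 1,                           # AUTOTUNING_MP_SIZE
    },
}
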
import os -DEFAULT_TEMPLATE_PATH_ZERO_0 = os.path.join(os.path.dirname(os.path.realpath(__file__)), - "config_templates", +DEFAULT_TEMPLATE_PATH_ZERO_0 = os.path.join(os.path.dirname(os.path.realpath(__file__)), "config_templates", "template_zero0.json") -DEFAULT_TEMPLATE_PATH_ZERO_1 = os.path.join(os.path.dirname(os.path.realpath(__file__)), - "config_templates", +DEFAULT_TEMPLATE_PATH_ZERO_1 = os.path.join(os.path.dirname(os.path.realpath(__file__)), "config_templates", "template_zero1.json") -DEFAULT_TEMPLATE_PATH_ZERO_2 = os.path.join(os.path.dirname(os.path.realpath(__file__)), - "config_templates", +DEFAULT_TEMPLATE_PATH_ZERO_2 = os.path.join(os.path.dirname(os.path.realpath(__file__)), "config_templates", "template_zero2.json") -DEFAULT_TEMPLATE_PATH_ZERO_3 = os.path.join(os.path.dirname(os.path.realpath(__file__)), - "config_templates", +DEFAULT_TEMPLATE_PATH_ZERO_3 = os.path.join(os.path.dirname(os.path.realpath(__file__)), "config_templates", "template_zero3.json") METRIC_PERCENT_DIFF_CONST = 0.05 @@ -157,50 +152,31 @@ DEFAULT_TUNING_SPACE_ZERO_0 = {"zero_optimization": {"stage": 0}} DEFAULT_TUNING_SPACE_ZERO_1 = { "zero_optimization": { "stage": 1, - "reduce_bucket_size": [5e7, - 5e8, - 1e9], - "allgather_bucket_size": [5e7, - 5e8, - 1e9], + "reduce_bucket_size": [5e7, 5e8, 1e9], + "allgather_bucket_size": [5e7, 5e8, 1e9], } } DEFAULT_TUNING_SPACE_ZERO_2 = { "zero_optimization": { "stage": 2, - "overlap_comm": [True, - False], - "reduce_scatter": [False, - True], - "reduce_bucket_size": [5e7, - 5e8, - 1e9], - "allgather_bucket_size": [5e7, - 5e8, - 1e9], - "contiguous_gradients": [False, - True] + "overlap_comm": [True, False], + "reduce_scatter": [False, True], + "reduce_bucket_size": [5e7, 5e8, 1e9], + "allgather_bucket_size": [5e7, 5e8, 1e9], + "contiguous_gradients": [False, True] }, } DEFAULT_TUNING_SPACE_ZERO_3 = { "zero_optimization": { "stage": 3, - "overlap_comm": [True, - False], - "reduce_scatter": [False, - True], - "reduce_bucket_size": [5e7, - 5e8, - 1e9], - "allgather_partitions": [True, - False], - "allgather_bucket_size": [5e7, - 5e8, - 1e9], - "contiguous_gradients": [False, - True] + "overlap_comm": [True, False], + "reduce_scatter": [False, True], + "reduce_bucket_size": [5e7, 5e8, 1e9], + "allgather_partitions": [True, False], + "allgather_bucket_size": [5e7, 5e8, 1e9], + "contiguous_gradients": [False, True] }, } diff --git a/deepspeed/autotuning/scheduler.py b/deepspeed/autotuning/scheduler.py index 2a4c0c70d95552c54844d00e763eeff338953087..40978aa00ab9487c2302d7b66fd460c41ae44091 100755 --- a/deepspeed/autotuning/scheduler.py +++ b/deepspeed/autotuning/scheduler.py @@ -1,4 +1,7 @@ -'''Copyright The Microsoft DeepSpeed Team''' +# Copyright (c) Microsoft Corporation. 
+# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team import copy @@ -28,13 +31,8 @@ TIMEOUT = 5 class ResourceManager: - def __init__(self, - args, - hosts, - num_gpus_per_node, - results_dir, - exps_dir, - arg_mappings): + + def __init__(self, args, hosts, num_gpus_per_node, results_dir, exps_dir, arg_mappings): self.results_dir = results_dir self.exps_dir = exps_dir @@ -69,13 +67,10 @@ class ResourceManager: exp["exp_id"] = self.experiment_count self.experiment_count += 1 - result_dir = exp["result_dir"] = os.path.join( - self.results_dir, - exp['name']) + result_dir = exp["result_dir"] = os.path.join(self.results_dir, exp['name']) if AUTOTUNING in exp["ds_config"]: metric_file = os.path.join(result_dir, "metrics.json") - exp["ds_config"][AUTOTUNING][ - AUTOTUNING_METRIC_PATH] = metric_file + exp["ds_config"][AUTOTUNING][AUTOTUNING_METRIC_PATH] = metric_file stderr_file = os.path.join(result_dir, "stderr.log") model_info_file = os.path.join(result_dir, "model_info.json") metric_file = os.path.join(result_dir, "metrics.json") @@ -86,11 +81,8 @@ class ResourceManager: err = search_error(stderr_file) exp_id = exp["exp_id"] self.finished_experiments[exp_id] = (exp, err) - if err or os.path.exists(metric_file) or os.path.exists( - model_info_file): - logger.info( - f"Skipping exp {exp['name']} whose result already exists" - ) + if err or os.path.exists(metric_file) or os.path.exists(model_info_file): + logger.info(f"Skipping exp {exp['name']} whose result already exists") continue self.experiment_queue.append(exp) @@ -113,11 +105,7 @@ class ResourceManager: user_args.append(val) user_args.append(str(nval)) - t = threading.Thread(target=run_experiment, - args=(exp, - reservations, - user_script, - user_args)) + t = threading.Thread(target=run_experiment, args=(exp, reservations, user_script, user_args)) t.start() self.running_experiments[exp_id] = (t, exp, reservations, time.time()) @@ -270,6 +258,7 @@ class ResourceManager: class Node: + def __init__(self, host, max_slots): self.host = host self.max_slots = max_slots @@ -284,6 +273,7 @@ class Node: class Reservation: + def __init__(self, node, slots): self.node = node self.slots = slots @@ -389,9 +379,8 @@ def run_experiment(exp: dict, reservations, user_script, user_args): f"Launching exp_id = {exp['exp_id']}, exp_name = {exp['name']}, with resource = {include_str}, and ds_config = {os.path.abspath(ds_config_path)}" ) - with open(os.path.join(exp_dir, "stdout.log"), "wb") as out, open( - os.path.join(exp_dir, "stderr.log"), "wb" - ) as err: + with open(os.path.join(exp_dir, "stdout.log"), "wb") as out, open(os.path.join(exp_dir, "stderr.log"), + "wb") as err: result = subprocess.Popen(cmd, stdout=out, stderr=err) result.wait() out.flush() @@ -401,9 +390,7 @@ def run_experiment(exp: dict, reservations, user_script, user_args): clean_up(exp, reservations) - logger.info( - f"Done running exp_id = {exp['exp_id']}, exp_name = {exp['name']}, with resource = {include_str}" - ) + logger.info(f"Done running exp_id = {exp['exp_id']}, exp_name = {exp['name']}, with resource = {include_str}") PDSH_MAX_FAN_OUT = 1024 @@ -417,8 +404,7 @@ def clean_up(exp: dict, reservations): for reservation in reservations: nodes_str += f"{reservation.node.host}," nodes_str = nodes_str[:-1] - logger.debug( - f"Cleaning up exp_id = {exp['exp_id']} on the following workers: {nodes_str}") + logger.debug(f"Cleaning up exp_id = {exp['exp_id']} on the following workers: {nodes_str}") # PDSH flags for max node fan out and specific hosts to launch on # See 
https://linux.die.net/man/1/pdsh for flag details @@ -441,6 +427,4 @@ def clean_up(exp: dict, reservations): if result.returncode > 0: sys.exit(result.returncode) - logger.info( - f"Done cleaning up exp_id = {exp['exp_id']} on the following workers: {nodes_str}" - ) + logger.info(f"Done cleaning up exp_id = {exp['exp_id']} on the following workers: {nodes_str}") diff --git a/deepspeed/autotuning/tuner/__init__.py b/deepspeed/autotuning/tuner/__init__.py index 9f2e5675d9fd8a124bf81014152ee6bc6fa386f4..676ae429e07745d3ad24051a2610e57ac42601f0 100755 --- a/deepspeed/autotuning/tuner/__init__.py +++ b/deepspeed/autotuning/tuner/__init__.py @@ -1,4 +1,7 @@ -'''Copyright The Microsoft DeepSpeed Team''' +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team from .index_based_tuner import RandomTuner, GridSearchTuner # from .ga_tuner import GATuner diff --git a/deepspeed/autotuning/tuner/base_tuner.py b/deepspeed/autotuning/tuner/base_tuner.py index 89eb2d384c94e3dc851bdb5a1e6ef08a3d21ac88..3ac7389810fc57124ada208bbe9f53886d55c6a3 100755 --- a/deepspeed/autotuning/tuner/base_tuner.py +++ b/deepspeed/autotuning/tuner/base_tuner.py @@ -1,4 +1,7 @@ -'''Copyright The Microsoft DeepSpeed Team''' +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team import sys @@ -8,6 +11,7 @@ from deepspeed.utils import logger class BaseTuner: + def __init__(self, exps, resource_manager, metric): self.all_exps = exps self.rm = resource_manager @@ -42,8 +46,8 @@ class BaseTuner: self.rm.schedule_experiments(exp_paths) self.rm.run() exp, metric_val = self.rm.parse_results(self.metric) - if self.best_exp == None or self.best_metric_val == None or ( - metric_val and metric_val > self.best_metric_val): + if self.best_exp == None or self.best_metric_val == None or (metric_val + and metric_val > self.best_metric_val): # logger.info(f"tuner finds better = {exp}") self.best_exp = exp self.best_metric_val = metric_val diff --git a/deepspeed/autotuning/tuner/cost_model.py b/deepspeed/autotuning/tuner/cost_model.py index 858ab6d3d1ea29cc0da63eb2671c69ee47dff909..c12b10f743632c36a61711a411e8bb706041b762 100755 --- a/deepspeed/autotuning/tuner/cost_model.py +++ b/deepspeed/autotuning/tuner/cost_model.py @@ -1,4 +1,7 @@ -'''Copyright The Microsoft DeepSpeed Team''' +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team from .utils import * @@ -9,6 +12,7 @@ except ImportError: class XGBoostCostModel(): + def __init__(self, loss_type, num_threads=None, log_interval=25, upper_model=None): assert xgb is not None, "missing requirements, please install deepspeed w. 'autotuning_ml' extra." diff --git a/deepspeed/autotuning/tuner/index_based_tuner.py b/deepspeed/autotuning/tuner/index_based_tuner.py index f19694871c538f985982c8fc141c3853285a1b26..d3c822be0d35ff68a42355322797cc4b0c8c1429 100755 --- a/deepspeed/autotuning/tuner/index_based_tuner.py +++ b/deepspeed/autotuning/tuner/index_based_tuner.py @@ -1,4 +1,7 @@ -'''Copyright The Microsoft DeepSpeed Team''' +# Copyright (c) Microsoft Corporation. 
+# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team import random @@ -7,6 +10,7 @@ from .base_tuner import BaseTuner class RandomTuner(BaseTuner): """Explore the search space in random order""" + def __init__(self, exps: list, resource_manager, metric): super().__init__(exps, resource_manager, metric) @@ -22,6 +26,7 @@ class RandomTuner(BaseTuner): class GridSearchTuner(BaseTuner): """Explore the search space in sequential order""" + def __init__(self, exps: list, resource_manager, metric): super().__init__(exps, resource_manager, metric) diff --git a/deepspeed/autotuning/tuner/model_based_tuner.py b/deepspeed/autotuning/tuner/model_based_tuner.py index ec475005abe007ffe5918a46edcca697e0f3da87..23f224b5eba20fe20f0268495a819cfcf2027ecb 100755 --- a/deepspeed/autotuning/tuner/model_based_tuner.py +++ b/deepspeed/autotuning/tuner/model_based_tuner.py @@ -1,4 +1,7 @@ -'''Copyright The Microsoft DeepSpeed Team''' +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team import hjson @@ -15,6 +18,7 @@ INIT_NUM = 2 class ModelBasedTuner(BaseTuner): """Exploring the search space with a cost model""" + def __init__(self, exps: list, resource_manager, metric, tuning_sapce): super().__init__(exps, resource_manager, metric) self.tuning_space = tuning_sapce @@ -25,8 +29,7 @@ class ModelBasedTuner(BaseTuner): self.dims = dict_to_dims(self.tuning_space) - logger.info( - f"Create config dim: {self.dims}, all configs: {self.num_all_configs}") + logger.info(f"Create config dim: {self.dims}, all configs: {self.num_all_configs}") self.visited = set([]) @@ -71,9 +74,7 @@ class ModelBasedTuner(BaseTuner): n = len(estimates) top_idx = np.argsort(estimates) - top_idx_ret = top_idx if self.metric == AUTOTUNING_METRIC_LATENCY else top_idx[:: - -1][: - n] + top_idx_ret = top_idx if self.metric == AUTOTUNING_METRIC_LATENCY else top_idx[::-1][:n] # top_configs = [self.all_configs[i] for i in top_idx] @@ -145,9 +146,7 @@ class ModelBasedTuner(BaseTuner): self.evaluated_configs.append(feature_val) self.evaluated_perf.append(curr_iter) - logger.debug( - f"**Evaluated configs: {len(self.evaluated_configs)}, evaluated perf: {self.evaluated_perf}" - ) + logger.debug(f"**Evaluated configs: {len(self.evaluated_configs)}, evaluated perf: {self.evaluated_perf}") self.cost_model.fit(self.evaluated_configs, self.evaluated_perf) diff --git a/deepspeed/autotuning/tuner/utils.py b/deepspeed/autotuning/tuner/utils.py index f6719f0b37d2d6e9a7164c7dad0d8e717c10da76..ada643f2c02ce9b0433c89295e932f9f49740eac 100755 --- a/deepspeed/autotuning/tuner/utils.py +++ b/deepspeed/autotuning/tuner/utils.py @@ -1,4 +1,7 @@ -'''Copyright The Microsoft DeepSpeed Team''' +# Copyright (c) Microsoft Corporation. 
+# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team import numpy as np import itertools @@ -44,9 +47,7 @@ def gen_combinations(d: dict): for v in values: if not isinstance(v, list): v = [v] - values_choices = (gen_combinations(v) if isinstance(v, - dict) else get_list(v) - for v in values) + values_choices = (gen_combinations(v) if isinstance(v, dict) else get_list(v) for v in values) for comb in itertools.product(*values_choices): yield dict(zip(keys, comb)) diff --git a/deepspeed/autotuning/utils.py b/deepspeed/autotuning/utils.py index 40da7ec9613ac486b6d37e7270502bd570e246a3..dec13ca7f6212abb2c6fe12a380ce9e533176c60 100644 --- a/deepspeed/autotuning/utils.py +++ b/deepspeed/autotuning/utils.py @@ -1,4 +1,7 @@ -'''Copyright The Microsoft DeepSpeed Team''' +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team import re import collections.abc @@ -176,6 +179,7 @@ def fetch_hostfile(hostfile_path): def validate_ds_config(config: dict): + def is_False(config: dict, key): if config is None: return False @@ -189,9 +193,7 @@ def validate_ds_config(config: dict): if stage == 1: return True elif stage == 2: - if is_False(config_zero, - "cpu_offload") and is_False(config_zero, - "cpu_offload_params"): + if is_False(config_zero, "cpu_offload") and is_False(config_zero, "cpu_offload_params"): return False elif stage == 3: offload_devices = ["cpu", "nvme"] @@ -289,14 +291,13 @@ def get_all_configs(tuning_space: dict, ignore_keys=None): Args: tuning_space (dict): the tuning space where tunable parameters are lists of values. """ + def gen_combinations(d: dict): keys, values = d.keys(), d.values() for v in values: if not isinstance(v, list): v = [v] - values_choices = (gen_combinations(v) if isinstance(v, - dict) else get_list(v) - for v in values) + values_choices = (gen_combinations(v) if isinstance(v, dict) else get_list(v) for v in values) for comb in itertools.product(*values_choices): yield dict(zip(keys, comb)) diff --git a/deepspeed/checkpoint/__init__.py b/deepspeed/checkpoint/__init__.py index 2ef0817c724eee9ca827e710b7b7e737cae74324..c9822693867d9e120e6d52064aea7e30fb169956 100644 --- a/deepspeed/checkpoint/__init__.py +++ b/deepspeed/checkpoint/__init__.py @@ -1,12 +1,13 @@ -'''Copyright The Microsoft DeepSpeed Team''' +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team from .reshape_meg_2d import reshape_meg_2d_parallel from .deepspeed_checkpoint import DeepSpeedCheckpoint -from .utils import (get_layer_ckpt_name_for_rank, - get_model_ckpt_name_for_rank, - get_zero_ckpt_name_for_rank) +from .utils import (get_layer_ckpt_name_for_rank, get_model_ckpt_name_for_rank, get_zero_ckpt_name_for_rank) from .reshape_utils import (merge_state) diff --git a/deepspeed/checkpoint/constants.py b/deepspeed/checkpoint/constants.py index 787398486ca2132c89658e44847ee9a3e9dd472e..af6febad1aff651a16c13ebc7d66c45d9f9d7095 100644 --- a/deepspeed/checkpoint/constants.py +++ b/deepspeed/checkpoint/constants.py @@ -1,7 +1,10 @@ -'''Copyright The Microsoft DeepSpeed Team''' -''' - Various symbolic constants used for model checkpointing -''' +# Copyright (c) Microsoft Corporation. 
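The reflowed gen_combinations helper above expands a nested dict of tunable values into every concrete configuration via itertools.product. A minimal standalone sketch of the same idea (illustrative only; the sample tuning space below is hypothetical, not a DeepSpeed default):

import itertools

def gen_combinations(d: dict):
    # Expand a dict whose values are candidate lists (or nested dicts of them)
    # into every concrete combination, mirroring the reformatted helper above.
    keys, values = d.keys(), d.values()
    values_choices = (gen_combinations(v) if isinstance(v, dict) else (v if isinstance(v, list) else [v])
                      for v in values)
    for comb in itertools.product(*values_choices):
        yield dict(zip(keys, comb))

space = {"train_micro_batch_size_per_gpu": [4, 8], "zero_optimization": {"stage": [1, 2]}}
print(list(gen_combinations(space)))
# four configs: micro-batch 4/8 crossed with ZeRO stage 1/2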
+# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team +""" +Various symbolic constants used for model checkpointing +""" ######################################### # Optimizer checkpoint keys @@ -24,6 +27,8 @@ FP32_WEIGHT_KEY = "fp32" PARAM = 'param' PARAM_SHAPES = 'param_shapes' BUFFER_NAMES = 'buffer_names' +FROZEN_PARAM_SHAPES = 'frozen_param_shapes' +FROZEN_PARAM_FRAGMENTS = 'frozen_param_fragments' ######################################### # Checkpoint naming constants diff --git a/deepspeed/checkpoint/deepspeed_checkpoint.py b/deepspeed/checkpoint/deepspeed_checkpoint.py index c1a31b0fb4a5970ad500558cbbbcbe4432af1337..ef36b0c5ef3f2bc6f77cd68d16f2e6bad6b5e556 100644 --- a/deepspeed/checkpoint/deepspeed_checkpoint.py +++ b/deepspeed/checkpoint/deepspeed_checkpoint.py @@ -1,15 +1,14 @@ -'''Copyright The Microsoft DeepSpeed Team''' +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team import os from typing import Dict import torch from .reshape_3d_utils import model_3d_desc -from .reshape_utils import (basic_folder_validation, - merge_state, - partition_data, - get_files, - get_files_with_prefix) +from .reshape_utils import (basic_folder_validation, merge_state, partition_data, get_files, get_files_with_prefix) from .constants import (MODEL_FILE_PREFIX, LAYER_FILE_PREFIX) @@ -24,19 +23,15 @@ CHECKPOINT_INFO_KEY = 'checkpoint_info' ITERATION_KEY = 'iteration' SEQUENTIAL_LAYERS = [ - 'input_layernorm.weight', - 'input_layernorm.bias', - 'self_attention.dense.bias', - 'post_attention_layernorm.weight', - 'post_attention_layernorm.bias', - 'mlp.dense_4h_to_h.bias', - 'position_embeddings.weight' + 'input_layernorm.weight', 'input_layernorm.bias', 'self_attention.dense.bias', 'post_attention_layernorm.weight', + 'post_attention_layernorm.bias', 'mlp.dense_4h_to_h.bias', 'position_embeddings.weight' ] LAYER_CONCAT_DIM = {'self_attention.dense.weight': 1, 'mlp.dense_4h_to_h.weight': 1} class DeepSpeedCheckpoint(object): + def __init__(self, dir, tp_degree=None, pp_degree=None, dp_degree=None): self.dir = dir self._validate_folder(dir) @@ -50,33 +45,24 @@ class DeepSpeedCheckpoint(object): self.layer_keys = self._get_layer_keys() self.layer_count = len(self.layer_keys) - self.tp_degree = self.zero_checkpoint.get_src_tp_degree( - ) if tp_degree is None else tp_degree - self.pp_degree = self.zero_checkpoint.get_src_pp_degree( - ) if pp_degree is None else pp_degree - self.dp_degree = self.zero_checkpoint.get_src_dp_degree( - ) if dp_degree is None else dp_degree + self.tp_degree = self.zero_checkpoint.get_src_tp_degree() if tp_degree is None else tp_degree + self.pp_degree = self.zero_checkpoint.get_src_pp_degree() if pp_degree is None else pp_degree + self.dp_degree = self.zero_checkpoint.get_src_dp_degree() if dp_degree is None else dp_degree - self.original_world_size = self.zero_checkpoint.get_src_tp_degree( - ) * self.zero_checkpoint.get_src_pp_degree( + self.original_world_size = self.zero_checkpoint.get_src_tp_degree() * self.zero_checkpoint.get_src_pp_degree( ) * self.zero_checkpoint.get_src_dp_degree() self.world_size = self.tp_degree * self.pp_degree * self.dp_degree self.old_2d_map = meg_2d_parallel_map(self.zero_checkpoint.get_src_pp_degree(), self.zero_checkpoint.get_src_tp_degree()) self.old_2d_map.simple_init() - self.new_2d_map = reshape_meg_2d_parallel( - old_pp_degree=self.zero_checkpoint.get_src_pp_degree(), - old_tp_degree=self.zero_checkpoint.get_src_tp_degree(), - new_pp_degree=self.pp_degree, - new_tp_degree=self.tp_degree) - 
- if self.is_change_pp_degree() or self.is_change_tp_degree( - ) or self.is_change_dp_degree(): - self.zero_checkpoint.reshape( - model_3d_desc(self.pp_degree, - self.tp_degree, - self.dp_degree)) + self.new_2d_map = reshape_meg_2d_parallel(old_pp_degree=self.zero_checkpoint.get_src_pp_degree(), + old_tp_degree=self.zero_checkpoint.get_src_tp_degree(), + new_pp_degree=self.pp_degree, + new_tp_degree=self.tp_degree) + + if self.is_change_pp_degree() or self.is_change_tp_degree() or self.is_change_dp_degree(): + self.zero_checkpoint.reshape(model_3d_desc(self.pp_degree, self.tp_degree, self.dp_degree)) self.global_state = {} @@ -84,8 +70,7 @@ class DeepSpeedCheckpoint(object): self.pp_to_transformer_map = self._build_pp_transformer_map() self.transformer_file_map = self._build_transformer_file_map() self.tp_to_embedding_map = self._build_tp_other_layer_map(EMBEDDING_LAYER_INDEX) - self.tp_to_final_norm_map = self._build_tp_other_layer_map( - FINAL_LAYER_NORM_INDEX) + self.tp_to_final_norm_map = self._build_tp_other_layer_map(FINAL_LAYER_NORM_INDEX) self._build_global_state() def is_change_tp_degree(self): @@ -131,9 +116,7 @@ class DeepSpeedCheckpoint(object): keys_to_ignore=[PARAM_SHAPES]) def get_zero_files(self, pp_index, tp_index, dp_index) -> list: - return self.zero_checkpoint.get_files_for_rank(pp_index=pp_index, - tp_index=tp_index, - dp_index=dp_index) + return self.zero_checkpoint.get_files_for_rank(pp_index=pp_index, tp_index=tp_index, dp_index=dp_index) def get_embedding_layer_id(self): return self.layer_keys[EMBEDDING_LAYER_INDEX] @@ -150,11 +133,7 @@ class DeepSpeedCheckpoint(object): def get_embedding_state(self, tp_index: int) -> Dict: assert tp_index in self.tp_to_embedding_map.keys() - sd_list = [ - torch.load(fname, - map_location=torch.device('cpu')) - for fname in self.tp_to_embedding_map[tp_index] - ] + sd_list = [torch.load(fname, map_location=torch.device('cpu')) for fname in self.tp_to_embedding_map[tp_index]] sd = self._merge_state_dicts(sd_list) return sd @@ -179,10 +158,7 @@ class DeepSpeedCheckpoint(object): assert tp_index < self.tp_degree assert pp_index < self.pp_degree fname_list = self.get_2d_parallel_files(tp_index=tp_index, pp_index=pp_index) - sd_list = [ - torch.load(fname, - map_location=torch.device('cpu')) for fname in fname_list - ] + sd_list = [torch.load(fname, map_location=torch.device('cpu')) for fname in fname_list] merged_sd = None for sd in sd_list: @@ -198,10 +174,7 @@ class DeepSpeedCheckpoint(object): assert pp_index < self.pp_degree t_list = [] for fname_list in self.transformer_file_map[(tp_index, pp_index)]: - sd_list = [ - torch.load(fname, - map_location=torch.device('cpu')) for fname in fname_list - ] + sd_list = [torch.load(fname, map_location=torch.device('cpu')) for fname in fname_list] sd = self._merge_state_dicts(sd_list) t_list.append(sd) return t_list @@ -212,8 +185,7 @@ class DeepSpeedCheckpoint(object): def get_final_norm_state(self, tp_index: int) -> Dict: assert tp_index in self.tp_to_final_norm_map.keys() - sd = torch.load(self.tp_to_final_norm_map[tp_index][0], - map_location=torch.device('cpu')) + sd = torch.load(self.tp_to_final_norm_map[tp_index][0], map_location=torch.device('cpu')) return sd def get_final_norm_files(self, tp_index: int) -> list: @@ -222,8 +194,7 @@ class DeepSpeedCheckpoint(object): def _build_tp_other_layer_map(self, layer_index: int): assert layer_index < len(self.layer_files) - layer_files = get_files_with_prefix(self.layer_files, - self.layer_keys[layer_index]) + layer_files = 
get_files_with_prefix(self.layer_files, self.layer_keys[layer_index]) layer_file_partitions = partition_data(layer_files, self.tp_degree) data_map = {i: flist for i, flist in enumerate(layer_file_partitions)} return data_map @@ -238,11 +209,7 @@ class DeepSpeedCheckpoint(object): data_map = {} transformer_layers = self.layer_keys[1:-1] layers_per_pp = len(transformer_layers) // self.pp_degree - data_map = { - i: transformer_layers[i * layers_per_pp:(i + 1) * layers_per_pp] - for i in range(0, - self.pp_degree) - } + data_map = {i: transformer_layers[i * layers_per_pp:(i + 1) * layers_per_pp] for i in range(0, self.pp_degree)} return data_map def _dump_mapping(self, data_map, map_tag=None): @@ -308,10 +275,8 @@ class DeepSpeedCheckpoint(object): file_list = get_files(dir) - for file_prefix in [ - MODEL_FILE_PREFIX, - LAYER_FILE_PREFIX, - f'{LAYER_FILE_PREFIX}01' - ]: + for file_prefix in [MODEL_FILE_PREFIX, LAYER_FILE_PREFIX, f'{LAYER_FILE_PREFIX}01']: ckpt_files = get_files_with_prefix(file_list, file_prefix) - assert len(ckpt_files) > 0, f'{dir} seems a bogus DeepSpeed checkpoint folder: Cannot find {file_prefix}* files in there.' + assert len( + ckpt_files + ) > 0, f'{dir} seems a bogus DeepSpeed checkpoint folder: Cannot find {file_prefix}* files in there.' diff --git a/deepspeed/checkpoint/reshape_3d_utils.py b/deepspeed/checkpoint/reshape_3d_utils.py index 15faffb2a680bb310c8a31986977d4cc85a0af95..b5bf41e2d1604e71bbd9b383c03bc59ef73a0cec 100644 --- a/deepspeed/checkpoint/reshape_3d_utils.py +++ b/deepspeed/checkpoint/reshape_3d_utils.py @@ -1,9 +1,9 @@ -'''Copyright The Microsoft DeepSpeed Team''' +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 -from .reshape_utils import (get_files, - get_files_with_prefix, - partition_data, - get_zero_files) +# DeepSpeed Team + +from .reshape_utils import (get_files, get_files_with_prefix, partition_data, get_zero_files) from .constants import (MODEL_FILE_PREFIX, LAYER_FILE_PREFIX) @@ -15,6 +15,7 @@ DP_DIM = 'DP' class model_3d_desc(object): + def __init__(self, pp_degree=1, tp_degree=1, dp_degree=1): self.pp_degree = pp_degree self.tp_degree = tp_degree @@ -33,8 +34,7 @@ class model_3d_desc(object): src_2d_size=self.pp_degree * self.tp_degree, dp_degree=self.dp_degree) - return unflatten_dp_dimension(meg_2d_map=flat_3d_map, - dp_degree=target_3d_desc.dp_degree) + return unflatten_dp_dimension(meg_2d_map=flat_3d_map, dp_degree=target_3d_desc.dp_degree) def get_desc(self): return f'{PP_DIM},{TP_DIM},{DP_DIM} = ({self.pp_degree}, {self.tp_degree}, {self.dp_degree})' @@ -45,14 +45,11 @@ class model_3d_desc(object): def is_valid(self, pp_index, tp_index, dp_index): err_msg = [] valid = True - for index, degree, dim_name in [ - (pp_index, self.pp_degree, PP_DIM), - (tp_index, self.tp_degree, TP_DIM), - (dp_index, self.dp_degree, DP_DIM)]: + for index, degree, dim_name in [(pp_index, self.pp_degree, PP_DIM), (tp_index, self.tp_degree, TP_DIM), + (dp_index, self.dp_degree, DP_DIM)]: if index >= degree: valid = False - err_msg.append( - f'{dim_name} indexing error: index {index} >= degree {degree}') + err_msg.append(f'{dim_name} indexing error: index {index} >= degree {degree}') return valid, err_msg @@ -60,18 +57,15 @@ class model_3d_desc(object): err_msg = [] if target_3d_desc.pp_degree > self.pp_degree: err_msg.append( - f'Expansion reshape not supported - {PP_DIM}: {self.pp_degree} ---> {target_3d_desc.pp_degree}' - ) + f'Expansion reshape not supported - {PP_DIM}: {self.pp_degree} ---> {target_3d_desc.pp_degree}') if 
target_3d_desc.tp_degree > self.tp_degree: err_msg.append( - f'Expansion reshape not supported - {TP_DIM}: {self.tp_degree} ---> {target_3d_desc.tp_degree}' - ) + f'Expansion reshape not supported - {TP_DIM}: {self.tp_degree} ---> {target_3d_desc.tp_degree}') if target_3d_desc.dp_degree > self.dp_degree: err_msg.append( - f'Expansion reshape not supported - {DP_DIM}: {self.dp_degree} ---> {target_3d_desc.dp_degree}' - ) + f'Expansion reshape not supported - {DP_DIM}: {self.dp_degree} ---> {target_3d_desc.dp_degree}') return len(err_msg) == 0, err_msg @@ -106,10 +100,7 @@ def flatten_dp_dimension(meg_2d_map, src_2d_size, dp_degree): def unflatten_dp_dimension(meg_2d_map, dp_degree): pp_degree = meg_2d_map.pp_degree tp_degree = meg_2d_map.tp_degree - meg_2d_map_list = [ - meg_2d_parallel_map(pp_degree=pp_degree, - tp_degree=tp_degree) for _ in range(dp_degree) - ] + meg_2d_map_list = [meg_2d_parallel_map(pp_degree=pp_degree, tp_degree=tp_degree) for _ in range(dp_degree)] for pp_index in range(pp_degree): for tp_index in range(tp_degree): flat_dp_indices = meg_2d_map.get_data(pp_index, tp_index) diff --git a/deepspeed/checkpoint/reshape_meg_2d.py b/deepspeed/checkpoint/reshape_meg_2d.py index 4800b55583b83b6d15ab085c9e921d25b653d421..3bff87f4344f4a1e74b348ae77f9b9a7d9212c6e 100644 --- a/deepspeed/checkpoint/reshape_meg_2d.py +++ b/deepspeed/checkpoint/reshape_meg_2d.py @@ -1,9 +1,13 @@ -'''Copyright The Microsoft DeepSpeed Team''' +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team from .reshape_utils import partition_data class meg_2d_parallel_map(object): + def __init__(self, pp_degree, tp_degree): self.pp_degree = pp_degree self.tp_degree = tp_degree @@ -11,8 +15,7 @@ class meg_2d_parallel_map(object): def simple_init(self): self.map = { - self._make_key(i // self.tp_degree, - i % self.tp_degree): [i] + self._make_key(i // self.tp_degree, i % self.tp_degree): [i] for i in range(self.pp_degree * self.tp_degree) } @@ -74,11 +77,7 @@ def _reshape_pp_dimension(old_2d_map, new_pp_degree): return new_2d_map -def reshape_meg_2d_parallel(old_pp_degree, - old_tp_degree, - new_pp_degree, - new_tp_degree, - verbose=False): +def reshape_meg_2d_parallel(old_pp_degree, old_tp_degree, new_pp_degree, new_tp_degree, verbose=False): assert new_pp_degree <= old_pp_degree assert new_tp_degree <= old_tp_degree @@ -137,8 +136,7 @@ def get_mpu_ranks(tp_size=1, pp_size=1, dp_size=1, virtual_pp_size=None): tensor_model_parallel_size = min(tp_size, world_size) pipeline_model_parallel_size = min(pp_size, world_size) - data_parallel_size = world_size // (tensor_model_parallel_size * - pipeline_model_parallel_size) + data_parallel_size = world_size // (tensor_model_parallel_size * pipeline_model_parallel_size) num_tensor_model_parallel_groups = world_size // tensor_model_parallel_size num_pipeline_model_parallel_groups = world_size // pipeline_model_parallel_size @@ -158,10 +156,7 @@ def get_mpu_ranks(tp_size=1, pp_size=1, dp_size=1, virtual_pp_size=None): # Build the model-parallel groups. all_pp_group_ranks = [] for i in range(data_parallel_size): - ranks = [ - data_parallel_group_ranks[i] - for data_parallel_group_ranks in all_dp_group_ranks - ] + ranks = [data_parallel_group_ranks[i] for data_parallel_group_ranks in all_dp_group_ranks] all_pp_group_ranks.append(list(ranks)) print(f"PP", all_pp_group_ranks) @@ -169,8 +164,7 @@ def get_mpu_ranks(tp_size=1, pp_size=1, dp_size=1, virtual_pp_size=None): # Build the tensor model-parallel groups. 
all_tp_group_ranks = [] for i in range(num_tensor_model_parallel_groups): - ranks = range(i * tensor_model_parallel_size, - (i + 1) * tensor_model_parallel_size) + ranks = range(i * tensor_model_parallel_size, (i + 1) * tensor_model_parallel_size) all_tp_group_ranks.append(list(ranks)) print(f"TP", all_tp_group_ranks) diff --git a/deepspeed/checkpoint/reshape_utils.py b/deepspeed/checkpoint/reshape_utils.py index 4d6b272f43e3ef202334eef8fa46960fc5b7cf23..15b6ce28b2fd35770cf574c311eea370a146a395 100644 --- a/deepspeed/checkpoint/reshape_utils.py +++ b/deepspeed/checkpoint/reshape_utils.py @@ -1,4 +1,7 @@ -'''Copyright The Microsoft DeepSpeed Team''' +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team import os import torch @@ -49,11 +52,7 @@ def partition_data(data_list, num_partitions): num_elems = len(data_list) assert num_elems % num_partitions == 0 partition_size = num_elems // num_partitions - partitions_list = [ - data_list[i:i + partition_size] for i in range(0, - num_elems, - partition_size) - ] + partitions_list = [data_list[i:i + partition_size] for i in range(0, num_elems, partition_size)] return partitions_list @@ -76,9 +75,7 @@ def merge_state_dict(dict_a, dict_b, key_list): def merge_state_list(list_a, list_b, key_list): if len(list_a) != len(list_b): print(f'{_key_list_to_string(key_list)}') - raise ValueError( - f'Cannot merge lists of different lengths, a = {len(list_a)} b = {len(list_b)}' - ) + raise ValueError(f'Cannot merge lists of different lengths, a = {len(list_a)} b = {len(list_b)}') return [merge_state(a, b, key_list) for a, b in zip(list_a, list_b)] @@ -87,8 +84,7 @@ def merge_state(state_a, state_b, key_list=[]): if type(state_a) != type(state_b): key_list_string = _key_list_to_string(key_list) print(f'key_list = {key_list_string}') - raise ValueError( - f'Cannot merge two states of types {type(state_a)} and type {type(state_b)}') + raise ValueError(f'Cannot merge two states of types {type(state_a)} and type {type(state_b)}') if type(state_a) in (dict, OrderedDict): return merge_state_dict(state_a, state_b, key_list) diff --git a/deepspeed/checkpoint/universal_checkpoint.py b/deepspeed/checkpoint/universal_checkpoint.py index b58de4871031f0a26a6a88c332015d14fb85d5b9..cd4d7d51a4c2755e0a8ec56ef96695027774c4c1 100644 --- a/deepspeed/checkpoint/universal_checkpoint.py +++ b/deepspeed/checkpoint/universal_checkpoint.py @@ -1,13 +1,12 @@ -""" -Copyright 2022 The Microsoft DeepSpeed Team -""" +# Copyright (c) Microsoft Corporation. 
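The partition_data helper reflowed above is what assigns checkpoint shard files to ranks, e.g. splitting a layer's files across tensor-parallel ranks in _build_tp_other_layer_map. A quick illustration of its behavior under the even-divisibility precondition it asserts (the file names here are made up):

def partition_data(data_list, num_partitions):
    # Split a list into num_partitions equally sized contiguous chunks,
    # as in the reformatted reshape_utils helper above.
    assert len(data_list) % num_partitions == 0
    partition_size = len(data_list) // num_partitions
    return [data_list[i:i + partition_size] for i in range(0, len(data_list), partition_size)]

files = ["layer_01-model_00.pt", "layer_01-model_01.pt", "layer_01-model_02.pt", "layer_01-model_03.pt"]
print(partition_data(files, 2))
# [['layer_01-model_00.pt', 'layer_01-model_01.pt'], ['layer_01-model_02.pt', 'layer_01-model_03.pt']]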
+# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + import os import torch import types -from .constants import (FP32_WEIGHT_KEY, - PARAM, - VOCAB_DIVISIBILITY_PADDING_TENSOR, - CAT_DIM) +from .constants import (FP32_WEIGHT_KEY, PARAM, VOCAB_DIVISIBILITY_PADDING_TENSOR, CAT_DIM) def load_hp_checkpoint_state(self, folder, tp_rank, tp_world_size): @@ -44,9 +43,7 @@ def load_hp_checkpoint_state(self, folder, tp_rank, tp_world_size): # the converter to universal currently strips the original padding completely so the saved # weight is padding-free and we just need to add new padding depending on the target TP # degree - vocab_divisibility_padding_tensor = ckpt_dict.get( - VOCAB_DIVISIBILITY_PADDING_TENSOR, - None) + vocab_divisibility_padding_tensor = ckpt_dict.get(VOCAB_DIVISIBILITY_PADDING_TENSOR, None) if vocab_divisibility_padding_tensor is not None: # In the absence of data passed from the user wrt new padded vocab specific to tp degree # we can again derive that data by reverse engineering the target shapes like so: @@ -56,13 +53,7 @@ def load_hp_checkpoint_state(self, folder, tp_rank, tp_world_size): padding_size = padded_target_vocab_size - full_hp_param.shape[0] # Implement the following concat in efficient way using pad #full_hp_param = torch.cat((full_hp_param, padding_tensor), 0) - full_hp_param = torch.nn.functional.pad(full_hp_param, - (0, - 0, - 0, - padding_size), - "constant", - 0) + full_hp_param = torch.nn.functional.pad(full_hp_param, (0, 0, 0, padding_size), "constant", 0) full_hp_param[:-padding_size, :] = vocab_divisibility_padding_tensor else: # Need to shrink or keep the same @@ -76,8 +67,7 @@ def load_hp_checkpoint_state(self, folder, tp_rank, tp_world_size): assert full_param_numel == tp_world_size * tp_slice_numel, \ f'Loading {ckpt_file} full param numel {full_param_numel} != tensor slice numel {tp_slice_numel} * tp_world_size {tp_world_size}' - dst_tensor = hp_mapping.hp_fragment if key == FP32_WEIGHT_KEY else hp_mapping.get_optim_state_fragment( - key) + dst_tensor = hp_mapping.hp_fragment if key == FP32_WEIGHT_KEY else hp_mapping.get_optim_state_fragment(key) # print(f"{full_hp_param.shape=} {full_param_numel=} {folder=}") # print(f"{dst_tensor.shape=} {dst_tensor.numel()=}{folder=}") @@ -90,9 +80,7 @@ def load_hp_checkpoint_state(self, folder, tp_rank, tp_world_size): tp_hp_slice = tp_hp_slice.flatten() lp_frag_address = hp_mapping.lp_fragment_address - tp_hp_fragment = tp_hp_slice.narrow(0, - lp_frag_address.start, - lp_frag_address.numel) + tp_hp_fragment = tp_hp_slice.narrow(0, lp_frag_address.start, lp_frag_address.numel) assert dst_tensor.numel() == lp_frag_address.numel, \ f'Load checkpoint {key} dst_tensor numel {dst_tensor.numel()} != src numel {lp_frag_address.numel}' @@ -104,5 +92,4 @@ def load_hp_checkpoint_state(self, folder, tp_rank, tp_world_size): def enable_universal_checkpoint(param_list): for param in param_list: - param.load_hp_checkpoint_state = types.MethodType(load_hp_checkpoint_state, - param) + param.load_hp_checkpoint_state = types.MethodType(load_hp_checkpoint_state, param) diff --git a/deepspeed/checkpoint/utils.py b/deepspeed/checkpoint/utils.py index 4e96d073067de72e7bbf21c50a2bc7daf9b8cfd4..3707aa0eb4195f64288625ab97ce14bdf086fae2 100644 --- a/deepspeed/checkpoint/utils.py +++ b/deepspeed/checkpoint/utils.py @@ -1,10 +1,10 @@ -'''Copyright The Microsoft DeepSpeed Team''' +# Copyright (c) Microsoft Corporation. 
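The universal-checkpoint loader above grows a padding-free vocabulary weight back to the target padded size with a single torch.nn.functional.pad call; the (0, 0, 0, padding_size) spec leaves the hidden dimension untouched and appends padding_size rows at the end of the vocab dimension. A small sketch of that call in isolation (the shapes are invented for illustration):

import torch
import torch.nn.functional as F

full_hp_param = torch.randn(50254, 8)        # hypothetical unpadded [vocab, hidden] slice
padding_size = 50257 - full_hp_param.shape[0]
# pad spec covers the last two dims as (left, right, top, bottom): only the end of the vocab dim grows
padded = F.pad(full_hp_param, (0, 0, 0, padding_size), "constant", 0)
print(padded.shape)  # torch.Size([50257, 8])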
+# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team import os -from .constants import (MODEL_FILE_PREFIX, - MODEL_FILE_SUFFIX, - OPTIM_FILE_SUFFIX, - ZERO_FILE_PREFIX) +from .constants import (MODEL_FILE_PREFIX, MODEL_FILE_SUFFIX, OPTIM_FILE_SUFFIX, ZERO_FILE_PREFIX) def get_model_ckpt_name_for_rank(base_folder, mp_rank_str): diff --git a/deepspeed/checkpoint/zero_checkpoint.py b/deepspeed/checkpoint/zero_checkpoint.py index cb33e8e376716860455d899c186ba10d58f4bc50..c65745d3dd0c13847a2b2f3c7666e396d8a30202 100644 --- a/deepspeed/checkpoint/zero_checkpoint.py +++ b/deepspeed/checkpoint/zero_checkpoint.py @@ -1,11 +1,11 @@ -'''Copyright The Microsoft DeepSpeed Team''' +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team import torch -from .constants import (BASE_OPTIMIZER_STATE, - GROUP_PADDINGS, - OPTIMIZER_STATE_DICT, - PARTITION_COUNT) +from .constants import (BASE_OPTIMIZER_STATE, GROUP_PADDINGS, OPTIMIZER_STATE_DICT, PARTITION_COUNT) from .reshape_utils import (basic_folder_validation, get_zero_files, merge_state) @@ -15,6 +15,7 @@ GROUP_STATE_KEY = 'state' class ZeROCheckpoint(object): + def __init__(self, dir): basic_folder_validation(dir) self.dir = dir @@ -49,12 +50,7 @@ class ZeROCheckpoint(object): file_idx_list = self.get_file_indices_for_rank(pp_index, tp_index, dp_index) return [self.file_list[idx] for idx in file_idx_list] - def get_state_for_rank(self, - pp_index, - tp_index, - dp_index, - keys_to_ignore=[], - strip_tensor_paddings=True): + def get_state_for_rank(self, pp_index, tp_index, dp_index, keys_to_ignore=[], strip_tensor_paddings=True): state_file_list = self.get_files_for_rank(pp_index, tp_index, dp_index) merged_sd = None for state_file in state_file_list: @@ -111,10 +107,7 @@ class ZeROCheckpoint(object): for state_name, state_value in group_state.items(): if torch.is_tensor(state_value): raw_length = state_value.numel() - group_paddings[key] - group_state[state_name] = torch.narrow(state_value, - 0, - 0, - raw_length).clone() + group_state[state_name] = torch.narrow(state_value, 0, 0, raw_length).clone() def _clear_group_paddings(self, sd): group_paddings = self._get_optimizer_state(sd, GROUP_PADDINGS) @@ -144,5 +137,4 @@ class ZeROCheckpoint(object): partition_counts = self._get_optimizer_state(sd, PARTITION_COUNT) if partition_counts: num_groups = len(partition_counts) - sd[OPTIMIZER_STATE_DICT][PARTITION_COUNT] = [self.target_3d.dp_degree - ] * num_groups + sd[OPTIMIZER_STATE_DICT][PARTITION_COUNT] = [self.target_3d.dp_degree] * num_groups diff --git a/deepspeed/comm/__init__.py b/deepspeed/comm/__init__.py index 84d0f54c76a11096bc362d24887ec8c7c7f5f1f9..2c26ef4913085748dc10d720bb723d784ee9335e 100644 --- a/deepspeed/comm/__init__.py +++ b/deepspeed/comm/__init__.py @@ -1,52 +1,7 @@ -'''Copyright The Microsoft DeepSpeed Team''' +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 -import torch -from .utils import * -from deepspeed import utils - -supported_torch_version = False - -# See more details at: https://github.com/pytorch/pytorch/pull/48767 -# The PG API in torch versions lesser than 1.8 are different so it is -# non-trivial to support both in the same API. We will just use the -# DS comm. backend in deepspeed/comm/comm.py if torch version if 1.8+. - -if older_torch(): - # Add custom deepspeed torch comm functions here since we can't import deepspeed.comm - # NOTE: We can't call torch.distributed directly here. Current hack is to import functions before calling them. 
- supported_torch_version = False - from torch.distributed import * - - def get_world_group(): - return group.WORLD +# DeepSpeed Team - def get_global_rank(group, group_rank): - if hasattr(torch.distributed.distributed_c10d, "get_global_rank"): - from torch.distributed.distributed_c10d import get_global_rank as _get_global_rank - else: - from torch.distributed.distributed_c10d import _get_global_rank - return _get_global_rank(group, group_rank) - - def allgather_fn(output_tensor, input_tensor, group=None, async_op=False): - from torch.distributed import all_gather, get_world_size - from torch import chunk - output_tensors = list(chunk(output_tensor, get_world_size(group))) - return all_gather(output_tensors, input_tensor, group=group, async_op=async_op) - - def reduce_scatter_fn(output_tensor, input_tensor, group=None, async_op=False): - from torch.distributed import reduce_scatter, get_world_size - from torch import chunk - input_tensor_lst = list(chunk(input_tensor, get_world_size(group))) - return reduce_scatter(output_tensor, input_tensor_lst, group=group) - - def configure(deepspeed_config=None, - enabled=None, - prof_all=None, - prof_ops=None, - verbose=None): - utils.logger.warn( - "Communication logging is not supported in torch versions older than 1.8") - -else: - supported_torch_version = True - from .comm import * +from .utils import * +from .comm import * diff --git a/deepspeed/comm/backend.py b/deepspeed/comm/backend.py index cd436865c5ad79745b4aed4002e6a5db4aa3f5b8..4e02ad89719c0daea67802d6d9aa61c07cc460d6 100644 --- a/deepspeed/comm/backend.py +++ b/deepspeed/comm/backend.py @@ -1,5 +1,9 @@ -'''Copyright The Microsoft DeepSpeed Team''' -''' DeepSpeed Communication Backend. +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team +""" +DeepSpeed Communication Backend. In the future, directly use NCCL/MPI/Gloo/etc without requiring torch.distributed. Simply wrap torch.distributed for now. @@ -8,18 +12,18 @@ In the future, directly use NCCL/MPI/Gloo/etc without requiring torch.distribute - MPI -- [EXPERIMENTAL] - RCCL -- [EXPERIMENTAL] - GLOO -- [EXPERIMENTAL] - # DS backend wrapper for torch.distributed [DEFAULT] - T-NCCL -- [DEFAULT] - T-GLOO - T-MPI -''' -''' Backend is the base class + +# Backend is the base class -- NcclBackend, MpiBackend, and TorchBackend are the main subclasses. TorchBackend is the only officially supported backend for now. -''' +""" class Backend(object): + def __init__(self, name='backend', rank=0, size=1): self.name = name # The world size and rank of the world process group diff --git a/deepspeed/comm/comm.py b/deepspeed/comm/comm.py index 302e3eaf2210a6c399d41571e9b92eb21f0b0fce..913dcf84d681e777bc99145097704269c2ac10cf 100644 --- a/deepspeed/comm/comm.py +++ b/deepspeed/comm/comm.py @@ -1,26 +1,22 @@ -""" - Copyright 2021 The Microsoft DeepSpeed Team +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 +# DeepSpeed Team +""" DeepSpeed Communication Package: deepspeed.comm - deepspeed.comm -- import and use deepspeeed.ops.comm -- use torch.distributed directly if both this package and torch.distributed use the same NCCL version -- use custom collectives -- can either use torch.dist or ds.ops.comm? - Note: the old 1-bit compressed allreduce variants that resided in deepspeed.runtime.comm will be moved here as well. - deepspeed.comm API -- must be kept fully compatible (same signatures) as torch.dist API to ensure backward/cross-framework compatibility. -- e.g. 
if a client code used from deepspeed import comm as dist - instead of import torch.distributed as dist - The code should work without breaking any of the public torch.distributed functionality - Future: -- deepspeed groups API should be brought into ds.comm """ @@ -63,10 +59,6 @@ timer_summary = {} comms_logger = CommsLogger() -# Ensure we don't warn about base collectives more than once -has_warned_all_gather = False -has_warned_reduce_scatter = False - # Maintain objects of all initialized ds backends and assign them using the API functions in this file nccl_backend = None mpi_backend = None @@ -110,12 +102,13 @@ def configure( # Logging wrapper for timing ops def timed_op(func): + def log_wrapper(*args, **kwargs): # Add enabled flag so that overhead to each comm op is two if conditions at most if comms_logger.enabled: - if ('prof' in kwargs and kwargs['prof']) or comms_logger.prof_all or ( - 'log_name' in kwargs - and kwargs['log_name'] in comms_logger.prof_ops): + if ('prof' in kwargs + and kwargs['prof']) or comms_logger.prof_all or ('log_name' in kwargs + and kwargs['log_name'] in comms_logger.prof_ops): # Need func args for their defaults func_args = get_default_args(func) func_args.update(kwargs) @@ -133,8 +126,7 @@ def timed_op(func): if cdb.using_mpi: cdb.barrier() if ('prof' in kwargs and kwargs['prof']) or comms_logger.prof_all or ( - 'log_name' in kwargs - and kwargs['log_name'] in comms_logger.prof_ops): + 'log_name' in kwargs and kwargs['log_name'] in comms_logger.prof_ops): log_name = get_debug_log_name(func_args, comms_logger.debug) raw_name = func.__name__ timers(log_name).stop() @@ -182,7 +174,8 @@ def destroy_process_group(group=None): def new_group(ranks): global cdb - assert cdb is not None and cdb.is_initialized(), 'DeepSpeed backend not set, please initialize it using init_process_group()' + assert cdb is not None and cdb.is_initialized( + ), 'DeepSpeed backend not set, please initialize it using init_process_group()' return cdb.new_group(ranks) @@ -196,14 +189,12 @@ def is_available() -> bool: return True -def set_backend(backend): +def set_backend(backend_name): if not use_ds_backend: utils.logger.error( "DeepSpeed communication backend is required. Please use deepspeed.comm.init_distributed(backend, use_deepspeed=True) to use this functionality" ) - raise RuntimeError( - 'Error: Custom DeepSpeed backend called without initializing DeepSpeed distributed.' 
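As the package docstring above stresses, deepspeed.comm keeps torch.distributed-compatible signatures so client code can swap the import. A minimal sketch of that drop-in usage, assuming the script is started by a distributed launcher with one GPU per rank and an NCCL-capable backend:

import torch
from deepspeed import comm as dist   # instead of: import torch.distributed as dist

dist.init_distributed()                        # defaults to the accelerator's backend (e.g. nccl)
device = f"cuda:{dist.get_local_rank()}"       # assumes one GPU per local rank
t = torch.ones(4, device=device)
dist.all_reduce(t)                             # same signature as torch.distributed.all_reduce
if dist.get_rank() == 0:
    print(t)                                   # each element equals dist.get_world_size()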
- ) + raise RuntimeError('Error: Custom DeepSpeed backend called without initializing DeepSpeed distributed.') global cdb global nccl_backend @@ -221,13 +212,7 @@ def set_backend(backend): @timed_op -def broadcast(tensor, - src, - group=None, - async_op=False, - prof=False, - log_name='broadcast', - debug=get_caller_func()): +def broadcast(tensor, src, group=None, async_op=False, prof=False, log_name='broadcast', debug=get_caller_func()): global cdb return cdb.broadcast(tensor=tensor, src=src, group=group, async_op=async_op) @@ -241,17 +226,14 @@ def all_gather(tensor_list, log_name='all_gather', debug=get_caller_func()): global cdb - return cdb.all_gather(tensor_list=tensor_list, - tensor=tensor, - group=group, - async_op=async_op) + return cdb.all_gather(tensor_list=tensor_list, tensor=tensor, group=group, async_op=async_op) -def has_reduce_scatter_base(): +def has_reduce_scatter_tensor(): global cdb - assert cdb is not None and cdb.is_initialized(), 'DeepSpeed backend not set, please initialize it using init_process_group()' - assert cdb.has_reduce_scatter_base is not None, 'has_reduce_scatter_base is not yet defined' - return cdb.has_reduce_scatter_base + assert cdb is not None and cdb.is_initialized( + ), 'DeepSpeed backend not set, please initialize it using init_process_group()' + return cdb.has_reduce_scatter_tensor() def reduce_scatter_fn(output_tensor, @@ -262,23 +244,21 @@ def reduce_scatter_fn(output_tensor, prof=False, debug=get_caller_func()): global cdb - global has_warned_reduce_scatter - assert cdb is not None and cdb.is_initialized(), 'DeepSpeed backend not set, please initialize it using init_process_group()' - if cdb.has_reduce_scatter_base: - return reduce_scatter_base(output_tensor, - tensor, - op=op, - group=group, - async_op=async_op, - prof=prof, - debug=debug) + assert cdb is not None and cdb.is_initialized( + ), 'DeepSpeed backend not set, please initialize it using init_process_group()' + if cdb.has_reduce_scatter_tensor(): + return reduce_scatter_tensor(output_tensor, + tensor, + op=op, + group=group, + async_op=async_op, + prof=prof, + debug=debug) else: - if not has_warned_reduce_scatter: - utils.logger.warning( - "unable to find torch.distributed._reduce_scatter_base. will fall back to " - "torch.distributed.all_gather which will result in suboptimal performance. " - "please consider upgrading your pytorch installation.") - has_warned_reduce_scatter = True + if get_rank() == 0: + utils.logger.warning_once("unable to find torch.distributed.reduce_scatter_tensor. will fall back to " + "torch.distributed.all_gather which will result in suboptimal performance. 
" + "please consider upgrading your pytorch installation.") input_tensor_lst = list(torch.chunk(tensor, cdb.get_world_size(group))) return reduce_scatter(output_tensor, input_tensor_lst, @@ -290,71 +270,54 @@ def reduce_scatter_fn(output_tensor, @timed_op -def reduce_scatter_base(output_tensor, - tensor, - op=ReduceOp.SUM, - group=None, - async_op=False, - prof=False, - log_name='reduce_scatter_base', - debug=get_caller_func()): - global cdb - return cdb.reduce_scatter_base(output_tensor=output_tensor, - input_tensor=tensor, - op=op, - group=group, - async_op=async_op) +def reduce_scatter_tensor(output_tensor, + tensor, + op=ReduceOp.SUM, + group=None, + async_op=False, + prof=False, + log_name='reduce_scatter_tensor', + debug=get_caller_func()): + global cdb + return cdb.reduce_scatter_tensor(output_tensor=output_tensor, + input_tensor=tensor, + op=op, + group=group, + async_op=async_op) @timed_op -def all_gather_base(output_tensor, - tensor, - group=None, - async_op=False, - prof=False, - log_name='all_gather_base', - debug=get_caller_func()): - global cdb - return cdb.all_gather_base(output_tensor=output_tensor, - input_tensor=tensor, - group=group, - async_op=async_op) - - -def has_allgather_base(): - global cdb - assert cdb is not None and cdb.is_initialized(), 'DeepSpeed backend not set, please initialize it using init_process_group()' - assert cdb.has_allgather_base is not None, 'has_allgather_base is not yet defined' - return cdb.has_allgather_base - - -def allgather_fn(output_tensor, - input_tensor, - group=None, - async_op=False, - debug=get_caller_func()): - global cdb - global has_warned_all_gather - assert cdb is not None and cdb.is_initialized(), 'DeepSpeed backend not set, please initialize it using init_process_group()' - if cdb.has_allgather_base: - return all_gather_base(output_tensor, - input_tensor, - group=group, - async_op=async_op, - debug=debug) +def all_gather_into_tensor(output_tensor, + tensor, + group=None, + async_op=False, + prof=False, + log_name='all_gather_into_tensor', + debug=get_caller_func()): + global cdb + return cdb.all_gather_into_tensor(output_tensor=output_tensor, input_tensor=tensor, group=group, async_op=async_op) + + +def has_all_gather_into_tensor(): + global cdb + assert cdb is not None and cdb.is_initialized( + ), 'DeepSpeed backend not set, please initialize it using init_process_group()' + return cdb.has_all_gather_into_tensor() + + +def allgather_fn(output_tensor, input_tensor, group=None, async_op=False, debug=get_caller_func()): + global cdb + assert cdb is not None and cdb.is_initialized( + ), 'DeepSpeed backend not set, please initialize it using init_process_group()' + if cdb.has_all_gather_into_tensor(): + return all_gather_into_tensor(output_tensor, input_tensor, group=group, async_op=async_op, debug=debug) else: - if not has_warned_all_gather and get_rank() == 0: - utils.logger.warning( - "unable to find torch.distributed._all_gather_base. will fall back to " - "torch.distributed.all_gather which will result in suboptimal performance. " - "please consider upgrading your pytorch installation.") - has_warned_all_gather = True + if get_rank() == 0: + utils.logger.warning_once("unable to find torch.distributed.all_gather_into_tensor. will fall back to " + "torch.distributed.all_gather which will result in suboptimal performance. 
" + "please consider upgrading your pytorch installation.") output_tensors = list(torch.chunk(output_tensor, cdb.get_world_size(group))) - return all_gather(output_tensors, - input_tensor, - group=group, - async_op=async_op, - debug=debug) + return all_gather(output_tensors, input_tensor, group=group, async_op=async_op, debug=debug) @timed_op @@ -377,49 +340,25 @@ def all_to_all_single(output, @timed_op -def send(tensor, - dst, - group=None, - tag=0, - prof=False, - log_name='send', - debug=get_caller_func()): +def send(tensor, dst, group=None, tag=0, prof=False, log_name='send', debug=get_caller_func()): global cdb return cdb.send(tensor=tensor, dst=dst, group=group, tag=tag) @timed_op -def recv(tensor, - src=None, - group=None, - tag=0, - prof=False, - log_name='recv', - debug=get_caller_func()): +def recv(tensor, src=None, group=None, tag=0, prof=False, log_name='recv', debug=get_caller_func()): global cdb return cdb.recv(tensor=tensor, src=src, group=group, tag=tag) @timed_op -def isend(tensor, - dst, - group=None, - tag=0, - prof=False, - log_name='isend', - debug=get_caller_func()): +def isend(tensor, dst, group=None, tag=0, prof=False, log_name='isend', debug=get_caller_func()): global cdb return cdb.send(tensor=tensor, dst=dst, group=group, tag=tag) @timed_op -def irecv(tensor, - src=None, - group=None, - tag=0, - prof=False, - log_name='irecv', - debug=get_caller_func()): +def irecv(tensor, src=None, group=None, tag=0, prof=False, log_name='irecv', debug=get_caller_func()): global cdb return cdb.recv(tensor=tensor, src=src, group=group, tag=tag) @@ -434,11 +373,7 @@ def gather(tensor, log_name='gather', debug=get_caller_func()): global cdb - return cdb.gather(tensor=tensor, - gather_list=gather_list, - dst=dst, - group=group, - async_op=async_op) + return cdb.gather(tensor=tensor, gather_list=gather_list, dst=dst, group=group, async_op=async_op) @timed_op @@ -451,20 +386,11 @@ def scatter(tensor, log_name='scatter', debug=get_caller_func()): global cdb - return cdb.scatter(tensor=tensor, - scatter_list=scatter_list, - src=src, - group=group, - async_op=async_op) + return cdb.scatter(tensor=tensor, scatter_list=scatter_list, src=src, group=group, async_op=async_op) @timed_op -def barrier(group=None, - async_op=False, - device_ids=None, - prof=False, - log_name='barrier', - debug=get_caller_func()): +def barrier(group=None, async_op=False, device_ids=None, prof=False, log_name='barrier', debug=get_caller_func()): global cdb return cdb.barrier(group=group, async_op=async_op, device_ids=device_ids) @@ -511,11 +437,31 @@ def reduce_scatter(output, log_name='reduce_scatter', debug=get_caller_func()): global cdb - return cdb.reduce_scatter(output=output, - input_list=input_list, - op=op, - group=group, - async_op=async_op) + return cdb.reduce_scatter(output=output, input_list=input_list, op=op, group=group, async_op=async_op) + + +def has_all_reduce_coalesced(): + """""" + global cdb + assert cdb is not None and cdb.is_initialized( + ), 'DeepSpeed backend not set, please initialize it using init_process_group()' + assert cdb.has_all_reduce_coalesced is not None, 'has_all_reduce_coalesced is not yet defined' + return cdb.has_all_reduce_coalesced + + +def has_coalescing_manager(): + global cdb + assert cdb is not None and cdb.is_initialized( + ), 'DeepSpeed backend not set, please initialize it using init_process_group()' + assert cdb.has_coalescing_manager is not None, 'has_coalescing_manager is not yet defined' + return cdb.has_coalescing_manager + + +def 
all_gather_coalesced(output_tensors, input_tensors, group=None, async_op=False): + global cdb + assert cdb is not None and cdb.is_initialized( + ), 'DeepSpeed backend not set, please initialize it using init_process_group()' + return cdb.all_gather_coalesced(output_tensors, input_tensors, group=group, async_op=async_op) @timed_op @@ -535,9 +481,22 @@ def all_reduce(tensor, return cdb.all_reduce(tensor, op, group, async_op) +@timed_op +def all_reduce_coalesced(tensors, + op=ReduceOp.SUM, + group=None, + async_op=False, + prof=False, + log_name='all_reduce', + debug=get_caller_func()): + global cbd + return cdb.all_reduce_coalesced(tensors, op, group, async_op) + + def get_world_group(): global cdb - assert cdb is not None and cdb.is_initialized(), 'DeepSpeed backend not set, please initialize it using init_process_group()' + assert cdb is not None and cdb.is_initialized( + ), 'DeepSpeed backend not set, please initialize it using init_process_group()' return cdb.get_world_group() @@ -553,7 +512,8 @@ def get_world_size(group=None) -> int: """ global cdb - assert cdb is not None and cdb.is_initialized(), 'DeepSpeed backend not set, please initialize it using init_process_group()' + assert cdb is not None and cdb.is_initialized( + ), 'DeepSpeed backend not set, please initialize it using init_process_group()' return cdb.get_world_size(group) @@ -572,7 +532,8 @@ def get_rank(group=None): -1, if not part of the group """ global cdb - assert cdb is not None and cdb.is_initialized(), 'DeepSpeed backend not set, please initialize it using init_process_group()' + assert cdb is not None and cdb.is_initialized( + ), 'DeepSpeed backend not set, please initialize it using init_process_group()' return cdb.get_rank(group) @@ -585,13 +546,15 @@ def get_local_rank(): local rank (= GPU device ID) """ global cdb - assert cdb is not None and cdb.is_initialized(), 'DeepSpeed backend not set, please initialize it using init_process_group()' + assert cdb is not None and cdb.is_initialized( + ), 'DeepSpeed backend not set, please initialize it using init_process_group()' return get_local_rank_from_launcher() def get_global_rank(group=None, group_rank=0): global cdb - assert cdb is not None and cdb.is_initialized(), 'DeepSpeed backend not set, please initialize it using init_process_group()' + assert cdb is not None and cdb.is_initialized( + ), 'DeepSpeed backend not set, please initialize it using init_process_group()' return cdb.get_global_rank(group, group_rank) @@ -640,9 +603,7 @@ def init_distributed(dist_backend=None, required_env = ["RANK", "WORLD_SIZE", "MASTER_ADDR", "MASTER_PORT", "LOCAL_RANK"] if auto_mpi_discovery and not all(map(lambda v: v in os.environ, required_env)): if verbose: - utils.logger.info( - "Not using the DeepSpeed or dist launchers, attempting to detect MPI environment..." 
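The allgather_fn fallback above works because torch.chunk returns views into the flat output buffer, so the plain all_gather writes each rank's shard directly into place. The view behavior can be checked without any process group (purely illustrative):

import torch

flat = torch.zeros(8)
shards = list(torch.chunk(flat, 4))         # four views of length 2, as allgather_fn builds per rank
shards[2].copy_(torch.tensor([7.0, 7.0]))   # simulate rank 2's gathered contribution
print(flat)  # tensor([0., 0., 0., 0., 7., 7., 0., 0.]) -- the write lands in the flat buffer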
- ) + utils.logger.info("Not using the DeepSpeed or dist launchers, attempting to detect MPI environment...") if in_aml() and not in_dlts(): patch_aml_env_for_torch_nccl_backend(verbose=verbose) elif in_aws_sm(): @@ -658,9 +619,7 @@ def init_distributed(dist_backend=None, if dist_backend == None: dist_backend = get_accelerator().communication_backend_name() if int(os.getenv('RANK', '0')) == 0: - utils.logger.info( - 'Initializing TorchBackend in DeepSpeed with backend {}'.format( - dist_backend)) + utils.logger.info('Initializing TorchBackend in DeepSpeed with backend {}'.format(dist_backend)) # Create a torch backend object, initialize torch distributed, and assign to cdb cdb = TorchBackend(dist_backend, timeout, init_method, rank, world_size) @@ -695,16 +654,12 @@ def mpi_discovery(distributed_port=TORCH_DISTRIBUTED_DEFAULT_PORT, verbose=True) if verbose: utils.logger.info( - "Discovered MPI settings of world_rank={}, local_rank={}, world_size={}, master_addr={}, master_port={}" - .format(os.environ['RANK'], - os.environ['LOCAL_RANK'], - os.environ['WORLD_SIZE'], - os.environ['MASTER_ADDR'], - os.environ['MASTER_PORT'])) + "Discovered MPI settings of world_rank={}, local_rank={}, world_size={}, master_addr={}, master_port={}". + format(os.environ['RANK'], os.environ['LOCAL_RANK'], os.environ['WORLD_SIZE'], os.environ['MASTER_ADDR'], + os.environ['MASTER_PORT'])) if cdb is not None and cdb.is_initialized(): - assert cdb.get_rank() == rank, "MPI rank {} does not match torch rank {}".format( - rank, cdb.get_rank()) + assert cdb.get_rank() == rank, "MPI rank {} does not match torch rank {}".format(rank, cdb.get_rank()) assert cdb.get_world_size() == world_size, "MPI world size {} does not match torch world size {}".format( world_size, cdb.get_world_size()) @@ -731,8 +686,7 @@ def patch_aml_env_for_torch_nccl_backend(master_port=6105, verbose=True): """ os.environ["RANK"] = os.environ["OMPI_COMM_WORLD_RANK"] os.environ["WORLD_SIZE"] = os.environ["OMPI_COMM_WORLD_SIZE"] - single_node = int(os.environ["OMPI_COMM_WORLD_LOCAL_SIZE"]) == int( - os.environ["WORLD_SIZE"]) + single_node = int(os.environ["OMPI_COMM_WORLD_LOCAL_SIZE"]) == int(os.environ["WORLD_SIZE"]) if not single_node: master_node_params = os.environ["AZ_BATCH_MASTER_NODE"].split(":") @@ -745,8 +699,7 @@ def patch_aml_env_for_torch_nccl_backend(master_port=6105, verbose=True): os.environ["MASTER_PORT"] = DEFAULT_AML_MASTER_PORT if verbose: - utils.logger.info("NCCL_SOCKET_IFNAME original value = {}".format( - os.environ["NCCL_SOCKET_IFNAME"])) + utils.logger.info("NCCL_SOCKET_IFNAME original value = {}".format(os.environ["NCCL_SOCKET_IFNAME"])) os.environ["NCCL_SOCKET_IFNAME"] = DEFAULT_AML_NCCL_SOCKET_IFNAME os.environ['LOCAL_RANK'] = os.environ["OMPI_COMM_WORLD_LOCAL_RANK"] @@ -754,10 +707,7 @@ def patch_aml_env_for_torch_nccl_backend(master_port=6105, verbose=True): if verbose: utils.logger.info( "Discovered AzureML settings of world_rank={}, local_rank={}, world_size={}, master_addr={}, master_port={}" - .format(os.environ['RANK'], - os.environ['LOCAL_RANK'], - os.environ['WORLD_SIZE'], - os.environ['MASTER_ADDR'], + .format(os.environ['RANK'], os.environ['LOCAL_RANK'], os.environ['WORLD_SIZE'], os.environ['MASTER_ADDR'], os.environ['MASTER_PORT'])) @@ -771,8 +721,5 @@ def patch_aws_sm_env_for_torch_nccl_backend(verbose=True): if verbose: utils.logger.info( "Discovered AWS SageMaker settings of world_rank={}, local_rank={}, world_size={}, master_addr={}, master_port={}" - .format(os.environ['RANK'], - os.environ['LOCAL_RANK'], - 
os.environ['WORLD_SIZE'], - os.environ['MASTER_ADDR'], + .format(os.environ['RANK'], os.environ['LOCAL_RANK'], os.environ['WORLD_SIZE'], os.environ['MASTER_ADDR'], os.environ['MASTER_PORT'])) diff --git a/deepspeed/comm/config.py b/deepspeed/comm/config.py index 25839514fcfca9cd86c78600a0b1329edbf886d7..138badebe5a91cf180bc451a4a555756791b2f5a 100644 --- a/deepspeed/comm/config.py +++ b/deepspeed/comm/config.py @@ -1,14 +1,14 @@ -'''Copyright The Microsoft DeepSpeed Team''' -""" -Copyright (c) Microsoft Corporation -Licensed under the MIT license. -""" +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team from pydantic import BaseModel from .constants import * class CommsConfig(BaseModel): + class Config: validate_all = True validate_assignment = True @@ -25,6 +25,7 @@ class CommsLoggerConfig(CommsConfig): class DeepSpeedCommsConfig: + def __init__(self, ds_config): self.comms_logger_enabled = 'comms_logger' in ds_config diff --git a/deepspeed/comm/constants.py b/deepspeed/comm/constants.py index 45fd5ac7e3ed867a4504e425bb1322b1aaebd8c6..ab309247befe197f5521f6078a5968c7067ffae3 100644 --- a/deepspeed/comm/constants.py +++ b/deepspeed/comm/constants.py @@ -1,4 +1,7 @@ -'''Copyright The Microsoft DeepSpeed Team''' +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team NCCL_BACKEND = 'nccl' MPI_BACKEND = 'mpi' diff --git a/deepspeed/comm/torch.py b/deepspeed/comm/torch.py index 0753e7ce4bb6b63192f4adc9c89c4beb0e71e15e..2b8c8e4315eb121c8ba15e9f7bbd9bb8cd97534a 100644 --- a/deepspeed/comm/torch.py +++ b/deepspeed/comm/torch.py @@ -1,12 +1,40 @@ -''' -Copyright 2021 The Microsoft DeepSpeed Team -''' +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + from deepspeed import utils from .utils import * from .backend import * from .comm import * +import torch + + +def is_torch_two(): + TORCH_MAJOR = int(torch.__version__.split('.')[0]) + if TORCH_MAJOR >= 2: + return True + else: + return False + + +def has_coalescing_manager(): + has_c10d = hasattr(torch.distributed, 'distributed_c10d') + return has_c10d and hasattr(torch.distributed.distributed_c10d, '_coalescing_manager') + + +def has_all_reduce_coalesced(): + return hasattr(torch.distributed, "all_reduce_coalesced") + + +def get_coalescing_manager(group, device, reqs): + if is_torch_two(): + return torch.distributed.distributed_c10d._coalescing_manager(group, device=device, reqs=reqs) + else: + return torch.distributed.distributed_c10d._coalescing_manager(group, reqs) + class TorchBackend(Backend): """ @@ -16,17 +44,13 @@ class TorchBackend(Backend): so no need to wrap all the functions. We can keep adding wrappers as needed. 
""" - def __init__(self, - backend, - timeout, - init_method, - rank=-1, - world_size=-1, - name='torch'): + + def __init__(self, backend, timeout, init_method, rank=-1, world_size=-1, name='torch'): super(TorchBackend, self).__init__() - self.torch_version_before_18 = older_torch() - self.has_allgather_base = has_allgather_base() - self.has_reduce_scatter_base = has_reduce_scatter_base() + self.has_all_reduce_coalesced = has_all_reduce_coalesced() + self.has_coalescing_manager = has_coalescing_manager() + self.all_gather_function = self.get_all_gather_function() + self.reduce_scatter_function = self.get_reduce_scatter_function() self.initialized = True self.name = name # Future functionality to support ds.initialize() on a single GPU @@ -35,6 +59,28 @@ class TorchBackend(Backend): self.single_gpu_mode = True self.init_process_group(backend, timeout, init_method, rank, world_size) + @classmethod + def get_all_gather_function(self): + if hasattr(torch.distributed, "all_gather_into_tensor"): + return torch.distributed.all_gather_into_tensor + elif hasattr(torch.distributed, "_all_gather_base"): + return torch.distributed._all_gather_base + return None + + @classmethod + def get_reduce_scatter_function(self): + if hasattr(torch.distributed, "reduce_scatter_tensor"): + return torch.distributed.reduce_scatter_tensor + elif hasattr(torch.distributed, "_reduce_scatter_base"): + return torch.distributed._reduce_scatter_base + return None + + def has_all_gather_into_tensor(self): + return self.all_gather_function is not None + + def has_reduce_scatter_tensor(self): + return self.reduce_scatter_function is not None + def init_process_group(self, backend, timeout, init_method, rank, world_size): if not torch.distributed.is_initialized(): torch.distributed.init_process_group(backend, @@ -44,30 +90,24 @@ class TorchBackend(Backend): world_size=world_size) self.using_mpi = torch.distributed.get_backend() == 'mpi' - def all_reduce(self, - tensor, - op=torch.distributed.ReduceOp.SUM, - group=None, - async_op=False): + def all_reduce(self, tensor, op=torch.distributed.ReduceOp.SUM, group=None, async_op=False): op = self._reduce_op(op) - return torch.distributed.all_reduce(tensor=tensor, - op=op, - group=group, - async_op=async_op) + return torch.distributed.all_reduce(tensor=tensor, op=op, group=group, async_op=async_op) + + def all_reduce_coalesced(self, tensors, op=torch.distributed.ReduceOp.SUM, group=None, async_op=False): + """ proxy func to torch.distributed.all_reduce_coalesced, + which is included in PyTorch 1.13 and above + """ + if not self.has_all_reduce_coalesced: + raise RuntimeError(f"Current torch version does not have all_reduce_coalesced " + f"api (torch.__version__: {torch.__version__})") + op = self._reduce_op(op) + return torch.distributed.all_reduce_coalesced(tensors=tensors, op=op, group=group, async_op=async_op) def reduce(self, tensor, dst, op=ReduceOp.SUM, group=None, async_op=False): - return torch.distributed.reduce(tensor=tensor, - dst=dst, - op=self._reduce_op(op), - group=group, - async_op=async_op) + return torch.distributed.reduce(tensor=tensor, dst=dst, op=self._reduce_op(op), group=group, async_op=async_op) - def reduce_scatter(self, - output, - input_list, - op=ReduceOp.SUM, - group=None, - async_op=False): + def reduce_scatter(self, output, input_list, op=ReduceOp.SUM, group=None, async_op=False): return torch.distributed.reduce_scatter(output=output, input_list=input_list, op=self._reduce_op(op), @@ -75,48 +115,57 @@ class TorchBackend(Backend): async_op=async_op) def 
broadcast(self, tensor, src, group=None, async_op=False): - return torch.distributed.broadcast(tensor=tensor, - src=src, - group=group, - async_op=async_op) + return torch.distributed.broadcast(tensor=tensor, src=src, group=group, async_op=async_op) def all_gather(self, tensor_list, tensor, group=None, async_op=False): - return torch.distributed.all_gather(tensor_list=tensor_list, - tensor=tensor, + return torch.distributed.all_gather(tensor_list=tensor_list, tensor=tensor, group=group, async_op=async_op) + + def all_gather_into_tensor(self, output_tensor, input_tensor, group=None, async_op=False): + if self.has_all_gather_into_tensor(): + return self.all_gather_function(output_tensor=output_tensor, + input_tensor=input_tensor, group=group, async_op=async_op) - - def all_gather_base(self, output_tensor, input_tensor, group=None, async_op=False): - if self.has_allgather_base: - return torch.distributed.distributed_c10d._all_gather_base( - output_tensor=output_tensor, - input_tensor=input_tensor, - group=group, - async_op=async_op) else: - utils.logger.warning( - "unable to find torch.distributed._all_gather_base. will fall back to " - "torch.distributed.reduce_scatter which will result in suboptimal performance. " - "please consider upgrading your pytorch installation.") + utils.logger.warning("unable to find torch.distributed._all_gather_base. will fall back to " + "torch.distributed.all_gather which will result in suboptimal performance. " + "please consider upgrading your pytorch installation.") pass - def reduce_scatter_base(self, - output_tensor, - input_tensor, - op=ReduceOp.SUM, - group=None, - async_op=False): - if self.has_reduce_scatter_base: - return torch.distributed._reduce_scatter_base(output_tensor, - input_tensor, - op=self._reduce_op(op), - group=group, - async_op=async_op) + def all_gather_coalesced(self, output_tensors, input_tensors, group=None, async_op=False): + """""" + assert len(output_tensors) == len(input_tensors), "" + if hasattr(torch.distributed.distributed_c10d, '_all_gather_base_coalesced'): + # customized PyTorch + return torch.distributed.distributed_c10d._all_gather_base_coalesced(output_tensors, + input_tensors, + group=group, + async_op=async_op) + elif has_coalescing_manager(): + reqs = [] + with get_coalescing_manager(group, input_tensors[0].device, reqs): + for output, input in zip(output_tensors, input_tensors): + handle = torch.distributed.distributed_c10d.all_gather_into_tensor(output, + input, + group=group, + async_op=True) + reqs.append(handle) + if async_op: + return reqs[-1] + else: + reqs[-1].wait() + + def reduce_scatter_tensor(self, output_tensor, input_tensor, op=ReduceOp.SUM, group=None, async_op=False): + if self.has_reduce_scatter_tensor(): + return self.reduce_scatter_function(output_tensor, + input_tensor, + op=self._reduce_op(op), + group=group, + async_op=async_op) else: - utils.logger.warning( - "unable to find torch.distributed._reduce_scatter_base. will fall back to " - "torch.distributed.reduce_scatter which will result in suboptimal performance. " - "please consider upgrading your pytorch installation.") + utils.logger.warning("unable to find torch.distributed.reduce_scatter_tensor. will fall back to " + "torch.distributed.reduce_scatter which will result in suboptimal performance. 
" + "please consider upgrading your pytorch installation.") pass def all_to_all_single(self, @@ -159,25 +208,15 @@ class TorchBackend(Backend): group=group, async_op=async_op) - def barrier(self, - group=torch.distributed.GroupMember.WORLD, - async_op=False, - device_ids=None): + def barrier(self, group=torch.distributed.GroupMember.WORLD, async_op=False, device_ids=None): if group is None: group = torch.distributed.GroupMember.WORLD - return torch.distributed.barrier(group=group, - async_op=async_op, - device_ids=device_ids) - - def monitored_barrier(self, - group=torch.distributed.GroupMember.WORLD, - timeout=None, - wait_all_ranks=False): + return torch.distributed.barrier(group=group, async_op=async_op, device_ids=device_ids) + + def monitored_barrier(self, group=torch.distributed.GroupMember.WORLD, timeout=None, wait_all_ranks=False): if group is None: group = torch.distributed.GroupMember.WORLD - return torch.distributed.monitored_barrier(group=group, - timeout=timeout, - wait_all_ranks=wait_all_ranks) + return torch.distributed.monitored_barrier(group=group, timeout=timeout, wait_all_ranks=wait_all_ranks) def get_rank(self, group=None): return torch.distributed.get_rank(group=group) diff --git a/deepspeed/comm/utils.py b/deepspeed/comm/utils.py index 120838e72eedbe4857e648d8ed7ae1e96a970e52..27a4d2c4a588fda1e88f9e82b16ae5315ec323b4 100644 --- a/deepspeed/comm/utils.py +++ b/deepspeed/comm/utils.py @@ -1,39 +1,13 @@ -'''Copyright The Microsoft DeepSpeed Team''' +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team import os -import torch import inspect from deepspeed.utils import get_caller_func -def older_torch(): - ''' - Helper to lookup torch version. For versions less than 1.8, torch.dist - used torch.distributed.group.WORLD as the default group argument instead of None. - See more details at: https://github.com/pytorch/pytorch/pull/48767 - ''' - TORCH_MAJOR = int(torch.__version__.split('.')[0]) - TORCH_MINOR = int(torch.__version__.split('.')[1]) - if TORCH_MAJOR == 1 and TORCH_MINOR < 8: - return True - else: - return False - - -def has_allgather_base(): - ''' - Helper to check if torch.distributed has _all_gather_base - ''' - return hasattr(torch.distributed, "_all_gather_base") - - -def has_reduce_scatter_base(): - ''' - Helper to check if torch.distributed has _reduce_scatter_base - ''' - return hasattr(torch.distributed, "_reduce_scatter_base") - - def get_local_rank_from_launcher(): # DeepSpeed launcher will set it so get from there @@ -84,11 +58,7 @@ def get_world_size_from_launcher(): def get_default_args(func): signature = inspect.signature(func) - return { - k: v.default - for k, - v in signature.parameters.items() if v.default is not inspect.Parameter.empty - } + return {k: v.default for k, v in signature.parameters.items() if v.default is not inspect.Parameter.empty} # We need this hacky function since torch doesn't consistently name or place the input tensor args diff --git a/deepspeed/compression/__init__.py b/deepspeed/compression/__init__.py index 8e4974e69819c53f443a2bf348ccfa6619ddb5ad..8c7e1c89387326f7c043f95ff0b2153c4b9f21fe 100644 --- a/deepspeed/compression/__init__.py +++ b/deepspeed/compression/__init__.py @@ -1,4 +1,7 @@ -'''Copyright The Microsoft DeepSpeed Team''' +# Copyright (c) Microsoft Corporation. 
+# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team from .compress import init_compression, redundancy_clean from .scheduler import compression_scheduler diff --git a/deepspeed/compression/basic_layer.py b/deepspeed/compression/basic_layer.py index a15f80da188e18ad98a6e9416f99a348be1ee396..a5b872fa3a65f244900c5c9f5daca0efd194ba3f 100644 --- a/deepspeed/compression/basic_layer.py +++ b/deepspeed/compression/basic_layer.py @@ -1,4 +1,7 @@ -'''Copyright The Microsoft DeepSpeed Team''' +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team import torch import math @@ -21,6 +24,7 @@ class QuantAct(nn.Module): Momentum for updating the activation quantization range. quant_mode : str, default 'symmetric' """ + def __init__(self, act_range_momentum=0.95, quant_mode='symmetric'): super(QuantAct, self).__init__() @@ -50,10 +54,8 @@ class QuantAct(nn.Module): self.x_min_max[1] = x_max # if do not need momentum, please set self.act_range_momentum = 0 - self.x_min_max[0] = self.x_min_max[0] * self.act_range_momentum + x_min * ( - 1 - self.act_range_momentum) - self.x_min_max[1] = self.x_min_max[1] * self.act_range_momentum + x_max * ( - 1 - self.act_range_momentum) + self.x_min_max[0] = self.x_min_max[0] * self.act_range_momentum + x_min * (1 - self.act_range_momentum) + self.x_min_max[1] = self.x_min_max[1] * self.act_range_momentum + x_max * (1 - self.act_range_momentum) x_q = self.act_function(x, num_bits, self.x_min_max[0], self.x_min_max[1]) @@ -61,6 +63,7 @@ class QuantAct(nn.Module): class Embedding_Compress(nn.Embedding): + def __init__(self, *kargs): super(Embedding_Compress, self).__init__(*kargs) self.weight.start_bits = None @@ -71,17 +74,10 @@ class Embedding_Compress(nn.Embedding): def extra_repr(self): return 'num_embeddings={}, embedding_dim={}, weight_quantization={}'.format( - self.num_embeddings, - self.embedding_dim, - self.weight.target_bits) - - def enable_weight_quantization(self, - start_bits, - target_bits, - quantization_period, - weight_quantization_enabled_in_forward, - quantization_type, - num_groups): + self.num_embeddings, self.embedding_dim, self.weight.target_bits) + + def enable_weight_quantization(self, start_bits, target_bits, quantization_period, + weight_quantization_enabled_in_forward, quantization_type, num_groups): self.weight.start_bits = start_bits self.weight.target_bits = target_bits self.weight.q_period = quantization_period @@ -105,31 +101,20 @@ class Embedding_Compress(nn.Embedding): self.weight_quantize_num_groups = self.weight.size(0) def fix_weight_quantization(self): - self.weight.data = self.weight_quantizer(self.weight, - self.weight.target_bits, - None, - None, + self.weight.data = self.weight_quantizer(self.weight, self.weight.target_bits, None, None, self.weight_quantize_num_groups).data self.weight_quantization_enabled_in_forward = False return None def forward(self, input): if self.weight_quantization_enabled_in_forward and self.weight_quantization_enabled: - weight = self.weight_quantizer(self.weight, - self.weight.target_bits, - None, - None, + weight = self.weight_quantizer(self.weight, self.weight.target_bits, None, None, self.weight_quantize_num_groups) else: weight = self.weight - out = nn.functional.embedding(input, - weight, - self.padding_idx, - self.max_norm, - self.norm_type, - self.scale_grad_by_freq, - self.sparse) + out = nn.functional.embedding(input, weight, self.padding_idx, self.max_norm, self.norm_type, + self.scale_grad_by_freq, self.sparse) return out @@ -137,6 +122,7 @@ 
class LinearLayer_Compress(nn.Linear): """ Linear layer with compression. """ + def __init__(self, *kargs, bias=True): super(LinearLayer_Compress, self).__init__(*kargs, bias=bias) self.sparse_pruning_method = None @@ -169,8 +155,7 @@ class LinearLayer_Compress(nn.Linear): mask = mask.to(self.weight.device) elif method == 'topk': self.sparse_mask_scores = nn.Parameter(torch.Tensor(self.weight.size())) - self.sparse_mask_scores.data = self.sparse_mask_scores.data.to( - self.weight.device) + self.sparse_mask_scores.data = self.sparse_mask_scores.data.to(self.weight.device) init.kaiming_uniform_(self.sparse_mask_scores, a=math.sqrt(5)) mask = None else: @@ -209,11 +194,9 @@ class LinearLayer_Compress(nn.Linear): raise NotImplementedError else: self.head_pruning_ratio = ratio - self.head_pruning_scores = nn.Parameter(torch.Tensor( - 1, - self.num_heads)) # we apply the pruning to O matrix - self.head_pruning_scores.data = self.head_pruning_scores.data.to( - self.weight.device) + self.head_pruning_scores = nn.Parameter(torch.Tensor(1, + self.num_heads)) # we apply the pruning to O matrix + self.head_pruning_scores.data = self.head_pruning_scores.data.to(self.weight.device) init.kaiming_uniform_(self.head_pruning_scores, a=math.sqrt(5)) def fix_sparse_pruning_helper(self): @@ -279,18 +262,17 @@ class LinearLayer_Compress(nn.Linear): start_bits = self.weight.start_bits target_bits = self.weight.target_bits q_period = self.weight.q_period - self.weight = nn.Parameter(self.weight.data.t().reshape(num_heads, -1)[mask.view(-1), :].reshape(-1, shape).t()) + self.weight = nn.Parameter(self.weight.data.t().reshape(num_heads, + -1)[mask.view(-1), :].reshape(-1, + shape).t()) self.weight.start_bits = start_bits self.weight.target_bits = target_bits self.weight.q_period = q_period else: shape = self.weight.size() - self.weight.data = (self.weight.data.t().reshape(self.num_heads, - -1) * - mask.view(-1, - 1)).reshape(shape[1], - shape[0]).t() + self.weight.data = (self.weight.data.t().reshape(self.num_heads, -1) * mask.view(-1, 1)).reshape( + shape[1], shape[0]).t() if self.head_pruning_method == 'topk': del self.head_pruning_scores @@ -316,37 +298,26 @@ class LinearLayer_Compress(nn.Linear): if self.sparse_pruning_method == 'l1': return self.sparse_pruning_mask.to(self.weight.device) elif self.sparse_pruning_method == 'topk': - return TopKBinarizer.apply(self.sparse_mask_scores, - self.sparse_pruning_ratio, - False) + return TopKBinarizer.apply(self.sparse_mask_scores, self.sparse_pruning_ratio, False) else: raise NotImplementedError if pruning_type == 'row': if self.row_pruning_method == 'l1': return self.row_pruning_mask.to(self.weight.device) elif self.row_pruning_method == 'topk': - return TopKBinarizer.apply(self.row_mask_scores, - self.row_pruning_ratio, - False) + return TopKBinarizer.apply(self.row_mask_scores, self.row_pruning_ratio, False) else: raise NotImplementedError elif pruning_type == 'head': if self.head_pruning_method == 'topk': - return TopKBinarizer.apply(self.head_pruning_scores, - self.head_pruning_ratio, - False) + return TopKBinarizer.apply(self.head_pruning_scores, self.head_pruning_ratio, False) else: raise NotImplementedError else: raise NotImplementedError - def enable_weight_quantization(self, - start_bits, - target_bits, - quantization_period, - weight_quantization_enabled_in_forward, - quantization_type, - num_groups): + def enable_weight_quantization(self, start_bits, target_bits, quantization_period, + weight_quantization_enabled_in_forward, quantization_type, num_groups): 
self.weight.start_bits = start_bits self.weight.target_bits = target_bits self.weight.q_period = quantization_period @@ -369,10 +340,7 @@ class LinearLayer_Compress(nn.Linear): self.weight_quantize_num_groups = num_groups def fix_weight_quantization(self): - self.weight.data = self.weight_quantizer(self.weight, - self.weight.target_bits, - None, - None, + self.weight.data = self.weight_quantizer(self.weight, self.weight.target_bits, None, None, self.weight_quantize_num_groups).data self.weight_quantization_enabled_in_forward = False return None @@ -391,18 +359,12 @@ class LinearLayer_Compress(nn.Linear): def head_pruning_reshape(self, w, mask): shape = w.shape - return (w.t().reshape(self.num_heads, - -1) * mask.view(-1, - 1)).reshape(shape[1], - shape[0]).t() + return (w.t().reshape(self.num_heads, -1) * mask.view(-1, 1)).reshape(shape[1], shape[0]).t() def forward(self, input, skip_bias_add=False): if self.weight_quantization_enabled_in_forward and self.weight_quantization_enabled: - weight = self.weight_quantizer(self.weight, - self.weight.target_bits, - None, - None, + weight = self.weight_quantizer(self.weight, self.weight.target_bits, None, None, self.weight_quantize_num_groups) bias = self.bias else: @@ -428,11 +390,7 @@ class LinearLayer_Compress(nn.Linear): num_groups = input.numel() // input.size(-1) else: num_groups = 1 - input = self.activation_quantizer(input, - self.activation_quantization_bits, - None, - None, - num_groups) + input = self.activation_quantizer(input, self.activation_quantization_bits, None, None, num_groups) if skip_bias_add: # used for mpu linear layers @@ -447,6 +405,7 @@ class Conv2dLayer_Compress(nn.Conv2d): """ Conv2D layer with compression. """ + def __init__(self, *kargs): super(Conv2dLayer_Compress, self).__init__(*kargs) self.sparse_pruning_method = None @@ -478,10 +437,8 @@ class Conv2dLayer_Compress(nn.Conv2d): output = s.format(**self.__dict__) return output + ' sparse pruning={}, channel pruning={}, activation quantization={}, weight_quantization={}'.format( - self.sparse_pruning_method is not None, - self.channel_pruning_method is not None, - self.activation_quantization_method is not None, - self.weight.target_bits) + self.sparse_pruning_method is not None, self.channel_pruning_method is not None, + self.activation_quantization_method is not None, self.weight.target_bits) def enable_sparse_pruning(self, ratio, method): self.sparse_pruning_ratio = ratio @@ -493,8 +450,7 @@ class Conv2dLayer_Compress(nn.Conv2d): mask = mask.to(self.weight.device) elif method == 'topk': self.sparse_mask_scores = nn.Parameter(torch.Tensor(self.weight.size())) - self.sparse_mask_scores.data = self.sparse_mask_scores.data.to( - self.weight.device) + self.sparse_mask_scores.data = self.sparse_mask_scores.data.to(self.weight.device) init.kaiming_uniform_(self.sparse_mask_scores, a=math.sqrt(5)) mask = None else: @@ -514,13 +470,8 @@ class Conv2dLayer_Compress(nn.Conv2d): mask = mask.view(-1, 1, 1, 1) mask = mask.to(self.weight.device) elif method == 'topk': - self.channel_mask_scores = nn.Parameter( - torch.Tensor(self.weight.size(0), - 1, - 1, - 1)) - self.channel_mask_scores.data = self.channel_mask_scores.data.to( - self.weight.device) + self.channel_mask_scores = nn.Parameter(torch.Tensor(self.weight.size(0), 1, 1, 1)) + self.channel_mask_scores.data = self.channel_mask_scores.data.to(self.weight.device) init.kaiming_uniform_(self.channel_mask_scores, a=math.sqrt(5)) mask = None else: @@ -579,39 +530,27 @@ class Conv2dLayer_Compress(nn.Conv2d): if 
self.sparse_pruning_method == 'l1': return self.sparse_pruning_mask.to(self.weight.device) elif self.sparse_pruning_method == 'topk': - return TopKBinarizer.apply(self.sparse_mask_scores, - self.sparse_pruning_ratio, - False) + return TopKBinarizer.apply(self.sparse_mask_scores, self.sparse_pruning_ratio, False) else: raise NotImplementedError elif pruning_type == 'channel': if self.channel_pruning_method == 'l1': return self.channel_pruning_mask.to(self.weight.device) elif self.channel_pruning_method == 'topk': - return TopKBinarizer.apply(self.channel_mask_scores, - self.channel_pruning_ratio, - False) + return TopKBinarizer.apply(self.channel_mask_scores, self.channel_pruning_ratio, False) else: raise NotImplementedError else: raise NotImplementedError def fix_weight_quantization(self): - self.weight.data = self.weight_quantizer(self.weight, - self.weight.target_bits, - None, - None, + self.weight.data = self.weight_quantizer(self.weight, self.weight.target_bits, None, None, self.weight_quantize_num_groups).data self.weight_quantization_enabled_in_forward = False return None - def enable_weight_quantization(self, - start_bits, - target_bits, - quantization_period, - weight_quantization_enabled_in_forward, - quantization_type, - num_groups): + def enable_weight_quantization(self, start_bits, target_bits, quantization_period, + weight_quantization_enabled_in_forward, quantization_type, num_groups): self.weight.start_bits = start_bits self.weight.target_bits = target_bits self.weight.q_period = quantization_period @@ -642,10 +581,7 @@ class Conv2dLayer_Compress(nn.Conv2d): def forward(self, input): if self.weight_quantization_enabled_in_forward and self.weight_quantization_enabled: - weight = self.weight_quantizer(self.weight, - self.weight.target_bits, - None, - None, + weight = self.weight_quantizer(self.weight, self.weight.target_bits, None, None, self.weight_quantize_num_groups) bias = self.bias else: @@ -667,22 +603,13 @@ class Conv2dLayer_Compress(nn.Conv2d): num_groups = input.numel() // input[0].numel() else: num_groups = 1 - input = self.activation_quantizer(input, - self.activation_quantization_bits, - None, - None, - num_groups) + input = self.activation_quantizer(input, self.activation_quantization_bits, None, None, num_groups) - return nn.functional.conv2d(input, - weight, - bias, - self.stride, - self.padding, - self.dilation, - self.groups) + return nn.functional.conv2d(input, weight, bias, self.stride, self.padding, self.dilation, self.groups) class BNLayer_Compress(nn.BatchNorm2d): + def fix_channel_pruning_helper(self, mask, dim_reduction=True): self.weight = nn.Parameter(self.weight.data[mask.view(-1)]) self.bias = nn.Parameter(self.bias.data[mask.view(-1)]) @@ -770,6 +697,7 @@ def _gather(input_): class _CopyToModelParallelRegion(torch.autograd.Function): """Pass the input to the model parallel region.""" + @staticmethod def forward(ctx, input_): return input_ @@ -781,6 +709,7 @@ class _CopyToModelParallelRegion(torch.autograd.Function): class _ReduceFromModelParallelRegion(torch.autograd.Function): """All-redcue the input from the model parallel region.""" + @staticmethod def forward(ctx, input_): return _reduce(input_) @@ -792,6 +721,7 @@ class _ReduceFromModelParallelRegion(torch.autograd.Function): class _ScatterToModelParallelRegion(torch.autograd.Function): """Split the input and keep only the corresponding chuck to the rank.""" + @staticmethod def forward(ctx, input_): return _split(input_) @@ -803,6 +733,7 @@ class 
_ScatterToModelParallelRegion(torch.autograd.Function): class _GatherFromModelParallelRegion(torch.autograd.Function): """Gather the input from model parallel region and concatinate.""" + @staticmethod def forward(ctx, input_): return _gather(input_) @@ -834,13 +765,8 @@ def gather_from_model_parallel_region(input_): class ColumnParallelLinear_Compress(LinearLayer_Compress): - def __init__(self, - mpu, - input_size, - output_size, - bias=True, - gather_output=True, - skip_bias_add=False): + + def __init__(self, mpu, input_size, output_size, bias=True, gather_output=True, skip_bias_add=False): # Keep input parameters global g_mpu g_mpu = mpu @@ -854,10 +780,7 @@ class ColumnParallelLinear_Compress(LinearLayer_Compress): assert output_size % world_size == 0 self.output_size_per_partition = output_size // world_size - super(ColumnParallelLinear_Compress, - self).__init__(self.input_size, - self.output_size_per_partition, - bias=bias) + super(ColumnParallelLinear_Compress, self).__init__(self.input_size, self.output_size_per_partition, bias=bias) def forward(self, input_): # Set up backprop all-reduce. @@ -877,13 +800,8 @@ class ColumnParallelLinear_Compress(LinearLayer_Compress): class RowParallelLinear_Compress(LinearLayer_Compress): - def __init__(self, - mpu, - input_size, - output_size, - bias=True, - input_is_parallel=False, - skip_bias_add=False): + + def __init__(self, mpu, input_size, output_size, bias=True, input_is_parallel=False, skip_bias_add=False): # Keep input parameters global g_mpu g_mpu = mpu @@ -897,10 +815,7 @@ class RowParallelLinear_Compress(LinearLayer_Compress): assert input_size % world_size == 0 self.input_size_per_partition = input_size // world_size - super(RowParallelLinear_Compress, - self).__init__(self.input_size_per_partition, - self.output_size, - bias=bias) + super(RowParallelLinear_Compress, self).__init__(self.input_size_per_partition, self.output_size, bias=bias) def forward(self, input_): # Set up backprop all-reduce. diff --git a/deepspeed/compression/compress.py b/deepspeed/compression/compress.py index bf3b6c2760fa5b31b737ef62eea162e5418d4d74..37d98d9496fd213cea72dd23fe26760a26d94a50 100644 --- a/deepspeed/compression/compress.py +++ b/deepspeed/compression/compress.py @@ -1,4 +1,7 @@ -'''Copyright The Microsoft DeepSpeed Team''' +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team import re from .helper import compression_preparation, fix_compression, recursive_getattr, is_module_compressible @@ -13,21 +16,13 @@ def check_deepspeed_config(config): if isinstance(config, dict): return config elif os.path.exists(config): - return json.load(open(config, - "r"), - object_pairs_hook=dict_raise_error_on_duplicate_keys) + return json.load(open(config, "r"), object_pairs_hook=dict_raise_error_on_duplicate_keys) else: raise ValueError( - f"Expected a string path to an existing deepspeed config, or a dictionary. Received: {config}" - ) + f"Expected a string path to an existing deepspeed config, or a dictionary. 
Received: {config}") -def get_module_name(group_name, - model, - key_word, - exist_module_name, - mpu=None, - verbose=True): +def get_module_name(group_name, model, key_word, exist_module_name, mpu=None, verbose=True): ''' get the associated module name from the model based on the key_word provided by users ''' @@ -40,8 +35,7 @@ def get_module_name(group_name, if name in exist_module_name and verbose: # logger.warning raise ValueError( - f"{name} is already added to compression, please check your config file for {group_name}." - ) + f"{name} is already added to compression, please check your config file for {group_name}.") if name not in exist_module_name: exist_module_name.add(name) return_module_name.append(name) @@ -56,8 +50,7 @@ def get_compress_methods(model, compress_methods, mpu=None): continue # for loop different methods, i.e., weight quantization, activation quantization etc exist_module_name = set() - shared_parameters = method_content[ - SHARED_PARAMETERS] # get all the shared parameters + shared_parameters = method_content[SHARED_PARAMETERS] # get all the shared parameters for group_name, method_parameters in method_content[DIFFERENT_GROUPS].items(): # for loop different groups, i.e., weight quantization group 1, weight quantization group 2 etc module_name_list = [] @@ -65,8 +58,13 @@ def get_compress_methods(model, compress_methods, mpu=None): if method_parameters[DIFFERENT_GROUPS_RELATED_MODULE_SCOPE]: # this is used for head/row/channel pruning, if users provide the related module scope, we can shrink the layer dim for them # otherwise we just mask those as zeros - for key_word, related_key_words in zip(method_parameters[DIFFERENT_GROUPS_MODULE_SCOPE], method_parameters[DIFFERENT_GROUPS_RELATED_MODULE_SCOPE]): - module_name, exist_module_name = get_module_name(group_name, model, key_word, exist_module_name, mpu=mpu) + for key_word, related_key_words in zip(method_parameters[DIFFERENT_GROUPS_MODULE_SCOPE], + method_parameters[DIFFERENT_GROUPS_RELATED_MODULE_SCOPE]): + module_name, exist_module_name = get_module_name(group_name, + model, + key_word, + exist_module_name, + mpu=mpu) module_name_list.append(module_name) tmp_related_module_name_list = [] for rkw in related_key_words: @@ -76,7 +74,11 @@ def get_compress_methods(model, compress_methods, mpu=None): related_module_name_list.append(tmp_related_module_name_list) else: for key_word in method_parameters[DIFFERENT_GROUPS_MODULE_SCOPE]: - module_name, exist_module_name = get_module_name(group_name, model, key_word, exist_module_name, mpu=mpu) + module_name, exist_module_name = get_module_name(group_name, + model, + key_word, + exist_module_name, + mpu=mpu) module_name_list.append(module_name) if module_name_list: @@ -85,13 +87,7 @@ def get_compress_methods(model, compress_methods, mpu=None): **(method_parameters.copy().pop(DIFFERENT_GROUPS_PARAMETERS)), **shared_parameters } - compression_item = [ - module_name_list, - related_module_name_list, - { - method: combined_method_parameters - } - ] + compression_item = [module_name_list, related_module_name_list, {method: combined_method_parameters}] layer_added_compress_methods.append(compression_item) return layer_added_compress_methods @@ -118,9 +114,7 @@ def init_compression(model, deepspeed_config, teacher_model=None, mpu=None): assert teacher_model is not None, "Teacher model is required for layer reduction" student_initialization(c_model, teacher_model, deepspeed_config) - layer_added_compress_methods = get_compress_methods(c_model, - compress_methods, - mpu=mpu) + 
layer_added_compress_methods = get_compress_methods(c_model, compress_methods, mpu=mpu) compression_preparation(c_model, layer_added_compress_methods, mpu) return model @@ -143,31 +137,20 @@ def redundancy_clean(model, deepspeed_config, mpu=None): else: c_model = model - layer_added_compress_methods_tmp = get_compress_methods(c_model, - compress_methods, - mpu=mpu) + layer_added_compress_methods_tmp = get_compress_methods(c_model, compress_methods, mpu=mpu) # sort methods order_list = [ - WEIGHT_QUANTIZATION, - SPARSE_PRUNING, - ROW_PRUNING, - HEAD_PRUNING, - CHANNEL_PRUNING, - ACTIVATION_QUANTIZATION + WEIGHT_QUANTIZATION, SPARSE_PRUNING, ROW_PRUNING, HEAD_PRUNING, CHANNEL_PRUNING, ACTIVATION_QUANTIZATION ] - layer_added_compress_methods = sorted( - layer_added_compress_methods_tmp, - key=lambda x: order_list.index(list(x[2].keys())[0])) + layer_added_compress_methods = sorted(layer_added_compress_methods_tmp, + key=lambda x: order_list.index(list(x[2].keys())[0])) for module_name_lists, related_module_name_lists, compression_technique in layer_added_compress_methods: stored_mask = [] need_mask = True if related_module_name_lists else False for i, mnl in enumerate(module_name_lists): for module_name in mnl: - mask = fix_compression(c_model, - module_name, - compression_technique, - dim_reduction=need_mask) + mask = fix_compression(c_model, module_name, compression_technique, dim_reduction=need_mask) if need_mask: stored_mask.append(mask) if need_mask: @@ -219,10 +202,8 @@ def student_initialization(student_model, teacher_model, deepspeed_config): ''' assert len(student_layer) == len(teacher_layer) for s_name, t_name in zip(student_layer, teacher_layer): - s_module = recursive_getattr(student_model, - module_name_prefix + '.' + str(s_name)) - t_module = recursive_getattr(teacher_model, - module_name_prefix + '.' + str(t_name)) + s_module = recursive_getattr(student_model, module_name_prefix + '.' + str(s_name)) + t_module = recursive_getattr(teacher_model, module_name_prefix + '.' + str(t_name)) for s_param, t_param in zip(s_module.parameters(), t_module.parameters()): s_param.data.copy_(t_param.data) for name in other_module_name: diff --git a/deepspeed/compression/config.py b/deepspeed/compression/config.py index e6a710dfa3ea5d6d0ea56b372233fb749f7ca1e8..d6e241bd0f804067c379541c470b480e0c4cdb65 100644 --- a/deepspeed/compression/config.py +++ b/deepspeed/compression/config.py @@ -1,4 +1,7 @@ -'''Copyright The Microsoft DeepSpeed Team''' +# Copyright (c) Microsoft Corporation. 
+# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team from .constants import * import copy @@ -36,9 +39,7 @@ def get_layer_reduction(param_dict): def get_layer_reduction_enabled(param_dict): if LAYER_REDUCTION in param_dict.keys(): - return get_scalar_param(param_dict[LAYER_REDUCTION], - LAYER_REDUCTION_ENABLED, - LAYER_REDUCTION_ENABLED_DEFAULT) + return get_scalar_param(param_dict[LAYER_REDUCTION], LAYER_REDUCTION_ENABLED, LAYER_REDUCTION_ENABLED_DEFAULT) else: return False @@ -70,7 +71,8 @@ def get_weight_quantization(param_dict): output[SHARED_PARAMETERS] = get_weight_quantization_shared_parameters(sub_param_dict) # each sub-groups if output[SHARED_PARAMETERS][WEIGHT_QUANTIZE_ENABLED]: - assert DIFFERENT_GROUPS in sub_param_dict.keys(), f"Weigh Quantization is enabled, {DIFFERENT_GROUPS} must be specified" + assert DIFFERENT_GROUPS in sub_param_dict.keys( + ), f"Weigh Quantization is enabled, {DIFFERENT_GROUPS} must be specified" output[DIFFERENT_GROUPS] = get_weight_quantization_different_groups(sub_param_dict) return output @@ -79,51 +81,38 @@ def get_weight_quantization_shared_parameters(param_dict): output = {} if SHARED_PARAMETERS in param_dict.keys(): sub_param_dict = param_dict[SHARED_PARAMETERS] - output[WEIGHT_QUANTIZE_ENABLED] = get_scalar_param( - sub_param_dict, - WEIGHT_QUANTIZE_ENABLED, - WEIGHT_QUANTIZE_ENABLED_DEFAULT) - output[WEIGHT_QUANTIZE_KERNEL] = get_scalar_param( - sub_param_dict, - WEIGHT_QUANTIZE_KERNEL, - WEIGHT_QUANTIZE_KERNEL_DEFAULT) - output[WEIGHT_QUANTIZE_SCHEDULE_OFFSET] = get_scalar_param( - sub_param_dict, - WEIGHT_QUANTIZE_SCHEDULE_OFFSET, - WEIGHT_QUANTIZE_SCHEDULE_OFFSET_DEFAULT) - output[WEIGHT_QUANTIZE_GROUPS] = get_scalar_param( - sub_param_dict, - WEIGHT_QUANTIZE_GROUPS, - WEIGHT_QUANTIZE_GROUPS_DEFAULT) - output[WEIGHT_QUANTIZE_VERBOSE] = get_scalar_param( - sub_param_dict, - WEIGHT_QUANTIZE_VERBOSE, - WEIGHT_QUANTIZE_VERBOSE_DEFAULT) - output[WEIGHT_QUANTIZE_TYPE] = get_scalar_param(sub_param_dict, - WEIGHT_QUANTIZE_TYPE, + output[WEIGHT_QUANTIZE_ENABLED] = get_scalar_param(sub_param_dict, WEIGHT_QUANTIZE_ENABLED, + WEIGHT_QUANTIZE_ENABLED_DEFAULT) + output[WEIGHT_QUANTIZE_KERNEL] = get_scalar_param(sub_param_dict, WEIGHT_QUANTIZE_KERNEL, + WEIGHT_QUANTIZE_KERNEL_DEFAULT) + output[WEIGHT_QUANTIZE_SCHEDULE_OFFSET] = get_scalar_param(sub_param_dict, WEIGHT_QUANTIZE_SCHEDULE_OFFSET, + WEIGHT_QUANTIZE_SCHEDULE_OFFSET_DEFAULT) + output[WEIGHT_QUANTIZE_GROUPS] = get_scalar_param(sub_param_dict, WEIGHT_QUANTIZE_GROUPS, + WEIGHT_QUANTIZE_GROUPS_DEFAULT) + output[WEIGHT_QUANTIZE_VERBOSE] = get_scalar_param(sub_param_dict, WEIGHT_QUANTIZE_VERBOSE, + WEIGHT_QUANTIZE_VERBOSE_DEFAULT) + output[WEIGHT_QUANTIZE_TYPE] = get_scalar_param(sub_param_dict, WEIGHT_QUANTIZE_TYPE, WEIGHT_QUANTIZE_TYPE_DEFAULT) - output[WEIGHT_QUANTIZE_IN_FORWARD_ENABLED] = get_scalar_param( - sub_param_dict, - WEIGHT_QUANTIZE_IN_FORWARD_ENABLED, - WEIGHT_QUANTIZE_IN_FORWARD_ENABLED_DEFAULT) - assert output[WEIGHT_QUANTIZE_TYPE] in [WEIGHT_QUANTIZE_SYMMETRIC, WEIGHT_QUANTIZE_ASYMMETRIC], f"Invalid weight quantize type. Supported types: [{WEIGHT_QUANTIZE_SYMMETRIC}, {WEIGHT_QUANTIZE_ASYMMETRIC}]" - output[WEIGHT_QUANTIZE_ROUNDING] = get_scalar_param( - sub_param_dict, - WEIGHT_QUANTIZE_ROUNDING, - WEIGHT_QUANTIZE_ROUNDING_DEFAULT) - assert output[WEIGHT_QUANTIZE_ROUNDING] in [WEIGHT_QUANTIZE_NEAREST_ROUNDING, WEIGHT_QUANTIZE_STOCHASTIC_ROUNDING], f"Invalid weight quantize rounding. 
Supported types: [{WEIGHT_QUANTIZE_NEAREST_ROUNDING}, {WEIGHT_QUANTIZE_STOCHASTIC_ROUNDING}]" + output[WEIGHT_QUANTIZE_IN_FORWARD_ENABLED] = get_scalar_param(sub_param_dict, + WEIGHT_QUANTIZE_IN_FORWARD_ENABLED, + WEIGHT_QUANTIZE_IN_FORWARD_ENABLED_DEFAULT) + assert output[WEIGHT_QUANTIZE_TYPE] in [ + WEIGHT_QUANTIZE_SYMMETRIC, WEIGHT_QUANTIZE_ASYMMETRIC + ], f"Invalid weight quantize type. Supported types: [{WEIGHT_QUANTIZE_SYMMETRIC}, {WEIGHT_QUANTIZE_ASYMMETRIC}]" + output[WEIGHT_QUANTIZE_ROUNDING] = get_scalar_param(sub_param_dict, WEIGHT_QUANTIZE_ROUNDING, + WEIGHT_QUANTIZE_ROUNDING_DEFAULT) + assert output[WEIGHT_QUANTIZE_ROUNDING] in [ + WEIGHT_QUANTIZE_NEAREST_ROUNDING, WEIGHT_QUANTIZE_STOCHASTIC_ROUNDING + ], f"Invalid weight quantize rounding. Supported types: [{WEIGHT_QUANTIZE_NEAREST_ROUNDING}, {WEIGHT_QUANTIZE_STOCHASTIC_ROUNDING}]" if WEIGHT_QUANTIZE_FP16_MIXED_QUANTIZE in sub_param_dict.keys(): output[WEIGHT_QUANTIZE_FP16_MIXED_QUANTIZE] = get_scalar_param( - sub_param_dict[WEIGHT_QUANTIZE_FP16_MIXED_QUANTIZE], - WEIGHT_QUANTIZE_FP16_MIXED_QUANTIZE_ENABLED, + sub_param_dict[WEIGHT_QUANTIZE_FP16_MIXED_QUANTIZE], WEIGHT_QUANTIZE_FP16_MIXED_QUANTIZE_ENABLED, WEIGHT_QUANTIZE_FP16_MIXED_QUANTIZE_ENABLED_DEFAULT) output[WEIGHT_QUANTIZE_CHANGE_RATIO] = get_scalar_param( - sub_param_dict[WEIGHT_QUANTIZE_FP16_MIXED_QUANTIZE], - WEIGHT_QUANTIZE_CHANGE_RATIO, + sub_param_dict[WEIGHT_QUANTIZE_FP16_MIXED_QUANTIZE], WEIGHT_QUANTIZE_CHANGE_RATIO, WEIGHT_QUANTIZE_CHANGE_RATIO_DEFAULT) else: - output[ - WEIGHT_QUANTIZE_FP16_MIXED_QUANTIZE] = WEIGHT_QUANTIZE_FP16_MIXED_QUANTIZE_ENABLED_DEFAULT + output[WEIGHT_QUANTIZE_FP16_MIXED_QUANTIZE] = WEIGHT_QUANTIZE_FP16_MIXED_QUANTIZE_ENABLED_DEFAULT output[WEIGHT_QUANTIZE_CHANGE_RATIO] = WEIGHT_QUANTIZE_CHANGE_RATIO_DEFAULT else: output[WEIGHT_QUANTIZE_ENABLED] = WEIGHT_QUANTIZE_ENABLED_DEFAULT @@ -133,8 +122,7 @@ def get_weight_quantization_shared_parameters(param_dict): output[WEIGHT_QUANTIZE_VERBOSE] = WEIGHT_QUANTIZE_VERBOSE_DEFAULT output[WEIGHT_QUANTIZE_TYPE] = WEIGHT_QUANTIZE_TYPE_DEFAULT output[WEIGHT_QUANTIZE_ROUNDING] = WEIGHT_QUANTIZE_ROUNDING_DEFAULT - output[ - WEIGHT_QUANTIZE_FP16_MIXED_QUANTIZE] = WEIGHT_QUANTIZE_FP16_MIXED_QUANTIZE_ENABLED_DEFAULT + output[WEIGHT_QUANTIZE_FP16_MIXED_QUANTIZE] = WEIGHT_QUANTIZE_FP16_MIXED_QUANTIZE_ENABLED_DEFAULT output[WEIGHT_QUANTIZE_CHANGE_RATIO] = WEIGHT_QUANTIZE_CHANGE_RATIO_DEFAULT return output @@ -144,27 +132,21 @@ def get_weight_quantization_different_groups(param_dict): sub_param_dict = param_dict[DIFFERENT_GROUPS] def get_params(name, group_dict): - assert WEIGHT_QUANTIZE_START_BITS in group_dict.keys(), f"{WEIGHT_QUANTIZE_START_BITS} must be specified for weight quantization group {name}" - assert WEIGHT_QUANTIZE_TARGET_BITS in group_dict.keys(), f"{WEIGHT_QUANTIZE_TARGET_BITS} must be specified for weight quantization group {name}" - group_dict[WEIGHT_QUANTIZATION_PERIOD] = get_scalar_param( - group_dict, - WEIGHT_QUANTIZATION_PERIOD, - WEIGHT_QUANTIZATION_PERIOD_DEFAULT) + assert WEIGHT_QUANTIZE_START_BITS in group_dict.keys( + ), f"{WEIGHT_QUANTIZE_START_BITS} must be specified for weight quantization group {name}" + assert WEIGHT_QUANTIZE_TARGET_BITS in group_dict.keys( + ), f"{WEIGHT_QUANTIZE_TARGET_BITS} must be specified for weight quantization group {name}" + group_dict[WEIGHT_QUANTIZATION_PERIOD] = get_scalar_param(group_dict, WEIGHT_QUANTIZATION_PERIOD, + WEIGHT_QUANTIZATION_PERIOD_DEFAULT) return group_dict for k, v in sub_param_dict.items(): output[k] = {} - 
output[k][DIFFERENT_GROUPS_PARAMETERS] = get_params( - k, - sub_param_dict[k][DIFFERENT_GROUPS_PARAMETERS]) - output[k][DIFFERENT_GROUPS_MODULE_SCOPE] = get_scalar_param( - sub_param_dict[k], - DIFFERENT_GROUPS_MODULE_SCOPE, - DIFFERENT_GROUPS_MODULE_SCOPE_DEFAULT) + output[k][DIFFERENT_GROUPS_PARAMETERS] = get_params(k, sub_param_dict[k][DIFFERENT_GROUPS_PARAMETERS]) + output[k][DIFFERENT_GROUPS_MODULE_SCOPE] = get_scalar_param(sub_param_dict[k], DIFFERENT_GROUPS_MODULE_SCOPE, + DIFFERENT_GROUPS_MODULE_SCOPE_DEFAULT) output[k][DIFFERENT_GROUPS_RELATED_MODULE_SCOPE] = get_scalar_param( - sub_param_dict[k], - DIFFERENT_GROUPS_RELATED_MODULE_SCOPE, - DIFFERENT_GROUPS_RELATED_MODULE_SCOPE_DEFAULT) + sub_param_dict[k], DIFFERENT_GROUPS_RELATED_MODULE_SCOPE, DIFFERENT_GROUPS_RELATED_MODULE_SCOPE_DEFAULT) return output @@ -172,19 +154,15 @@ def get_weight_quantization_different_groups(param_dict): def get_activation_quantization(param_dict): output = {} if ACTIVATION_QUANTIZATION not in param_dict.keys(): - param_dict[ACTIVATION_QUANTIZATION] = { - SHARED_PARAMETERS: {}, - DIFFERENT_GROUPS: {} - } + param_dict[ACTIVATION_QUANTIZATION] = {SHARED_PARAMETERS: {}, DIFFERENT_GROUPS: {}} sub_param_dict = param_dict[ACTIVATION_QUANTIZATION] # shared parameters - output[SHARED_PARAMETERS] = get_activation_quantization_shared_parameters( - sub_param_dict) + output[SHARED_PARAMETERS] = get_activation_quantization_shared_parameters(sub_param_dict) # each sub-groups if output[SHARED_PARAMETERS][ACTIVATION_QUANTIZATION_ENABLED]: - assert DIFFERENT_GROUPS in sub_param_dict.keys(), f"Activation Quantization is enabled, {DIFFERENT_GROUPS} must be specified" - output[DIFFERENT_GROUPS] = get_activation_quantization_different_groups( - sub_param_dict) + assert DIFFERENT_GROUPS in sub_param_dict.keys( + ), f"Activation Quantization is enabled, {DIFFERENT_GROUPS} must be specified" + output[DIFFERENT_GROUPS] = get_activation_quantization_different_groups(sub_param_dict) return output @@ -192,30 +170,26 @@ def get_activation_quantization_shared_parameters(param_dict): output = {} if SHARED_PARAMETERS in param_dict.keys(): sub_param_dict = param_dict[SHARED_PARAMETERS] - output[ACTIVATION_QUANTIZATION_ENABLED] = get_scalar_param( - sub_param_dict, - ACTIVATION_QUANTIZATION_ENABLED, - ACTIVATION_QUANTIZATION_ENABLED_DEFAULT) - output[ACTIVATION_QUANTIZE_TYPE] = get_scalar_param( - sub_param_dict, - ACTIVATION_QUANTIZE_TYPE, - ACTIVATION_QUANTIZE_TYPE_DEFAULT) - assert output[ACTIVATION_QUANTIZE_TYPE] in [ACTIVATION_QUANTIZE_SYMMETRIC, ACTIVATION_QUANTIZE_ASYMMETRIC], f"Invalid activation quantize type. Supported types: [{ACTIVATION_QUANTIZE_SYMMETRIC}, {ACTIVATION_QUANTIZE_ASYMMETRIC}]" - output[ACTIVATION_QUANTIZE_RANGE] = get_scalar_param( - sub_param_dict, - ACTIVATION_QUANTIZE_RANGE, - ACTIVATION_QUANTIZE_RANGE_DEFAULT) - assert output[ACTIVATION_QUANTIZE_RANGE] in [ACTIVATION_QUANTIZE_RANGE_DYNAMIC, ACTIVATION_QUANTIZE_RANGE_STATIC], f"Invalid activation quantize range calibration. 
Supported types: [{ACTIVATION_QUANTIZE_RANGE_DYNAMIC}, {ACTIVATION_QUANTIZE_RANGE_STATIC}]" - output[ACTIVATION_QUANTIZE_SCHEDULE_OFFSET] = get_scalar_param( - sub_param_dict, - ACTIVATION_QUANTIZE_SCHEDULE_OFFSET, - ACTIVATION_QUANTIZE_SCHEDULE_OFFSET_DEFAULT) + output[ACTIVATION_QUANTIZATION_ENABLED] = get_scalar_param(sub_param_dict, ACTIVATION_QUANTIZATION_ENABLED, + ACTIVATION_QUANTIZATION_ENABLED_DEFAULT) + output[ACTIVATION_QUANTIZE_TYPE] = get_scalar_param(sub_param_dict, ACTIVATION_QUANTIZE_TYPE, + ACTIVATION_QUANTIZE_TYPE_DEFAULT) + assert output[ACTIVATION_QUANTIZE_TYPE] in [ + ACTIVATION_QUANTIZE_SYMMETRIC, ACTIVATION_QUANTIZE_ASYMMETRIC + ], f"Invalid activation quantize type. Supported types: [{ACTIVATION_QUANTIZE_SYMMETRIC}, {ACTIVATION_QUANTIZE_ASYMMETRIC}]" + output[ACTIVATION_QUANTIZE_RANGE] = get_scalar_param(sub_param_dict, ACTIVATION_QUANTIZE_RANGE, + ACTIVATION_QUANTIZE_RANGE_DEFAULT) + assert output[ACTIVATION_QUANTIZE_RANGE] in [ + ACTIVATION_QUANTIZE_RANGE_DYNAMIC, ACTIVATION_QUANTIZE_RANGE_STATIC + ], f"Invalid activation quantize range calibration. Supported types: [{ACTIVATION_QUANTIZE_RANGE_DYNAMIC}, {ACTIVATION_QUANTIZE_RANGE_STATIC}]" + output[ACTIVATION_QUANTIZE_SCHEDULE_OFFSET] = get_scalar_param(sub_param_dict, + ACTIVATION_QUANTIZE_SCHEDULE_OFFSET, + ACTIVATION_QUANTIZE_SCHEDULE_OFFSET_DEFAULT) else: output[ACTIVATION_QUANTIZATION_ENABLED] = ACTIVATION_QUANTIZATION_ENABLED_DEFAULT output[ACTIVATION_QUANTIZE_TYPE] = ACTIVATION_QUANTIZE_TYPE_DEFAULT output[ACTIVATION_QUANTIZE_RANGE] = ACTIVATION_QUANTIZE_RANGE_DEFAULT - output[ - ACTIVATION_QUANTIZE_SCHEDULE_OFFSET] = ACTIVATION_QUANTIZE_SCHEDULE_OFFSET_DEFAULT + output[ACTIVATION_QUANTIZE_SCHEDULE_OFFSET] = ACTIVATION_QUANTIZE_SCHEDULE_OFFSET_DEFAULT return output @@ -224,22 +198,17 @@ def get_activation_quantization_different_groups(param_dict): sub_param_dict = param_dict[DIFFERENT_GROUPS] def get_params(name, group_dict): - assert ACTIVATION_QUANTIZE_BITS in group_dict.keys(), f"{ACTIVATION_QUANTIZE_BITS} must be specified for activation quantization group {name}" + assert ACTIVATION_QUANTIZE_BITS in group_dict.keys( + ), f"{ACTIVATION_QUANTIZE_BITS} must be specified for activation quantization group {name}" return group_dict for k, v in sub_param_dict.items(): output[k] = {} - output[k][DIFFERENT_GROUPS_PARAMETERS] = get_params( - k, - sub_param_dict[k][DIFFERENT_GROUPS_PARAMETERS]) - output[k][DIFFERENT_GROUPS_MODULE_SCOPE] = get_scalar_param( - sub_param_dict[k], - DIFFERENT_GROUPS_MODULE_SCOPE, - DIFFERENT_GROUPS_MODULE_SCOPE_DEFAULT) + output[k][DIFFERENT_GROUPS_PARAMETERS] = get_params(k, sub_param_dict[k][DIFFERENT_GROUPS_PARAMETERS]) + output[k][DIFFERENT_GROUPS_MODULE_SCOPE] = get_scalar_param(sub_param_dict[k], DIFFERENT_GROUPS_MODULE_SCOPE, + DIFFERENT_GROUPS_MODULE_SCOPE_DEFAULT) output[k][DIFFERENT_GROUPS_RELATED_MODULE_SCOPE] = get_scalar_param( - sub_param_dict[k], - DIFFERENT_GROUPS_RELATED_MODULE_SCOPE, - DIFFERENT_GROUPS_RELATED_MODULE_SCOPE_DEFAULT) + sub_param_dict[k], DIFFERENT_GROUPS_RELATED_MODULE_SCOPE, DIFFERENT_GROUPS_RELATED_MODULE_SCOPE_DEFAULT) return output @@ -253,7 +222,8 @@ def get_sparse_pruning(param_dict): output[SHARED_PARAMETERS] = get_sparse_pruning_shared_parameters(sub_param_dict) # each sub-groups if output[SHARED_PARAMETERS][SPARSE_PRUNING_ENABLED]: - assert DIFFERENT_GROUPS in sub_param_dict.keys(), f"Sparse Pruning is enabled, {DIFFERENT_GROUPS} must be specified" + assert DIFFERENT_GROUPS in sub_param_dict.keys( + ), f"Sparse Pruning is enabled, 
{DIFFERENT_GROUPS} must be specified" output[DIFFERENT_GROUPS] = get_sparse_pruning_different_groups(sub_param_dict) return output @@ -262,18 +232,15 @@ def get_sparse_pruning_shared_parameters(param_dict): output = {} if SHARED_PARAMETERS in param_dict.keys(): sub_param_dict = param_dict[SHARED_PARAMETERS] - output[SPARSE_PRUNING_ENABLED] = get_scalar_param( - sub_param_dict, - SPARSE_PRUNING_ENABLED, - SPARSE_PRUNING_ENABLED_DEFAULT) - output[SPARSE_PRUNING_METHOD] = get_scalar_param(sub_param_dict, - SPARSE_PRUNING_METHOD, + output[SPARSE_PRUNING_ENABLED] = get_scalar_param(sub_param_dict, SPARSE_PRUNING_ENABLED, + SPARSE_PRUNING_ENABLED_DEFAULT) + output[SPARSE_PRUNING_METHOD] = get_scalar_param(sub_param_dict, SPARSE_PRUNING_METHOD, SPARSE_PRUNING_METHOD_DEFAULT) - assert output[SPARSE_PRUNING_METHOD] in [SPARSE_PRUNING_METHOD_L1, SPARSE_PRUNING_METHOD_TOPK], f"Invalid sparse pruning method. Supported types: [{SPARSE_PRUNING_METHOD_L1}, {SPARSE_PRUNING_METHOD_TOPK}]" - output[SPARSE_PRUNING_SCHEDULE_OFFSET] = get_scalar_param( - sub_param_dict, - SPARSE_PRUNING_SCHEDULE_OFFSET, - SPARSE_PRUNING_SCHEDULE_OFFSET_DEFAULT) + assert output[SPARSE_PRUNING_METHOD] in [ + SPARSE_PRUNING_METHOD_L1, SPARSE_PRUNING_METHOD_TOPK + ], f"Invalid sparse pruning method. Supported types: [{SPARSE_PRUNING_METHOD_L1}, {SPARSE_PRUNING_METHOD_TOPK}]" + output[SPARSE_PRUNING_SCHEDULE_OFFSET] = get_scalar_param(sub_param_dict, SPARSE_PRUNING_SCHEDULE_OFFSET, + SPARSE_PRUNING_SCHEDULE_OFFSET_DEFAULT) else: output[SPARSE_PRUNING_ENABLED] = SPARSE_PRUNING_ENABLED_DEFAULT output[SPARSE_PRUNING_METHOD] = SPARSE_PRUNING_METHOD_DEFAULT @@ -286,22 +253,17 @@ def get_sparse_pruning_different_groups(param_dict): sub_param_dict = param_dict[DIFFERENT_GROUPS] def get_params(name, group_dict): - assert SPARSE_PRUNING_DENSE_RATIO in group_dict.keys(), f"{SPARSE_PRUNING_DENSE_RATIO} must be specified for sparse pruning group {name}" + assert SPARSE_PRUNING_DENSE_RATIO in group_dict.keys( + ), f"{SPARSE_PRUNING_DENSE_RATIO} must be specified for sparse pruning group {name}" return group_dict for k, v in sub_param_dict.items(): output[k] = {} - output[k][DIFFERENT_GROUPS_PARAMETERS] = get_params( - k, - sub_param_dict[k][DIFFERENT_GROUPS_PARAMETERS]) - output[k][DIFFERENT_GROUPS_MODULE_SCOPE] = get_scalar_param( - sub_param_dict[k], - DIFFERENT_GROUPS_MODULE_SCOPE, - DIFFERENT_GROUPS_MODULE_SCOPE_DEFAULT) + output[k][DIFFERENT_GROUPS_PARAMETERS] = get_params(k, sub_param_dict[k][DIFFERENT_GROUPS_PARAMETERS]) + output[k][DIFFERENT_GROUPS_MODULE_SCOPE] = get_scalar_param(sub_param_dict[k], DIFFERENT_GROUPS_MODULE_SCOPE, + DIFFERENT_GROUPS_MODULE_SCOPE_DEFAULT) output[k][DIFFERENT_GROUPS_RELATED_MODULE_SCOPE] = get_scalar_param( - sub_param_dict[k], - DIFFERENT_GROUPS_RELATED_MODULE_SCOPE, - DIFFERENT_GROUPS_RELATED_MODULE_SCOPE_DEFAULT) + sub_param_dict[k], DIFFERENT_GROUPS_RELATED_MODULE_SCOPE, DIFFERENT_GROUPS_RELATED_MODULE_SCOPE_DEFAULT) return output @@ -315,7 +277,8 @@ def get_row_pruning(param_dict): output[SHARED_PARAMETERS] = get_row_pruning_shared_parameters(sub_param_dict) # each sub-groups if output[SHARED_PARAMETERS][ROW_PRUNING_ENABLED]: - assert DIFFERENT_GROUPS in sub_param_dict.keys(), f"Row Pruning is enabled, {DIFFERENT_GROUPS} must be specified" + assert DIFFERENT_GROUPS in sub_param_dict.keys( + ), f"Row Pruning is enabled, {DIFFERENT_GROUPS} must be specified" output[DIFFERENT_GROUPS] = get_row_pruning_different_groups(sub_param_dict) return output @@ -324,17 +287,14 @@ def 
get_row_pruning_shared_parameters(param_dict): output = {} if SHARED_PARAMETERS in param_dict.keys(): sub_param_dict = param_dict[SHARED_PARAMETERS] - output[ROW_PRUNING_ENABLED] = get_scalar_param(sub_param_dict, - ROW_PRUNING_ENABLED, + output[ROW_PRUNING_ENABLED] = get_scalar_param(sub_param_dict, ROW_PRUNING_ENABLED, ROW_PRUNING_ENABLED_DEFAULT) - output[ROW_PRUNING_METHOD] = get_scalar_param(sub_param_dict, - ROW_PRUNING_METHOD, - ROW_PRUNING_METHOD_DEFAULT) - assert output[ROW_PRUNING_METHOD] in [ROW_PRUNING_METHOD_L1, ROW_PRUNING_METHOD_TOPK], f"Invalid row pruning method. Supported types: [{ROW_PRUNING_METHOD_L1}, {ROW_PRUNING_METHOD_TOPK}]" - output[ROW_PRUNING_SCHEDULE_OFFSET] = get_scalar_param( - sub_param_dict, - ROW_PRUNING_SCHEDULE_OFFSET, - ROW_PRUNING_SCHEDULE_OFFSET_DEFAULT) + output[ROW_PRUNING_METHOD] = get_scalar_param(sub_param_dict, ROW_PRUNING_METHOD, ROW_PRUNING_METHOD_DEFAULT) + assert output[ROW_PRUNING_METHOD] in [ + ROW_PRUNING_METHOD_L1, ROW_PRUNING_METHOD_TOPK + ], f"Invalid row pruning method. Supported types: [{ROW_PRUNING_METHOD_L1}, {ROW_PRUNING_METHOD_TOPK}]" + output[ROW_PRUNING_SCHEDULE_OFFSET] = get_scalar_param(sub_param_dict, ROW_PRUNING_SCHEDULE_OFFSET, + ROW_PRUNING_SCHEDULE_OFFSET_DEFAULT) else: output[ROW_PRUNING_ENABLED] = ROW_PRUNING_ENABLED_DEFAULT output[ROW_PRUNING_METHOD] = ROW_PRUNING_METHOD_DEFAULT @@ -347,22 +307,17 @@ def get_row_pruning_different_groups(param_dict): sub_param_dict = param_dict[DIFFERENT_GROUPS] def get_params(name, group_dict): - assert ROW_PRUNING_DENSE_RATIO in group_dict.keys(), f"{ROW_PRUNING_DENSE_RATIO} must be specified for row pruning group {name}" + assert ROW_PRUNING_DENSE_RATIO in group_dict.keys( + ), f"{ROW_PRUNING_DENSE_RATIO} must be specified for row pruning group {name}" return group_dict for k, v in sub_param_dict.items(): output[k] = {} - output[k][DIFFERENT_GROUPS_PARAMETERS] = get_params( - k, - sub_param_dict[k][DIFFERENT_GROUPS_PARAMETERS]) - output[k][DIFFERENT_GROUPS_MODULE_SCOPE] = get_scalar_param( - sub_param_dict[k], - DIFFERENT_GROUPS_MODULE_SCOPE, - DIFFERENT_GROUPS_MODULE_SCOPE_DEFAULT) + output[k][DIFFERENT_GROUPS_PARAMETERS] = get_params(k, sub_param_dict[k][DIFFERENT_GROUPS_PARAMETERS]) + output[k][DIFFERENT_GROUPS_MODULE_SCOPE] = get_scalar_param(sub_param_dict[k], DIFFERENT_GROUPS_MODULE_SCOPE, + DIFFERENT_GROUPS_MODULE_SCOPE_DEFAULT) output[k][DIFFERENT_GROUPS_RELATED_MODULE_SCOPE] = get_scalar_param( - sub_param_dict[k], - DIFFERENT_GROUPS_RELATED_MODULE_SCOPE, - DIFFERENT_GROUPS_RELATED_MODULE_SCOPE_DEFAULT) + sub_param_dict[k], DIFFERENT_GROUPS_RELATED_MODULE_SCOPE, DIFFERENT_GROUPS_RELATED_MODULE_SCOPE_DEFAULT) return output @@ -375,7 +330,8 @@ def get_head_pruning(param_dict): output[SHARED_PARAMETERS] = get_head_pruning_shared_parameters(sub_param_dict) # each sub-groups if output[SHARED_PARAMETERS][HEAD_PRUNING_ENABLED]: - assert DIFFERENT_GROUPS in sub_param_dict.keys(), f"Head Pruning is enabled, {DIFFERENT_GROUPS} must be specified" + assert DIFFERENT_GROUPS in sub_param_dict.keys( + ), f"Head Pruning is enabled, {DIFFERENT_GROUPS} must be specified" output[DIFFERENT_GROUPS] = get_head_pruning_different_groups(sub_param_dict) return output @@ -384,19 +340,18 @@ def get_head_pruning_shared_parameters(param_dict): output = {} if SHARED_PARAMETERS in param_dict.keys(): sub_param_dict = param_dict[SHARED_PARAMETERS] - output[HEAD_PRUNING_ENABLED] = get_scalar_param(sub_param_dict, - HEAD_PRUNING_ENABLED, + output[HEAD_PRUNING_ENABLED] = get_scalar_param(sub_param_dict, 
HEAD_PRUNING_ENABLED, HEAD_PRUNING_ENABLED_DEFAULT) - output[HEAD_PRUNING_METHOD] = get_scalar_param(sub_param_dict, - HEAD_PRUNING_METHOD, + output[HEAD_PRUNING_METHOD] = get_scalar_param(sub_param_dict, HEAD_PRUNING_METHOD, HEAD_PRUNING_METHOD_DEFAULT) - assert output[HEAD_PRUNING_METHOD] in [HEAD_PRUNING_METHOD_L1, HEAD_PRUNING_METHOD_TOPK], f"Invalid head pruning method. Supported types: [{HEAD_PRUNING_METHOD_L1}, {HEAD_PRUNING_METHOD_TOPK}]" - output[HEAD_PRUNING_SCHEDULE_OFFSET] = get_scalar_param( - sub_param_dict, - HEAD_PRUNING_SCHEDULE_OFFSET, - HEAD_PRUNING_SCHEDULE_OFFSET_DEFAULT) + assert output[HEAD_PRUNING_METHOD] in [ + HEAD_PRUNING_METHOD_L1, HEAD_PRUNING_METHOD_TOPK + ], f"Invalid head pruning method. Supported types: [{HEAD_PRUNING_METHOD_L1}, {HEAD_PRUNING_METHOD_TOPK}]" + output[HEAD_PRUNING_SCHEDULE_OFFSET] = get_scalar_param(sub_param_dict, HEAD_PRUNING_SCHEDULE_OFFSET, + HEAD_PRUNING_SCHEDULE_OFFSET_DEFAULT) if output[HEAD_PRUNING_ENABLED]: - assert HEAD_PRUNING_NUM_HEADS in sub_param_dict.keys(), f"{HEAD_PRUNING_NUM_HEADS} must be specified for head pruning" + assert HEAD_PRUNING_NUM_HEADS in sub_param_dict.keys( + ), f"{HEAD_PRUNING_NUM_HEADS} must be specified for head pruning" output[HEAD_PRUNING_NUM_HEADS] = sub_param_dict[HEAD_PRUNING_NUM_HEADS] else: output[HEAD_PRUNING_ENABLED] = HEAD_PRUNING_ENABLED_DEFAULT @@ -410,22 +365,17 @@ def get_head_pruning_different_groups(param_dict): sub_param_dict = param_dict[DIFFERENT_GROUPS] def get_params(name, group_dict): - assert HEAD_PRUNING_DENSE_RATIO in group_dict.keys(), f"dense_ratio must be specified for head pruning group {name}" + assert HEAD_PRUNING_DENSE_RATIO in group_dict.keys( + ), f"dense_ratio must be specified for head pruning group {name}" return group_dict for k, v in sub_param_dict.items(): output[k] = {} - output[k][DIFFERENT_GROUPS_PARAMETERS] = get_params( - k, - sub_param_dict[k][DIFFERENT_GROUPS_PARAMETERS]) - output[k][DIFFERENT_GROUPS_MODULE_SCOPE] = get_scalar_param( - sub_param_dict[k], - DIFFERENT_GROUPS_MODULE_SCOPE, - DIFFERENT_GROUPS_MODULE_SCOPE_DEFAULT) + output[k][DIFFERENT_GROUPS_PARAMETERS] = get_params(k, sub_param_dict[k][DIFFERENT_GROUPS_PARAMETERS]) + output[k][DIFFERENT_GROUPS_MODULE_SCOPE] = get_scalar_param(sub_param_dict[k], DIFFERENT_GROUPS_MODULE_SCOPE, + DIFFERENT_GROUPS_MODULE_SCOPE_DEFAULT) output[k][DIFFERENT_GROUPS_RELATED_MODULE_SCOPE] = get_scalar_param( - sub_param_dict[k], - DIFFERENT_GROUPS_RELATED_MODULE_SCOPE, - DIFFERENT_GROUPS_RELATED_MODULE_SCOPE_DEFAULT) + sub_param_dict[k], DIFFERENT_GROUPS_RELATED_MODULE_SCOPE, DIFFERENT_GROUPS_RELATED_MODULE_SCOPE_DEFAULT) return output @@ -438,7 +388,8 @@ def get_channel_pruning(param_dict): output[SHARED_PARAMETERS] = get_channel_pruning_shared_parameters(sub_param_dict) # each sub-groups if output[SHARED_PARAMETERS][CHANNEL_PRUNING_ENABLED]: - assert DIFFERENT_GROUPS in sub_param_dict.keys(), f"Sparse Pruning is enabled, {DIFFERENT_GROUPS} must be specified" + assert DIFFERENT_GROUPS in sub_param_dict.keys( + ), f"Sparse Pruning is enabled, {DIFFERENT_GROUPS} must be specified" output[DIFFERENT_GROUPS] = get_channel_pruning_different_groups(sub_param_dict) return output @@ -447,19 +398,15 @@ def get_channel_pruning_shared_parameters(param_dict): output = {} if SHARED_PARAMETERS in param_dict.keys(): sub_param_dict = param_dict[SHARED_PARAMETERS] - output[CHANNEL_PRUNING_ENABLED] = get_scalar_param( - sub_param_dict, - CHANNEL_PRUNING_ENABLED, - CHANNEL_PRUNING_ENABLED_DEFAULT) - output[CHANNEL_PRUNING_METHOD] = 
get_scalar_param( - sub_param_dict, - CHANNEL_PRUNING_METHOD, - CHANNEL_PRUNING_METHOD_DEFAULT) - assert output[CHANNEL_PRUNING_METHOD] in [CHANNEL_PRUNING_METHOD_L1, CHANNEL_PRUNING_METHOD_TOPK], f"Invalid channel pruning method. Supported types: [{CHANNEL_PRUNING_METHOD_L1}, {CHANNEL_PRUNING_METHOD_TOPK}]" - output[CHANNEL_PRUNING_SCHEDULE_OFFSET] = get_scalar_param( - sub_param_dict, - CHANNEL_PRUNING_SCHEDULE_OFFSET, - CHANNEL_PRUNING_SCHEDULE_OFFSET_DEFAULT) + output[CHANNEL_PRUNING_ENABLED] = get_scalar_param(sub_param_dict, CHANNEL_PRUNING_ENABLED, + CHANNEL_PRUNING_ENABLED_DEFAULT) + output[CHANNEL_PRUNING_METHOD] = get_scalar_param(sub_param_dict, CHANNEL_PRUNING_METHOD, + CHANNEL_PRUNING_METHOD_DEFAULT) + assert output[CHANNEL_PRUNING_METHOD] in [ + CHANNEL_PRUNING_METHOD_L1, CHANNEL_PRUNING_METHOD_TOPK + ], f"Invalid channel pruning method. Supported types: [{CHANNEL_PRUNING_METHOD_L1}, {CHANNEL_PRUNING_METHOD_TOPK}]" + output[CHANNEL_PRUNING_SCHEDULE_OFFSET] = get_scalar_param(sub_param_dict, CHANNEL_PRUNING_SCHEDULE_OFFSET, + CHANNEL_PRUNING_SCHEDULE_OFFSET_DEFAULT) else: output[CHANNEL_PRUNING_ENABLED] = CHANNEL_PRUNING_ENABLED_DEFAULT output[CHANNEL_PRUNING_METHOD] = CHANNEL_PRUNING_METHOD_DEFAULT @@ -472,21 +419,16 @@ def get_channel_pruning_different_groups(param_dict): sub_param_dict = param_dict[DIFFERENT_GROUPS] def get_params(name, group_dict): - assert CHANNEL_PRUNING_DENSE_RATIO in group_dict.keys(), f"{CHANNEL_PRUNING_DENSE_RATIO} must be specified for channel pruning group {name}" + assert CHANNEL_PRUNING_DENSE_RATIO in group_dict.keys( + ), f"{CHANNEL_PRUNING_DENSE_RATIO} must be specified for channel pruning group {name}" return group_dict for k, v in sub_param_dict.items(): output[k] = {} - output[k][DIFFERENT_GROUPS_PARAMETERS] = get_params( - k, - sub_param_dict[k][DIFFERENT_GROUPS_PARAMETERS]) - output[k][DIFFERENT_GROUPS_MODULE_SCOPE] = get_scalar_param( - sub_param_dict[k], - DIFFERENT_GROUPS_MODULE_SCOPE, - DIFFERENT_GROUPS_MODULE_SCOPE_DEFAULT) + output[k][DIFFERENT_GROUPS_PARAMETERS] = get_params(k, sub_param_dict[k][DIFFERENT_GROUPS_PARAMETERS]) + output[k][DIFFERENT_GROUPS_MODULE_SCOPE] = get_scalar_param(sub_param_dict[k], DIFFERENT_GROUPS_MODULE_SCOPE, + DIFFERENT_GROUPS_MODULE_SCOPE_DEFAULT) output[k][DIFFERENT_GROUPS_RELATED_MODULE_SCOPE] = get_scalar_param( - sub_param_dict[k], - DIFFERENT_GROUPS_RELATED_MODULE_SCOPE, - DIFFERENT_GROUPS_RELATED_MODULE_SCOPE_DEFAULT) + sub_param_dict[k], DIFFERENT_GROUPS_RELATED_MODULE_SCOPE, DIFFERENT_GROUPS_RELATED_MODULE_SCOPE_DEFAULT) return output diff --git a/deepspeed/compression/constants.py b/deepspeed/compression/constants.py index 593b86e5f5c9cac03edee0d87298aad8d09d41e9..18268e3bbae40bbf045fc61df8af5e321dafb902 100644 --- a/deepspeed/compression/constants.py +++ b/deepspeed/compression/constants.py @@ -1,4 +1,7 @@ -'''Copyright The Microsoft DeepSpeed Team''' +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team ######################################### # Compression Methods diff --git a/deepspeed/compression/helper.py b/deepspeed/compression/helper.py index e839a5d035823725266f7ca85cc8ab693e91de53..fdca916e9f15cee2506364076ad37ae5c8364a9b 100644 --- a/deepspeed/compression/helper.py +++ b/deepspeed/compression/helper.py @@ -1,4 +1,7 @@ -'''Copyright The Microsoft DeepSpeed Team''' +# Copyright (c) Microsoft Corporation. 
+# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team import torch from .basic_layer import Embedding_Compress, LinearLayer_Compress, Conv2dLayer_Compress, BNLayer_Compress, ColumnParallelLinear_Compress, RowParallelLinear_Compress @@ -59,23 +62,17 @@ def module_replacement(model, module_name, compression_technique=None, mpu=None) need_bias = True # Initialize the new module - if isinstance(old_module, - LinearLayer_Compress) or isinstance(old_module, - torch.nn.Linear): + if isinstance(old_module, LinearLayer_Compress) or isinstance(old_module, torch.nn.Linear): if isinstance(old_module, LinearLayer_Compress): new_module = old_module else: - new_module = LinearLayer_Compress(old_module.in_features, - old_module.out_features, - bias=need_bias).to( - device=old_module.weight.device, - dtype=old_module.weight.dtype) + new_module = LinearLayer_Compress(old_module.in_features, old_module.out_features, + bias=need_bias).to(device=old_module.weight.device, + dtype=old_module.weight.dtype) new_module.weight.data = old_module.weight.data if need_bias: new_module.bias.data = old_module.bias.data - elif isinstance(old_module, - Conv2dLayer_Compress) or isinstance(old_module, - torch.nn.Conv2d): + elif isinstance(old_module, Conv2dLayer_Compress) or isinstance(old_module, torch.nn.Conv2d): if isinstance(old_module, Conv2dLayer_Compress): new_module = old_module else: @@ -86,60 +83,48 @@ def module_replacement(model, module_name, compression_technique=None, mpu=None) if need_bias: new_module.bias.data = old_module.bias.data elif isinstance(old_module, torch.nn.BatchNorm2d): - new_module = BNLayer_Compress(old_module.num_features, - old_module.eps, - old_module.momentum, - old_module.affine, - old_module.track_running_stats).to( - old_module.weight.device, - old_module.weight.dtype) + new_module = BNLayer_Compress(old_module.num_features, old_module.eps, old_module.momentum, old_module.affine, + old_module.track_running_stats).to(old_module.weight.device, + old_module.weight.dtype) new_module.weight.data = old_module.weight.data if need_bias: new_module.bias.data = old_module.bias.data new_module.running_mean.data = old_module.running_mean.data new_module.running_var.data = old_module.running_var.data - elif isinstance(old_module, - Embedding_Compress) or isinstance(old_module, - torch.nn.Embedding): + elif isinstance(old_module, Embedding_Compress) or isinstance(old_module, torch.nn.Embedding): if isinstance(old_module, Embedding_Compress): new_module = old_module else: new_module = Embedding_Compress(old_module.num_embeddings, old_module.embedding_dim, old_module.padding_idx, old_module.max_norm, old_module.norm_type, \ old_module.scale_grad_by_freq, old_module.sparse).to(device=old_module.weight.device, dtype=old_module.weight.dtype) new_module.weight.data = old_module.weight.data - elif mpu is not None and (isinstance(old_module, - ColumnParallelLinear_Compress) - or isinstance(old_module, - mpu.ColumnParallelLinear)): + elif mpu is not None and (isinstance(old_module, ColumnParallelLinear_Compress) + or isinstance(old_module, mpu.ColumnParallelLinear)): if isinstance(old_module, ColumnParallelLinear_Compress): new_module = old_module else: - new_module = ColumnParallelLinear_Compress( - mpu, - old_module.input_size, - old_module.output_size, - gather_output=old_module.gather_output, - skip_bias_add=old_module.skip_bias_add, - bias=need_bias).to(device=old_module.weight.device, - dtype=old_module.weight.dtype) + new_module = ColumnParallelLinear_Compress(mpu, + old_module.input_size, + 
old_module.output_size, + gather_output=old_module.gather_output, + skip_bias_add=old_module.skip_bias_add, + bias=need_bias).to(device=old_module.weight.device, + dtype=old_module.weight.dtype) new_module.weight.data = old_module.weight.data if need_bias: new_module.bias.data = old_module.bias.data - elif mpu is not None and (isinstance(old_module, - RowParallelLinear_Compress) - or isinstance(old_module, - mpu.RowParallelLinear)): + elif mpu is not None and (isinstance(old_module, RowParallelLinear_Compress) + or isinstance(old_module, mpu.RowParallelLinear)): if isinstance(old_module, RowParallelLinear_Compress): new_module = old_module else: - new_module = RowParallelLinear_Compress( - mpu, - old_module.input_size, - old_module.output_size, - input_is_parallel=old_module.input_is_parallel, - skip_bias_add=old_module.skip_bias_add, - bias=need_bias).to(device=old_module.weight.device, - dtype=old_module.weight.dtype) + new_module = RowParallelLinear_Compress(mpu, + old_module.input_size, + old_module.output_size, + input_is_parallel=old_module.input_is_parallel, + skip_bias_add=old_module.skip_bias_add, + bias=need_bias).to(device=old_module.weight.device, + dtype=old_module.weight.dtype) new_module.weight.data = old_module.weight.data if need_bias: new_module.bias.data = old_module.bias.data @@ -150,39 +135,30 @@ def module_replacement(model, module_name, compression_technique=None, mpu=None) for k, v in compression_technique.items(): if k == SPARSE_PRUNING: if v[SPARSE_PRUNING_ENABLED]: - new_module.enable_sparse_pruning(v[SPARSE_PRUNING_DENSE_RATIO], - v[SPARSE_PRUNING_METHOD]) + new_module.enable_sparse_pruning(v[SPARSE_PRUNING_DENSE_RATIO], v[SPARSE_PRUNING_METHOD]) elif k == ROW_PRUNING: if v[ROW_PRUNING_ENABLED]: - new_module.enable_row_pruning(v[ROW_PRUNING_DENSE_RATIO], - v[ROW_PRUNING_METHOD]) + new_module.enable_row_pruning(v[ROW_PRUNING_DENSE_RATIO], v[ROW_PRUNING_METHOD]) elif k == HEAD_PRUNING: if v[HEAD_PRUNING_ENABLED]: - new_module.enable_head_pruning(v[HEAD_PRUNING_DENSE_RATIO], - v[HEAD_PRUNING_METHOD], + new_module.enable_head_pruning(v[HEAD_PRUNING_DENSE_RATIO], v[HEAD_PRUNING_METHOD], v[HEAD_PRUNING_NUM_HEADS]) elif k == ACTIVATION_QUANTIZATION: if v[ACTIVATION_QUANTIZATION_ENABLED]: - new_module.enable_activation_quantization( - v[ACTIVATION_QUANTIZE_BITS], - v[ACTIVATION_QUANTIZE_TYPE], - v[ACTIVATION_QUANTIZE_RANGE]) + new_module.enable_activation_quantization(v[ACTIVATION_QUANTIZE_BITS], v[ACTIVATION_QUANTIZE_TYPE], + v[ACTIVATION_QUANTIZE_RANGE]) elif k == WEIGHT_QUANTIZATION: if v[WEIGHT_QUANTIZE_ENABLED]: - new_module.enable_weight_quantization( - v[WEIGHT_QUANTIZE_START_BITS], - v[WEIGHT_QUANTIZE_TARGET_BITS], - v[WEIGHT_QUANTIZATION_PERIOD], - v[WEIGHT_QUANTIZE_IN_FORWARD_ENABLED], - v[WEIGHT_QUANTIZE_TYPE], - v[WEIGHT_QUANTIZE_GROUPS]) + new_module.enable_weight_quantization(v[WEIGHT_QUANTIZE_START_BITS], + v[WEIGHT_QUANTIZE_TARGET_BITS], + v[WEIGHT_QUANTIZATION_PERIOD], + v[WEIGHT_QUANTIZE_IN_FORWARD_ENABLED], + v[WEIGHT_QUANTIZE_TYPE], v[WEIGHT_QUANTIZE_GROUPS]) elif k == CHANNEL_PRUNING: if v[CHANNEL_PRUNING_ENABLED]: - new_module.enable_channel_pruning(v[CHANNEL_PRUNING_DENSE_RATIO], - v[CHANNEL_PRUNING_METHOD]) + new_module.enable_channel_pruning(v[CHANNEL_PRUNING_DENSE_RATIO], v[CHANNEL_PRUNING_METHOD]) else: - raise NotImplementedError( - 'Compression technique {} is not implemented'.format(k)) + raise NotImplementedError('Compression technique {} is not implemented'.format(k)) # Replace the old module with the new one recursive_setattr(model, 
module_name, new_module) @@ -195,10 +171,7 @@ def is_module_compressible(module, mpu=None): isinstance(module, torch.nn.BatchNorm2d) if mpu is not None: - ret = ret or isinstance(module, - mpu.RowParallelLinear) or isinstance( - module, - mpu.ColumnParallelLinear) + ret = ret or isinstance(module, mpu.RowParallelLinear) or isinstance(module, mpu.ColumnParallelLinear) return ret @@ -225,11 +198,7 @@ def compression_preparation(model, compression_techinique_list, mpu): return model -def fix_compression(model, - module_name, - compression_technique, - mask=None, - dim_reduction=False): +def fix_compression(model, module_name, compression_technique, mask=None, dim_reduction=False): """ Fix the compression technique of a module. Args: @@ -243,17 +212,14 @@ def fix_compression(model, # Here we can make things much simpler by just replacing the module module = recursive_getattr(model, module_name) for k, v in compression_technique.items(): - if k == WEIGHT_QUANTIZATION and v[WEIGHT_QUANTIZE_IN_FORWARD_ENABLED] and v[ - WEIGHT_QUANTIZE_ENABLED]: + if k == WEIGHT_QUANTIZATION and v[WEIGHT_QUANTIZE_IN_FORWARD_ENABLED] and v[WEIGHT_QUANTIZE_ENABLED]: return module.fix_weight_quantization() elif k == SPARSE_PRUNING and v[SPARSE_PRUNING_ENABLED]: return module.fix_sparse_pruning_helper() elif k == ROW_PRUNING and (v[ROW_PRUNING_ENABLED] or mask is not None): return module.fix_row_col_pruning_helper(mask, dim_reduction=dim_reduction) elif k == HEAD_PRUNING and (v[HEAD_PRUNING_ENABLED] or mask is not None): - return module.fix_head_pruning_helper(mask, - v[HEAD_PRUNING_NUM_HEADS], - dim_reduction=dim_reduction) + return module.fix_head_pruning_helper(mask, v[HEAD_PRUNING_NUM_HEADS], dim_reduction=dim_reduction) elif k == CHANNEL_PRUNING and (v[CHANNEL_PRUNING_ENABLED] or mask is not None): return module.fix_channel_pruning_helper(mask, dim_reduction=dim_reduction) @@ -270,10 +236,9 @@ def convert_conv1d_to_linear(model, convert_type): for name, module in c_model.named_modules(): if isinstance(module, convert_type): old_module = recursive_getattr(c_model, name) - new_module = torch.nn.Linear( - old_module.weight.data.size(0), - old_module.weight.data.size(1), - bias=True if old_module.bias is not None else False) + new_module = torch.nn.Linear(old_module.weight.data.size(0), + old_module.weight.data.size(1), + bias=True if old_module.bias is not None else False) new_module.weight.data = old_module.weight.data.t().contiguous() if new_module.bias is not None: new_module.bias.data = old_module.bias.data.view(-1) diff --git a/deepspeed/compression/scheduler.py b/deepspeed/compression/scheduler.py index 67955a825251c468c361be7dce4e9fd61d02245e..582ecd8f6f5e719378b91842feda92b74de3526d 100644 --- a/deepspeed/compression/scheduler.py +++ b/deepspeed/compression/scheduler.py @@ -1,4 +1,7 @@ -'''Copyright The Microsoft DeepSpeed Team''' +# Copyright (c) Microsoft Corporation. 
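# --- Editor's illustrative sketch (not part of the patch) -------------------
# A self-contained re-statement of the weight transpose that
# convert_conv1d_to_linear performs above: HF GPT-2 style Conv1D stores its
# weight as [in_features, out_features] and computes x @ W, while
# torch.nn.Linear stores [out_features, in_features], so the copied weight
# must be transposed for the two modules to be equivalent.
import torch

in_features, out_features = 4, 3
conv1d_weight = torch.randn(in_features, out_features)     # Conv1D layout
linear = torch.nn.Linear(in_features, out_features, bias=False)
linear.weight.data = conv1d_weight.t().contiguous()        # Linear layout [out, in]

x = torch.randn(2, in_features)
# Conv1D would compute x @ W; the converted Linear reproduces it exactly.
assert torch.allclose(x @ conv1d_weight, linear(x), atol=1e-6)
# ---------------------------------------------------------------------------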
+# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team from .compress import get_module_name from .constants import * @@ -10,6 +13,7 @@ class compression_scheduler(): ''' Used to schedule different compression methods ''' + def __init__(self, model, compression_config): self.model = model self.compression_config = compression_config @@ -38,22 +42,22 @@ class compression_scheduler(): } exist_module_name = set() shared_parameters = method_content[SHARED_PARAMETERS] - self.different_compression_methods[method][ - TECHNIQUE_ENABLED] = shared_parameters[TECHNIQUE_ENABLED] - self.different_compression_methods[method][ - SHARED_PARAMETERS] = shared_parameters + self.different_compression_methods[method][TECHNIQUE_ENABLED] = shared_parameters[TECHNIQUE_ENABLED] + self.different_compression_methods[method][SHARED_PARAMETERS] = shared_parameters for group_name, method_parameters in method_content[DIFFERENT_GROUPS].items(): module_name_list = [] for key_word in method_parameters[DIFFERENT_GROUPS_MODULE_SCOPE]: - module_name, exist_module_name = get_module_name(group_name, self.model, key_word, exist_module_name, verbose=False) + module_name, exist_module_name = get_module_name(group_name, + self.model, + key_word, + exist_module_name, + verbose=False) module_name_list.extend(module_name) if module_name_list: - self.different_compression_methods[method][DIFFERENT_GROUPS].append([ - group_name, - module_name_list, - method_parameters.copy().pop('params') - ]) + self.different_compression_methods[method][DIFFERENT_GROUPS].append( + [group_name, module_name_list, + method_parameters.copy().pop('params')]) def check_weight_quantization(self): # check weight quantization @@ -69,8 +73,7 @@ class compression_scheduler(): module.weight_quantization_enabled = True if not self.verbose[WEIGHT_QUANTIZATION]: - logger.info( - f'Weight quantization is enabled at step {self.training_steps}') + logger.info(f'Weight quantization is enabled at step {self.training_steps}') self.weight_quantization_enabled = True self.verbose[WEIGHT_QUANTIZATION] = True @@ -87,9 +90,7 @@ class compression_scheduler(): module = recursive_getattr(self.model, module_name) module.activation_quantization_enabled = True if not self.verbose[ACTIVATION_QUANTIZATION]: - logger.info( - f'Activation quantization is enabled at step {self.training_steps}' - ) + logger.info(f'Activation quantization is enabled at step {self.training_steps}') self.verbose[ACTIVATION_QUANTIZATION] = True def check_sparse_pruning(self): @@ -105,8 +106,7 @@ class compression_scheduler(): module = recursive_getattr(self.model, module_name) module.sparse_pruning_enabled = True if not self.verbose[SPARSE_PRUNING]: - logger.info( - f'Sparse pruning is enabled at step {self.training_steps}') + logger.info(f'Sparse pruning is enabled at step {self.training_steps}') self.verbose[SPARSE_PRUNING] = True def check_head_pruning(self): @@ -154,8 +154,7 @@ class compression_scheduler(): module = recursive_getattr(self.model, module_name) module.channel_pruning_enabled = True if not self.verbose[CHANNEL_PRUNING]: - logger.info( - f'Channel pruning is enabled at step {self.training_steps}') + logger.info(f'Channel pruning is enabled at step {self.training_steps}') self.verbose[CHANNEL_PRUNING] = True def check_all_modules(self): diff --git a/deepspeed/compression/utils.py b/deepspeed/compression/utils.py index 2492c8ce9d861aa8beb2ed3cac267e4dcf8b50c3..3534f994cd788379d090e61f3ba9a08eaec66c37 100644 --- a/deepspeed/compression/utils.py +++ b/deepspeed/compression/utils.py @@ -1,4 
+1,7 @@ -'''Copyright The Microsoft DeepSpeed Team''' +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team import torch from torch import autograd @@ -13,6 +16,7 @@ class TopKBinarizer(autograd.Function): Implementation is inspired from: https://github.com/yaozhewei/MLPruning """ + @staticmethod def forward(ctx, inputs: torch.tensor, threshold: float, sigmoid: bool): """ @@ -59,6 +63,7 @@ class SymQuantizer(torch.autograd.Function): """ Symmetric quantization """ + @staticmethod def forward(ctx, input, num_bits, min_value=None, max_value=None, num_groups=1): """ @@ -75,9 +80,8 @@ class SymQuantizer(torch.autograd.Function): quantized_input (`torch.FloatTensor`) Quantized input """ - assert (min_value is None - and max_value is None) or (min_value is not None - and max_value is not None and num_groups == 1) + assert (min_value is None and max_value is None) or (min_value is not None and max_value is not None + and num_groups == 1) q_range = 2**num_bits input_shape = input.shape if min_value is None: @@ -101,6 +105,7 @@ class AsymQuantizer(torch.autograd.Function): """ Asymmetric quantization """ + @staticmethod def forward(ctx, input, num_bits, min_value=None, max_value=None, num_groups=1): """ @@ -118,9 +123,8 @@ class AsymQuantizer(torch.autograd.Function): Quantized input """ - assert (min_value is None - and max_value is None) or (min_value is not None - and max_value is not None and num_groups == 1) + assert (min_value is None and max_value is None) or (min_value is not None and max_value is not None + and num_groups == 1) q_range = 2**num_bits input_shape = input.shape if min_value is None: @@ -131,9 +135,7 @@ class AsymQuantizer(torch.autograd.Function): scale = (max_value - min_value) / q_range zero_point = (min_value / scale).round() * scale - output = ( - (input - zero_point) / scale).round().clamp(0, - q_range - 1) * scale + zero_point + output = ((input - zero_point) / scale).round().clamp(0, q_range - 1) * scale + zero_point output = output.reshape(input_shape).contiguous() return output @@ -147,6 +149,7 @@ class TernaryQuantizer(torch.autograd.Function): """ Ternary quantization """ + @staticmethod def forward(ctx, input, num_bits, min_value=None, max_value=None, num_groups=1): """ @@ -187,6 +190,7 @@ class BinaryQuantizer(torch.autograd.Function): """ Binary quantization """ + @staticmethod def forward(ctx, input, num_bits, min_value=None, max_value=None, num_groups=1): """ diff --git a/deepspeed/constants.py b/deepspeed/constants.py index 9e7f511e443ffeb9279869e12b834139417f386f..7ebc8f9983a593de95c5b37056df73debe106df7 100644 --- a/deepspeed/constants.py +++ b/deepspeed/constants.py @@ -1,6 +1,8 @@ -''' -Copyright 2020 The Microsoft DeepSpeed Team -''' +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + from datetime import timedelta ############################################# diff --git a/deepspeed/elasticity/__init__.py b/deepspeed/elasticity/__init__.py index be95e3a3ee1935a13063efe4fcf128451f44775b..66bda96fa6bafa90c8a68f4033b665220880bcf6 100644 --- a/deepspeed/elasticity/__init__.py +++ b/deepspeed/elasticity/__init__.py @@ -1,4 +1,7 @@ -'''Copyright The Microsoft DeepSpeed Team''' +# Copyright (c) Microsoft Corporation. 
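# --- Editor's illustrative sketch (not part of the patch) -------------------
# A standalone re-statement of the asymmetric fake-quantization math used by
# AsymQuantizer.forward above, for a single group with min/max taken from the
# tensor itself, so the scale / zero-point / round / clamp steps are easy to
# follow in isolation.
import torch

def asym_fake_quant(x: torch.Tensor, num_bits: int) -> torch.Tensor:
    q_range = 2 ** num_bits
    min_value, max_value = x.min(), x.max()
    scale = (max_value - min_value) / q_range
    zero_point = (min_value / scale).round() * scale
    q = ((x - zero_point) / scale).round().clamp(0, q_range - 1)
    return q * scale + zero_point          # dequantized ("fake quant") values

x = torch.randn(16)
print(asym_fake_quant(x, num_bits=8))      # same shape as x, snapped to 2**8 levels
# ---------------------------------------------------------------------------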
+# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team from .elasticity import compute_elastic_config, elasticity_enabled, ensure_immutable_elastic_config from .utils import is_torch_elastic_compatible diff --git a/deepspeed/elasticity/config.py b/deepspeed/elasticity/config.py index ffbce7028e03204f5d42fee92d3ffcf915b33fa5..9c574d3537c8add37f71c8cd744ba0d3151bf3b1 100644 --- a/deepspeed/elasticity/config.py +++ b/deepspeed/elasticity/config.py @@ -1,6 +1,7 @@ -""" -Copyright 2020 The Microsoft DeepSpeed Team -""" +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team import json from .constants import * @@ -43,77 +44,64 @@ class ElasticityConfig: "version": 0.1 } """ + def __init__(self, param_dict): self.enabled = param_dict.get(ENABLED, ENABLED_DEFAULT) if self.enabled: if MAX_ACCEPTABLE_BATCH_SIZE in param_dict: self.max_acceptable_batch_size = param_dict[MAX_ACCEPTABLE_BATCH_SIZE] else: - raise ElasticityConfigError( - f"Elasticity config missing {MAX_ACCEPTABLE_BATCH_SIZE}") + raise ElasticityConfigError(f"Elasticity config missing {MAX_ACCEPTABLE_BATCH_SIZE}") if MICRO_BATCHES in param_dict: self.micro_batches = param_dict[MICRO_BATCHES] else: raise ElasticityConfigError(f"Elasticity config missing {MICRO_BATCHES}") else: - self.max_acceptable_batch_size = param_dict.get( - MAX_ACCEPTABLE_BATCH_SIZE, - MAX_ACCEPTABLE_BATCH_SIZE_DEFAULT) + self.max_acceptable_batch_size = param_dict.get(MAX_ACCEPTABLE_BATCH_SIZE, + MAX_ACCEPTABLE_BATCH_SIZE_DEFAULT) self.micro_batches = param_dict.get(MICRO_BATCHES, MICRO_BATCHES_DEFAULT) if not isinstance(self.micro_batches, list): raise ElasticityConfigError( f"Elasticity expected value of {MICRO_BATCHES} to be a " - f"list of micro batches, instead is: {type(self.micro_batches)}, containing: {self.micro_batches}" - ) + f"list of micro batches, instead is: {type(self.micro_batches)}, containing: {self.micro_batches}") if not all(map(lambda m: isinstance(m, int), self.micro_batches)): - raise ElasticityConfigError( - f"Elasticity expected {MICRO_BATCHES} to only contain a list of integers, " - f"instead contains: f{self.micro_batches}") + raise ElasticityConfigError(f"Elasticity expected {MICRO_BATCHES} to only contain a list of integers, " + f"instead contains: f{self.micro_batches}") if not all(map(lambda m: m > 0, self.micro_batches)): - raise ElasticityConfigError( - f"Elasticity expected {MICRO_BATCHES} to only contain positive integers, " - f"instead contains: f{self.micro_batches}") + raise ElasticityConfigError(f"Elasticity expected {MICRO_BATCHES} to only contain positive integers, " + f"instead contains: f{self.micro_batches}") self.min_gpus = param_dict.get(MIN_GPUS, MIN_GPUS_DEFAULT) self.max_gpus = param_dict.get(MAX_GPUS, MAX_GPUS_DEFAULT) if self.min_gpus < 1 or self.max_gpus < 1: - raise ElasticityConfigError( - "Elasticity min/max gpus must be > 0, " - f"given min_gpus: {self.min_gpus}, max_gpus: {self.max_gpus}") + raise ElasticityConfigError("Elasticity min/max gpus must be > 0, " + f"given min_gpus: {self.min_gpus}, max_gpus: {self.max_gpus}") if self.max_gpus < self.min_gpus: - raise ElasticityConfigError( - "Elasticity min_gpus cannot be greater than max_gpus, " - f"given min_gpus: {self.min_gpus}, max_gpus: {self.max_gpus}") + raise ElasticityConfigError("Elasticity min_gpus cannot be greater than max_gpus, " + f"given min_gpus: {self.min_gpus}, max_gpus: {self.max_gpus}") - self.model_parallel_size = param_dict.get(MODEL_PARLLEL_SIZE, - MODEL_PARLLEL_SIZE_DEFAULT) + 
self.model_parallel_size = param_dict.get(MODEL_PARLLEL_SIZE, MODEL_PARLLEL_SIZE_DEFAULT) if self.model_parallel_size < 1: - raise ElasticityConfigError( - "Model-Parallel size cannot be less than 1, " - f"given model-parallel size: {self.model_parallel_size}") + raise ElasticityConfigError("Model-Parallel size cannot be less than 1, " + f"given model-parallel size: {self.model_parallel_size}") - self.num_gpus_per_node = param_dict.get(NUM_GPUS_PER_NODE, - NUM_GPUS_PER_NODE_DEFAULT) + self.num_gpus_per_node = param_dict.get(NUM_GPUS_PER_NODE, NUM_GPUS_PER_NODE_DEFAULT) if self.num_gpus_per_node < 1: - raise ElasticityConfigError( - "Number of GPUs per node cannot be less than 1, " - f"given number of GPUs per node: {self.num_gpus_per_node}") + raise ElasticityConfigError("Number of GPUs per node cannot be less than 1, " + f"given number of GPUs per node: {self.num_gpus_per_node}") self.min_time = param_dict.get(MIN_TIME, MIN_TIME_DEFAULT) if self.min_time < 0: - raise ElasticityConfigError( - f"Elasticity min time needs to be >= 0: given {self.min_time}") + raise ElasticityConfigError(f"Elasticity min time needs to be >= 0: given {self.min_time}") self.version = param_dict.get(VERSION, VERSION_DEFAULT) - self.prefer_larger_batch_size = param_dict.get(PREFER_LARGER_BATCH, - PREFER_LARGER_BATCH_DEFAULT) - self.ignore_non_elastic_batch_info = param_dict.get( - IGNORE_NON_ELASTIC_BATCH_INFO, - IGNORE_NON_ELASTIC_BATCH_INFO_DEFAULT) + self.prefer_larger_batch_size = param_dict.get(PREFER_LARGER_BATCH, PREFER_LARGER_BATCH_DEFAULT) + self.ignore_non_elastic_batch_info = param_dict.get(IGNORE_NON_ELASTIC_BATCH_INFO, + IGNORE_NON_ELASTIC_BATCH_INFO_DEFAULT) def repr(self): return self.__dict__ diff --git a/deepspeed/elasticity/constants.py b/deepspeed/elasticity/constants.py index eb40edc84711b1e64cfdebf367b960c7a8c84664..85a38c749b2a919ed40c25c31e0986a4dd502349 100644 --- a/deepspeed/elasticity/constants.py +++ b/deepspeed/elasticity/constants.py @@ -1,6 +1,7 @@ -""" -Copyright 2020 The Microsoft DeepSpeed Team -""" +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team ######################################### # Elasticity diff --git a/deepspeed/elasticity/elastic_agent.py b/deepspeed/elasticity/elastic_agent.py index db0a61b4fa7ae9766fc52e10f2291883f5164597..91a217cb99712230745c7706402034d62322b82b 100644 --- a/deepspeed/elasticity/elastic_agent.py +++ b/deepspeed/elasticity/elastic_agent.py @@ -1,4 +1,7 @@ -'''Copyright The Microsoft DeepSpeed Team''' +# Copyright (c) Microsoft Corporation. 
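# --- Editor's illustrative sketch (not part of the patch) -------------------
# The kind of dict ElasticityConfig above consumes, plus one of the validation
# errors it raises. Key names follow the constants referenced in the code
# (max_train_batch_size, micro_batch_sizes, ...) and should be double-checked
# against deepspeed/elasticity/constants.py.
from deepspeed.elasticity.config import ElasticityConfig, ElasticityConfigError

elastic_dict = {
    "enabled": True,
    "max_train_batch_size": 2000,    # MAX_ACCEPTABLE_BATCH_SIZE, required when enabled
    "micro_batch_sizes": [2, 4, 6],  # MICRO_BATCHES, must be a list of positive ints
    "min_gpus": 1,
    "max_gpus": 64,
    "min_time": 20,
    "version": 0.1,
}
cfg = ElasticityConfig(elastic_dict)

try:
    ElasticityConfig({**elastic_dict, "min_gpus": 0})   # min/max gpus must be > 0
except ElasticityConfigError as err:
    print(err)
# ---------------------------------------------------------------------------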
+# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team from torch.distributed.elastic.agent.server.local_elastic_agent import LocalElasticAgent from typing import Any, Dict, Optional, Tuple @@ -23,6 +26,7 @@ import subprocess class DSElasticAgent(LocalElasticAgent): + def __init__( self, spec: WorkerSpec, @@ -35,9 +39,7 @@ class DSElasticAgent(LocalElasticAgent): self.ds_env = env @staticmethod - def _set_master_addr_port(store: Store, - master_addr: Optional[str], - master_port: Optional[int]): + def _set_master_addr_port(store: Store, master_addr: Optional[str], master_port: Optional[int]): if master_port is None: sock = _get_socket_with_port() with closing(sock): @@ -82,8 +84,7 @@ class DSElasticAgent(LocalElasticAgent): "TORCHELASTIC_MAX_RESTARTS": str(spec.max_restarts), "TORCHELASTIC_RUN_ID": spec.rdzv_handler.get_run_id(), "TORCHELASTIC_USE_AGENT_STORE": str(use_agent_store), - "NCCL_ASYNC_ERROR_HANDLING": os.getenv("NCCL_ASYNC_ERROR_HANDLING", - str(1)), + "NCCL_ASYNC_ERROR_HANDLING": os.getenv("NCCL_ASYNC_ERROR_HANDLING", str(1)), } worker_env_ds.update(worker_env_elastic) if "OMP_NUM_THREADS" in os.environ: @@ -120,8 +121,7 @@ class DSElasticAgent(LocalElasticAgent): spec = self._worker_group.spec role = spec.role - log.info( - f"[{role}] starting workers for entrypoint: {spec.get_entrypoint_name()}") + log.info(f"[{role}] starting workers for entrypoint: {spec.get_entrypoint_name()}") self._initialize_workers(self._worker_group) monitor_interval = spec.monitor_interval @@ -136,13 +136,10 @@ class DSElasticAgent(LocalElasticAgent): state = run_result.state self._worker_group.state = state - expire_time = datetime.utcnow() - ( - rdzv_handler._settings.keep_alive_interval * - rdzv_handler._settings.keep_alive_max_attempt) + expire_time = datetime.utcnow() - (rdzv_handler._settings.keep_alive_interval * + rdzv_handler._settings.keep_alive_max_attempt) _dead_nodes = [ - node for node, - last_heartbeat in - rdzv_handler._state_holder.state.last_heartbeats.items() + node for node, last_heartbeat in rdzv_handler._state_holder.state.last_heartbeats.items() if last_heartbeat < expire_time ] @@ -150,21 +147,16 @@ class DSElasticAgent(LocalElasticAgent): put_metric(f"workers.{role}.{state.name.lower()}", 1) if state == WorkerState.SUCCEEDED: - log.info( - f"[{role}] worker group successfully finished." - f" Waiting {self._exit_barrier_timeout} seconds for other agents to finish." - ) + log.info(f"[{role}] worker group successfully finished." + f" Waiting {self._exit_barrier_timeout} seconds for other agents to finish.") self._exit_barrier() return run_result - elif state in { - WorkerState.UNHEALTHY, - WorkerState.FAILED - } or len(participants) > len(rdzv_handler._state_holder.state.participants): + elif state in {WorkerState.UNHEALTHY, WorkerState.FAILED + } or len(participants) > len(rdzv_handler._state_holder.state.participants): if self._remaining_restarts > 0: - log.info( - f"[{role}] Worker group {state.name}. " - f"{self._remaining_restarts}/{spec.max_restarts} attempts left;" - f" will restart worker group") + log.info(f"[{role}] Worker group {state.name}. 
" + f"{self._remaining_restarts}/{spec.max_restarts} attempts left;" + f" will restart worker group") self._remaining_restarts -= 1 # rdzv_handler._state_holder.state.restart = False self._restart_workers(self._worker_group) diff --git a/deepspeed/elasticity/elasticity.py b/deepspeed/elasticity/elasticity.py index 17a8b6ecf39466869a8254d5c19ba49ea39bd741..730f3acdc6a5bdb633a1fd76dd8af9b0b13c88d9 100644 --- a/deepspeed/elasticity/elasticity.py +++ b/deepspeed/elasticity/elasticity.py @@ -1,6 +1,8 @@ -""" -Copyright 2020 The Microsoft DeepSpeed Team -""" +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + import os import json import numpy as np @@ -17,44 +19,8 @@ from ..utils import logger # Thirty eight smallest highly composite numbers. The list should # be enough to support up to 720K batch size. HCN_LIST = [ - 1, - 2, - 4, - 6, - 12, - 24, - 36, - 48, - 60, - 120, - 180, - 240, - 360, - 720, - 840, - 1260, - 1680, - 2520, - 5040, - 7560, - 10080, - 15120, - 20160, - 25200, - 27720, - 45360, - 50400, - 55440, - 83160, - 110880, - 166320, - 221760, - 277200, - 332640, - 498960, - 554400, - 665280, - 720720 + 1, 2, 4, 6, 12, 24, 36, 48, 60, 120, 180, 240, 360, 720, 840, 1260, 1680, 2520, 5040, 7560, 10080, 15120, 20160, + 25200, 27720, 45360, 50400, 55440, 83160, 110880, 166320, 221760, 277200, 332640, 498960, 554400, 665280, 720720 ] @@ -94,11 +60,7 @@ def get_valid_gpus(batch_size, micro_batches, min_valid_gpus, max_valid_gpus): return valid_gpus -def get_best_candidates(candidate_batch_sizes, - micro_batches, - min_gpus, - max_gpus, - prefer_larger): +def get_best_candidates(candidate_batch_sizes, micro_batches, min_gpus, max_gpus, prefer_larger): max_valid_gpus = 0 valid_gpus = None @@ -106,15 +68,11 @@ def get_best_candidates(candidate_batch_sizes, for batch_size in candidate_batch_sizes: - current_valid_gpus = get_valid_gpus(batch_size, - micro_batches, - min_gpus, - max_gpus) + current_valid_gpus = get_valid_gpus(batch_size, micro_batches, min_gpus, max_gpus) - if (len(current_valid_gpus) > max_valid_gpus - or (len(current_valid_gpus) == max_valid_gpus and - ((prefer_larger and batch_size > final_batch_size) or - (not prefer_larger and batch_size < final_batch_size)))): + if (len(current_valid_gpus) > max_valid_gpus or (len(current_valid_gpus) == max_valid_gpus and + ((prefer_larger and batch_size > final_batch_size) or + (not prefer_larger and batch_size < final_batch_size)))): max_valid_gpus = len(current_valid_gpus) valid_gpus = current_valid_gpus final_batch_size = batch_size @@ -157,15 +115,10 @@ def _get_compatible_gpus_v01(micro_batches, base_list.extend(micro_batches) base_list.append(lcm) - candidate_batch_sizes = get_candidate_batch_sizes(base_list, - max_acceptable_batch_size) + candidate_batch_sizes = get_candidate_batch_sizes(base_list, max_acceptable_batch_size) - final_batch_size, valid_gpus = get_best_candidates( - candidate_batch_sizes, - micro_batches, - min_gpus, - max_gpus, - prefer_larger) + final_batch_size, valid_gpus = get_best_candidates(candidate_batch_sizes, micro_batches, min_gpus, max_gpus, + prefer_larger) return final_batch_size, valid_gpus @@ -203,11 +156,12 @@ def _get_compatible_gpus_v02(micro_batches, dp_size_per_node = num_gpus_per_node // model_parallel_size - final_batch_size, valid_world_size = _get_compatible_gpus_v01(micro_batches, - int(max_acceptable_batch_size/dp_size_per_node), - int(min_gpus/num_gpus_per_node), - int(max_gpus/num_gpus_per_node), # Passing number of max nodes as Elasticity 
v2 works at node level - prefer_larger=prefer_larger) + final_batch_size, valid_world_size = _get_compatible_gpus_v01( + micro_batches, + int(max_acceptable_batch_size / dp_size_per_node), + int(min_gpus / num_gpus_per_node), + int(max_gpus / num_gpus_per_node), # Passing number of max nodes as Elasticity v2 works at node level + prefer_larger=prefer_larger) final_batch_size = int(final_batch_size) * dp_size_per_node valid_dp_world_size = [i * dp_size_per_node for i in valid_world_size] @@ -256,38 +210,27 @@ def ensure_immutable_elastic_config(runtime_elastic_config_dict: dict): Ensure the resource scheduler saw the same elastic config we are using at runtime """ if DEEPSPEED_ELASTICITY_CONFIG in os.environ: - scheduler_elastic_config_dict = json.loads( - os.environ[DEEPSPEED_ELASTICITY_CONFIG]) + scheduler_elastic_config_dict = json.loads(os.environ[DEEPSPEED_ELASTICITY_CONFIG]) scheduler_elastic_config = ElasticityConfig(scheduler_elastic_config_dict) runtime_elastic_config = ElasticityConfig(runtime_elastic_config_dict) err_str = "Elastic config '{}={}' seen by resource scheduler does not match config passed to runtime {}={}" if runtime_elastic_config.max_acceptable_batch_size != scheduler_elastic_config.max_acceptable_batch_size: raise ElasticityConfigError( - err_str.format('max_acceptable_batch_size', - scheduler_elastic_config.max_acceptable_batch_size, - 'max_acceptable_batch_size', - runtime_elastic_config.max_acceptable_batch_size)) + err_str.format('max_acceptable_batch_size', scheduler_elastic_config.max_acceptable_batch_size, + 'max_acceptable_batch_size', runtime_elastic_config.max_acceptable_batch_size)) if runtime_elastic_config.micro_batches != scheduler_elastic_config.micro_batches: raise ElasticityConfigError( - err_str.format('micro_batches', - scheduler_elastic_config.micro_batches, - 'micro_batches', + err_str.format('micro_batches', scheduler_elastic_config.micro_batches, 'micro_batches', runtime_elastic_config.micro_batches)) if runtime_elastic_config.version != scheduler_elastic_config.version: raise ElasticityConfigError( - err_str.format('version', - scheduler_elastic_config.version, - 'version', - runtime_elastic_config.version)) + err_str.format('version', scheduler_elastic_config.version, 'version', runtime_elastic_config.version)) else: logger.warning("Unable to find DEEPSPEED_ELASTICITY_CONFIG environment variable, cannot " \ "guarantee resource scheduler will scale this job using compatible GPU counts.") -def compute_elastic_config(ds_config: dict, - target_deepspeed_version: str, - world_size=0, - return_microbatch=False): +def compute_elastic_config(ds_config: dict, target_deepspeed_version: str, world_size=0, return_microbatch=False): """Core deepspeed elasticity API. Given an elastic config (similar to the example below) DeepSpeed will compute a total train batch size corresponding valid GPU count list that provides a high level of elasticity. 
Elasticity in this case means we are safe to scale @@ -397,8 +340,7 @@ def compute_elastic_config(ds_config: dict, # ensure batch size is int dtype final_batch_size = int(final_batch_size) else: - raise NotImplementedError( - f"Unable to find elastic logic for version: {elastic_config.version}") + raise NotImplementedError(f"Unable to find elastic logic for version: {elastic_config.version}") logger.info(f"Valid World Size (GPUs / Model Parallel Size): {valid_gpus}") diff --git a/deepspeed/elasticity/utils.py b/deepspeed/elasticity/utils.py index 3fba6949be032775581cfa00203d98681c4c623a..43f2cdb2918ce5c104addb25cbb23cadf44767e2 100644 --- a/deepspeed/elasticity/utils.py +++ b/deepspeed/elasticity/utils.py @@ -1,4 +1,7 @@ -'''Copyright The Microsoft DeepSpeed Team''' +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team import torch diff --git a/deepspeed/env_report.py b/deepspeed/env_report.py index e8be40a0535d576434b1cacf13991f2b201c199a..a42abe0ac874e3d6148d70566859ddd5511ed01b 100644 --- a/deepspeed/env_report.py +++ b/deepspeed/env_report.py @@ -1,4 +1,7 @@ -'''Copyright The Microsoft DeepSpeed Team''' +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team import torch import deepspeed @@ -48,8 +51,7 @@ def op_report(verbose=True): dots = "." * (max_dots - len(op_name)) is_compatible = OKAY if builder.is_compatible(verbose) else no is_installed = installed if installed_ops[op_name] else no - dots2 = '.' * ((len(h[1]) + (max_dots2 - len(h[1]))) - - (len(is_installed) - color_len)) + dots2 = '.' * ((len(h[1]) + (max_dots2 - len(h[1]))) - (len(is_installed) - color_len)) print(op_name, dots, is_installed, dots2, is_compatible) print("-" * (max_dots + max_dots2 + len(h[0]) + len(h[1]))) @@ -68,9 +70,7 @@ def nvcc_version(): if cuda_home is None: return f"{RED} [FAIL] cannot find CUDA_HOME via torch.utils.cpp_extension.CUDA_HOME={torch.utils.cpp_extension.CUDA_HOME} {END}" try: - output = subprocess.check_output([cuda_home + "/bin/nvcc", - "-V"], - universal_newlines=True) + output = subprocess.check_output([cuda_home + "/bin/nvcc", "-V"], universal_newlines=True) except FileNotFoundError: return f"{RED} [FAIL] nvcc missing {END}" output_split = output.split() @@ -82,32 +82,18 @@ def nvcc_version(): def debug_report(): max_dots = 33 - report = [ - ("torch install path", - torch.__path__), - ("torch version", - torch.__version__), - ("deepspeed install path", - deepspeed.__path__), - ("deepspeed info", - f"{deepspeed.__version__}, {deepspeed.__git_hash__}, {deepspeed.__git_branch__}" - ) - ] + report = [("torch install path", torch.__path__), ("torch version", torch.__version__), + ("deepspeed install path", deepspeed.__path__), + ("deepspeed info", f"{deepspeed.__version__}, {deepspeed.__git_hash__}, {deepspeed.__git_branch__}")] if get_accelerator().device_name() == 'cuda': hip_version = getattr(torch.version, "hip", None) - report.extend([("torch cuda version", - torch.version.cuda), - ("torch hip version", - hip_version), - ("nvcc version", - (None if hip_version else nvcc_version())), - ("deepspeed wheel compiled w.", - f"torch {torch_info['version']}, " + - (f"hip {torch_info['hip_version']}" - if hip_version else f"cuda {torch_info['cuda_version']}"))]) + report.extend([("torch cuda version", torch.version.cuda), ("torch hip version", hip_version), + ("nvcc version", (None if hip_version else nvcc_version())), + ("deepspeed wheel compiled w.", f"torch {torch_info['version']}, " + + (f"hip 
{torch_info['hip_version']}" if hip_version else f"cuda {torch_info['cuda_version']}")) + ]) else: - report.extend([("deepspeed wheel compiled w.", - f"torch {torch_info['version']} ")]) + report.extend([("deepspeed wheel compiled w.", f"torch {torch_info['version']} ")]) print("DeepSpeed general environment info:") for name, value in report: @@ -116,15 +102,10 @@ def debug_report(): def parse_arguments(): parser = argparse.ArgumentParser() - parser.add_argument( - '--hide_operator_status', - action='store_true', - help= - 'Suppress display of installation and compatibility statuses of DeepSpeed operators. ' - ) - parser.add_argument('--hide_errors_and_warnings', + parser.add_argument('--hide_operator_status', action='store_true', - help='Suppress warning and error messages.') + help='Suppress display of installation and compatibility statuses of DeepSpeed operators. ') + parser.add_argument('--hide_errors_and_warnings', action='store_true', help='Suppress warning and error messages.') args = parser.parse_args() return args @@ -137,8 +118,7 @@ def main(hide_operator_status=False, hide_errors_and_warnings=False): def cli_main(): args = parse_arguments() - main(hide_operator_status=args.hide_operator_status, - hide_errors_and_warnings=args.hide_errors_and_warnings) + main(hide_operator_status=args.hide_operator_status, hide_errors_and_warnings=args.hide_errors_and_warnings) if __name__ == "__main__": diff --git a/deepspeed/git_version_info.py b/deepspeed/git_version_info.py index dfb8864fe6874d7c10d25a1f998cbbad9806d5d3..6ca0996bb592fe107cc8144a7353016192267f01 100644 --- a/deepspeed/git_version_info.py +++ b/deepspeed/git_version_info.py @@ -1,4 +1,7 @@ -'''Copyright The Microsoft DeepSpeed Team''' +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team try: # This is populated by setup.py diff --git a/deepspeed/inference/__init__.py b/deepspeed/inference/__init__.py index 449e6651e5d414a936fdeeef5055d14809a06dcd..0fc748f4e167c8be8c82619c3d78bfc339ccd76f 100644 --- a/deepspeed/inference/__init__.py +++ b/deepspeed/inference/__init__.py @@ -1,3 +1,6 @@ -'''Copyright The Microsoft DeepSpeed Team''' +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team from .engine import InferenceEngine diff --git a/deepspeed/inference/config.py b/deepspeed/inference/config.py index 61298db3fbd470c625fe7eb04ba8a5e7a6e4323e..70a67c062ad264ca398bb45875ca51bb13141e77 100644 --- a/deepspeed/inference/config.py +++ b/deepspeed/inference/config.py @@ -1,4 +1,7 @@ -'''Copyright The Microsoft DeepSpeed Team''' +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team import torch from deepspeed.runtime.config_utils import DeepSpeedConfigModel @@ -12,10 +15,12 @@ from enum import Enum class DtypeEnum(Enum): # The torch dtype must always be the first value (so we return torch.dtype) fp16 = torch.float16, "torch.float16", "fp16", "float16", "half" - bf16 = torch.bfloat16, "torch.bfloat16", "bf16", "bfloat16" fp32 = torch.float32, "torch.float32", "fp32", "float32", "float" int8 = torch.int8, "torch.int8", "int8" + # bf16 not supported + # bf16 = torch.bfloat16, "torch.bfloat16", "bf16", "bfloat16" + # Copied from https://stackoverflow.com/a/43210118 # Allows us to use multiple values for each Enum index and returns first # listed value when Enum is called @@ -192,6 +197,11 @@ class DeepSpeedInferenceConfig(DeepSpeedConfigModel): This can be passed through the json config too. 
""" + set_empty_params: bool = False + """ + specifying whether the inference-module is created with empty or real Tensor + """ + save_mp_checkpoint_path: str = None """ The path for which we want to save the loaded model with a checkpoint. This @@ -222,9 +232,7 @@ class DeepSpeedInferenceConfig(DeepSpeedConfigModel): replace_method: str = Field( "auto", deprecated=True, - deprecated_msg= - "This parameter is no longer needed, please remove from your call to DeepSpeed-inference" - ) + deprecated_msg="This parameter is no longer needed, please remove from your call to DeepSpeed-inference") injection_policy: Dict = Field(None, alias="injection_dict") """ @@ -235,9 +243,7 @@ class DeepSpeedInferenceConfig(DeepSpeedConfigModel): injection_policy_tuple: tuple = None """ TODO: Add docs """ - config: Dict = Field( - None, - alias="args") # todo: really no need for this field if we can refactor + config: Dict = Field(None, alias="args") # todo: really no need for this field if we can refactor max_out_tokens: int = Field(1024, alias="max_tokens") """ @@ -246,6 +252,16 @@ class DeepSpeedInferenceConfig(DeepSpeedConfigModel): to the required token-length required for your use-case. """ + min_out_tokens: int = Field(1, alias="min_tokens") + """ + This argument communicates to the runtime the minimum number of tokens you + expect you will need to generate. This will cause the runtime to error + if it unable to provide this and provide context on the memory pressure + rather than seg-faulting or providing corrupted output. + """ + + transposed_mode: bool = Field(False, alias="transposed_mode") + mp_size: int = Field(1, deprecated=True, new_param="tensor_parallel.tp_size") """ Desired model parallel size, default is 1 meaning no model parallelism. @@ -254,18 +270,10 @@ class DeepSpeedInferenceConfig(DeepSpeedConfigModel): """ mpu: object = Field(None, deprecated=True, new_param="tensor_parallel.mpu") ep_size: int = Field(1, deprecated=True, new_param="moe.ep_size") - ep_group: object = Field(None, - alias="expert_group", - deprecated=True, - new_param="moe.ep_group") - ep_mp_group: object = Field(None, - alias="expert_mp_group", - deprecated=True, - new_param="moe.ep_mp_group") + ep_group: object = Field(None, alias="expert_group", deprecated=True, new_param="moe.ep_group") + ep_mp_group: object = Field(None, alias="expert_mp_group", deprecated=True, new_param="moe.ep_mp_group") moe_experts: list = Field([1], deprecated=True, new_param="moe.moe_experts") - moe_type: MoETypeEnum = Field(MoETypeEnum.standard, - deprecated=True, - new_param="moe.type") + moe_type: MoETypeEnum = Field(MoETypeEnum.standard, deprecated=True, new_param="moe.type") @validator("moe") def moe_backward_compat(cls, field_value, values): diff --git a/deepspeed/inference/engine.py b/deepspeed/inference/engine.py index 123a028ce3a966e48353ed05ceae084c574724ce..de7ca5a711976d3f9718524c81c71234c28edbbe 100755 --- a/deepspeed/inference/engine.py +++ b/deepspeed/inference/engine.py @@ -1,6 +1,8 @@ -''' -Copyright 2021 The Microsoft DeepSpeed Team -''' +# Copyright (c) Microsoft Corporation. 
+# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + import torch import time import os @@ -32,6 +34,58 @@ from torch import nn INFERENCE_MODEL_TIMER = "model-forward-inference" +def build_bloom_alibi_tensor(attention_mask: torch.Tensor, num_heads: int, dtype: torch.dtype) -> torch.Tensor: + """ + Link to paper: https://arxiv.org/abs/2108.12409 Alibi tensor is not causal as the original paper mentions, it + relies on a translation invariance of softmax for quick implementation: with l being a tensor, and a fixed value + `softmax(l+a) = softmax(l)`. Based on + https://github.com/ofirpress/attention_with_linear_biases/blob/a35aaca144e0eb6b789dfcb46784c4b8e31b7983/fairseq/models/transformer.py#L742 + TODO @thomasw21 this doesn't work as nicely due to the masking strategy, and so masking varies slightly. + + Args: + Returns tensor shaped (batch_size * num_heads, 1, max_seq_len) + attention_mask (`torch.Tensor`): + Token-wise attention mask, this should be of shape (batch_size, max_seq_len). + num_heads (`int`, *required*): + number of heads + dtype (`torch.dtype`, *optional*, default=`torch.bfloat16`): + dtype of the output tensor + """ + import math + batch_size, seq_length = attention_mask.shape + closest_power_of_2 = 2**math.floor(math.log2(num_heads)) + base = torch.tensor(2**(-(2**-(math.log2(closest_power_of_2) - 3))), + device=attention_mask.device, + dtype=torch.float32) + powers = torch.arange(1, 1 + closest_power_of_2, device=attention_mask.device, dtype=torch.int32) + slopes = torch.pow(base, powers) + + if closest_power_of_2 != num_heads: + extra_base = torch.tensor(2**(-(2**-(math.log2(2 * closest_power_of_2) - 3))), + device=attention_mask.device, + dtype=torch.float32) + num_remaining_heads = min(closest_power_of_2, num_heads - closest_power_of_2) + extra_powers = torch.arange(1, 1 + 2 * num_remaining_heads, 2, device=attention_mask.device, dtype=torch.int32) + slopes = torch.cat([slopes, torch.pow(extra_base, extra_powers)], dim=0) + + # Note: alibi will added to the attention bias that will be applied to the query, key product of attention + # => therefore alibi will have to be of shape (batch_size, num_heads, query_length, key_length) + # => here we set (batch_size=1, num_heads=num_heads, query_length=1, key_length=max_length) + # => the query_length dimension will then be broadcasted correctly + # This is more or less identical to T5's relative position bias: + # https://github.com/huggingface/transformers/blob/f681437203baa7671de3174b0fa583c349d9d5e1/src/transformers/models/t5/modeling_t5.py#L527 + arange_tensor = ((attention_mask.cumsum(dim=-1) - 1) * attention_mask)[:, None, :] + alibi = slopes[..., None] * arange_tensor + if dist.is_initialized(): + num_heads_per_rank = int(num_heads / dist.get_world_size()) + offset = dist.get_rank() * num_heads_per_rank + alibi = alibi.view(batch_size, num_heads, 1, seq_length) + alibi = alibi[:, offset:num_heads_per_rank + offset, :, :] + return alibi.reshape(batch_size * num_heads_per_rank, 1, seq_length).to(dtype) + else: + return alibi.reshape(batch_size * num_heads, 1, seq_length).to(dtype) + + class InferenceEngine(Module): inference_mp_group = None inference_ep_group = None @@ -80,13 +134,18 @@ class InferenceEngine(Module): self.checkpoint_engine = TorchCheckpointEngine() quantization_setting = None self._init_quantization_setting( - quantization_setting - ) # todo: update with the new quant config for weight quant + quantization_setting) # todo: update with the new quant config for weight quant 
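# --- Editor's illustrative sketch (not part of the patch) -------------------
# A worked example of the per-head slope computation used by
# build_bloom_alibi_tensor above, for num_heads = 8 (a power of two, so the
# "extra slopes" branch is skipped).
import math
import torch

num_heads = 8
closest_power_of_2 = 2 ** math.floor(math.log2(num_heads))                 # 8
base = torch.tensor(2 ** (-(2 ** -(math.log2(closest_power_of_2) - 3))))   # 2**-1 = 0.5
powers = torch.arange(1, 1 + closest_power_of_2)                           # 1, 2, ..., 8
slopes = torch.pow(base, powers)
print(slopes)   # tensor([0.5000, 0.2500, ..., 0.0039]) i.e. 1/2, 1/4, ..., 1/256
# Each slope then multiplies the cumulative-position tensor derived from the
# attention mask, and under tensor parallelism each rank keeps only its own
# num_heads / world_size slice of the head dimension.
# ---------------------------------------------------------------------------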
self.model_profile_enabled = False self._model_times = [] - # This is a hack to remove the prepare_mask function on HF side for BLOOM architecture - self.remove_mask_prepare_for_bloom() + if not self.injection_dict and config.replace_with_kernel_inject: + # This is a hack to remove the prepare_mask function on HF side for BLOOM architecture + self.remove_mask_prepare_for_bloom() + + if self.injection_dict or not config.replace_with_kernel_inject: + # This is a hack to redefine the alibi func due to TP + if config.tensor_parallel.tp_size > 1: + self.build_alibi_tensor() if get_accelerator().device_name() == 'cuda' and config.enable_cuda_graph: assert pkg_version.parse(torch.__version__) >= pkg_version.parse("1.10"), \ @@ -100,8 +159,7 @@ class InferenceEngine(Module): self._convert_to_dtype(config) if self.mpu: - config.tensor_parallel.tp_size = dist.get_world_size( - group=self.mpu.get_model_parallel_group()) + config.tensor_parallel.tp_size = dist.get_world_size(group=self.mpu.get_model_parallel_group()) self.mp_group = self.mpu.get_model_parallel_group() elif config.tensor_parallel.tp_size > 1: self._create_model_parallel_group(config) @@ -149,8 +207,7 @@ class InferenceEngine(Module): self.module.to(device) if config.tensor_parallel.tp_size > 1: - _rng_state = get_accelerator().get_rng_state().to( - get_accelerator().current_device_name()) + _rng_state = get_accelerator().get_rng_state().to(get_accelerator().current_device_name()) dist.broadcast(_rng_state, 0) get_accelerator().set_rng_state(_rng_state.cpu()) @@ -172,15 +229,18 @@ class InferenceEngine(Module): # todo: remove this once all the config dicts are centralized from top level pydantic config def _get_model_config_generate(self, config): # this is being passed to replace_transformer_layer(config=self.user_model_config_dict) - self.config = getattr(self.module, - 'config', - None) if config.config is None else config.config + self.config = getattr(self.module, 'config', None) if config.config is None else config.config def remove_mask_prepare_for_bloom(self): if hasattr(self.module, 'transformer'): if hasattr(self.module.transformer, '_prepare_attn_mask'): self.module.transformer._prepare_attn_mask = lambda attention_mask, *args, **kwargs: attention_mask + def build_alibi_tensor(self): + if hasattr(self.module, 'transformer'): + if hasattr(self.module.transformer, 'build_alibi_tensor'): + self.module.transformer.build_alibi_tensor = build_bloom_alibi_tensor + def _pre_forward_hook(self, module, *inputs, **kwargs): if self.use_cuda_events: self.timers(INFERENCE_MODEL_TIMER).start() @@ -223,8 +283,7 @@ class InferenceEngine(Module): num_ep_groups = dist.get_world_size() // moe_ep_size for i in range(num_ep_groups): ep_cnt = i * moe_ep_size - size = dist.get_world_size( - ) if moe_ep_size > dist.get_world_size() else moe_ep_size + size = dist.get_world_size() if moe_ep_size > dist.get_world_size() else moe_ep_size ranks = list(range(ep_cnt, ep_cnt + size)) _ep_group = dist.new_group(ranks) if dist.get_rank() in ranks: @@ -234,9 +293,7 @@ class InferenceEngine(Module): num_expert_mp_groups = dist.get_world_size() // num_ep_groups expert_mp_size = dist.get_world_size() // moe_ep_size for i in range(num_expert_mp_groups): - expert_mp_comm_ranks = [ - i + nr * moe_ep_size for nr in range(expert_mp_size) - ] + expert_mp_comm_ranks = [i + nr * moe_ep_size for nr in range(expert_mp_size)] _expert_mp_group = dist.new_group(expert_mp_comm_ranks) if dist.get_rank() in expert_mp_comm_ranks: self.expert_mp_group.update({moe_ep_size: 
_expert_mp_group}) @@ -253,65 +310,48 @@ class InferenceEngine(Module): log_dist( f"quantize_bits = {self.quantize_bits} " f"mlp_extra_grouping = {self.mlp_extra_grouping}, " - f"quantize_groups = {self.quantize_groups}", - [0]) + f"quantize_groups = {self.quantize_groups}", [0]) # TODO: remove this function and add this functionality to pydantic config checking def _validate_args(self, mpu, replace_with_kernel_inject): # TODO: to support SD pipeline we need to avoid this check for now if replace_with_kernel_inject and not isinstance(self.module, Module): raise ValueError(f"model must be a torch.nn.Module, got {type(self.module)}") - if not isinstance(self._config.tensor_parallel.tp_size, - int) or self._config.tensor_parallel.tp_size < 1: - raise ValueError( - f"mp_size must be an int >= 1, got {self._config.tensor_parallel.tp_size}" - ) + if not isinstance(self._config.tensor_parallel.tp_size, int) or self._config.tensor_parallel.tp_size < 1: + raise ValueError(f"mp_size must be an int >= 1, got {self._config.tensor_parallel.tp_size}") if mpu: methods = ["get_model_parallel_group", "get_data_parallel_group"] for method in methods: if not hasattr(mpu, method): raise ValueError(f"mpu is missing {method}") - if self._config.checkpoint is not None and not isinstance( - self._config.checkpoint, - (str, - dict)): - raise ValueError( - f"checkpoint must be None, str or dict, got {type(self._config.checkpoint)}" - ) + if self._config.checkpoint is not None and not isinstance(self._config.checkpoint, (str, dict)): + raise ValueError(f"checkpoint must be None, str or dict, got {type(self._config.checkpoint)}") supported_dtypes = [None, torch.half, torch.int8, torch.float] if self._config.dtype not in supported_dtypes: - raise ValueError( - f"{self._config.dtype} not supported, valid dtype: {supported_dtypes}") + raise ValueError(f"{self._config.dtype} not supported, valid dtype: {supported_dtypes}") if self.injection_dict is not None and not isinstance(self.injection_dict, dict): - raise ValueError( - f"injection_dict must be None or a dict, got: {self.injection_dict}") + raise ValueError(f"injection_dict must be None or a dict, got: {self.injection_dict}") def load_model_with_checkpoint(self, r_module): self.mp_replace = ReplaceWithTensorSlicing( - mp_group=self.mp_group, - mp_size=self._config.tensor_parallel.tp_size) #, out_dim=0, in_dim=1) + mp_group=self.mp_group, mp_size=self._config.tensor_parallel.tp_size) #, out_dim=0, in_dim=1) error_msgs = [] def load(module, state_dict, prefix): args = (state_dict, prefix, {}, True, [], [], error_msgs) if hasattr(module, 'weight'): if 'query_key_value' in prefix: - module.weight = self.mp_replace.qkv_copy( - module.weight.data, - state_dict[prefix + 'weight']) + module.weight = self.mp_replace.qkv_copy(module.weight.data, state_dict[prefix + 'weight']) else: - module.weight = self.mp_replace.copy(module.weight.data, - state_dict[prefix + 'weight']) + module.weight = self.mp_replace.copy(module.weight.data, state_dict[prefix + 'weight']) else: - module.norm.weight = self.mp_replace.copy(module.norm.weight.data, - state_dict[prefix + 'weight']) + module.norm.weight = self.mp_replace.copy(module.norm.weight.data, state_dict[prefix + 'weight']) if prefix + 'bias' in self.key_list: if hasattr(module, 'norm'): - module.norm.bias = self.mp_replace.copy(module.norm.bias, - state_dict[prefix + 'bias']) + module.norm.bias = self.mp_replace.copy(module.norm.bias, state_dict[prefix + 'bias']) else: data = state_dict[prefix + 'bias'] data = 
data.to(get_accelerator().current_device_name()) @@ -331,45 +371,32 @@ class InferenceEngine(Module): checking_key = prefix + name + '.' if not any(checking_key in item for item in self.key_list): continue - if len(list(child.parameters())) > 0 and list( - child.parameters())[0].numel() == 0: + if len(list(child.parameters())) > 0 and list(child.parameters())[0].numel() == 0: if len(child.weight.ds_shape) == 1: - child = Normalize(dim=child.weight.ds_shape[-1], - dtype=child.weight.dtype, - eps=child.eps) + child = Normalize(dim=child.weight.ds_shape[-1], dtype=child.weight.dtype, eps=child.eps) setattr(module, name, child) load(child, self.sd, prefix + name + '.') else: - load_module_recursive(child, - prefix if level == 0 else prefix + name + '.', - level + 1) + load_module_recursive(child, prefix if level == 0 else prefix + name + '.', level + 1) load_module_recursive(r_module) def _apply_injection_policy(self, config, client_module=None): # client_module is only passed when using the injection_dict method. checkpoint_dir = config.checkpoint - checkpoint = SDLoaderFactory.get_sd_loader_json( - checkpoint_dir, - self.checkpoint_engine) if checkpoint_dir is not None else None + checkpoint = SDLoaderFactory.get_sd_loader_json(checkpoint_dir, + self.checkpoint_engine) if checkpoint_dir is not None else None generic_injection(self.module, - fp16=(config.dtype == torch.half) - or (config.dtype == torch.int8), + fp16=(config.dtype == torch.half) or (config.dtype == torch.int8), enable_cuda_graph=config.enable_cuda_graph) if isinstance(self.module, torch.nn.Module): # config is our DeepSpeedInferenceConfig and self.config is the HF model config - replace_transformer_layer(client_module, - self.module, - checkpoint, - config, - self.config) + replace_transformer_layer(client_module, self.module, checkpoint, config, self.config) def _get_all_ckpt_names(self, checkpoints_path, tag): - ckpt_file_pattern = self._get_ckpt_name(checkpoints_path, - tag, - mp_placeholder="*") + ckpt_file_pattern = self._get_ckpt_name(checkpoints_path, tag, mp_placeholder="*") import glob ckpt_files = glob.glob(ckpt_file_pattern) @@ -392,8 +419,7 @@ class InferenceEngine(Module): def _load_checkpoint(self, load_dir, load_module_strict=True, tag=None): is_pipe_parallel = isinstance(self.module, PipelineModule) if is_pipe_parallel: - raise RuntimeError( - 'pipeline parallelism is currently not supported in inference.') + raise RuntimeError('pipeline parallelism is currently not supported in inference.') if not isinstance(load_dir, dict) and os.path.isdir(load_dir): if tag is None: latest_path = os.path.join(load_dir, "latest") @@ -404,8 +430,7 @@ class InferenceEngine(Module): ckpt_list = self._get_all_ckpt_names(load_dir, tag) sd_loader = SDLoaderFactory.get_sd_loader(ckpt_list, self.checkpoint_engine) else: - sd_loader = SDLoaderFactory.get_sd_loader_json(load_dir, - self.checkpoint_engine) + sd_loader = SDLoaderFactory.get_sd_loader_json(load_dir, self.checkpoint_engine) if type(sd_loader) is list: self.sd = torch.load(sd_loader[0], map_location='cpu') @@ -416,19 +441,18 @@ class InferenceEngine(Module): for i in range(1, len(sd_loader)): if not dist.is_initialized() or dist.get_rank() == 0: print(f"loading checkpoint ({i})") - self.sd = torch.load(sd_loader[i], - map_location=get_accelerator().device_name()) + self.sd = torch.load(sd_loader[i], map_location=get_accelerator().device_name()) self.key_list = list(self.sd.keys()) self.load_model_with_checkpoint(self.module) else: mp_rank = 0 if self.mpu is None else 
self.mpu.get_model_parallel_rank() load_path, checkpoint, quantize_config = sd_loader.load(self._config.tensor_parallel.tp_size, - mp_rank, - is_pipe_parallel=is_pipe_parallel, - quantize=(self._config.dtype is torch.int8), - quantize_groups=self.quantize_groups, - mlp_extra_grouping=self.mlp_extra_grouping) + mp_rank, + is_pipe_parallel=is_pipe_parallel, + quantize=(self._config.dtype is torch.int8), + quantize_groups=self.quantize_groups, + mlp_extra_grouping=self.mlp_extra_grouping) self.quantization_scales, self.quantize_merge_count = quantize_config @@ -438,21 +462,20 @@ class InferenceEngine(Module): old_moe_load = False if not isinstance(checkpoint['num_experts'], list): old_moe_load = True - DeepSpeedEngine.load_moe_state_dict( - load_dir, - tag, - state_dict=checkpoint[self._choose_module_key(checkpoint)], - old_moe_load=old_moe_load, - model=self.module, - mpu=self.mpu, - checkpoint_engine=self.checkpoint_engine) - - self.module.load_state_dict( - state_dict=checkpoint[self._choose_module_key(checkpoint)], - strict=load_module_strict) + DeepSpeedEngine.load_moe_state_dict(load_dir, + tag, + state_dict=checkpoint[self._choose_module_key(checkpoint)], + old_moe_load=old_moe_load, + model=self.module, + mpu=self.mpu, + checkpoint_engine=self.checkpoint_engine) + + self.module.load_state_dict(state_dict=checkpoint[self._choose_module_key(checkpoint)], + strict=load_module_strict) def _choose_module_key(self, sd): - assert not ('module' in sd and 'model' in sd), "checkpoint has both 'model' and 'module' keys, not sure how to proceed" + assert not ('module' in sd + and 'model' in sd), "checkpoint has both 'model' and 'module' keys, not sure how to proceed" assert 'module' in sd or 'model' in sd, "checkpoint contains neither 'model' or 'module' keys, not sure how to proceed" if 'module' in sd: return 'module' @@ -465,10 +488,8 @@ class InferenceEngine(Module): if False: #config.dtype is torch.int8 and self.quantization_scales is None: quantizer = WeightQuantization(mlp_extra_grouping=self.mlp_extra_grouping) - model, self.quantization_scales = quantizer.model_quantize(self.module, - self.injection_dict, - self.quantize_bits, - self.quantize_groups) + model, self.quantization_scales = quantizer.model_quantize(self.module, self.injection_dict, + self.quantize_bits, self.quantize_groups) elif config.dtype == torch.half: self.module.half() elif config.dtype == torch.bfloat16: @@ -509,11 +530,10 @@ class InferenceEngine(Module): assert self.model_profile_enabled, "model profiling is not enabled" model_times = self._model_times if self._config.enable_cuda_graph and len(self._model_times) == 0: - raise ValueError( - "Model times are empty and cuda graph is enabled. If " - "this is a GPT-style model this combo is not supported. If this is a " - "BERT-style model this is a bug, please report it. " - f"Model type is: {type(self.module)}") + raise ValueError("Model times are empty and cuda graph is enabled. If " + "this is a GPT-style model this combo is not supported. If this is a " + "BERT-style model this is a bug, please report it. 
" + f"Model type is: {type(self.module)}") self._model_times = [] return model_times @@ -532,8 +552,7 @@ class InferenceEngine(Module): for name in module.__dict__.keys(): sub_module = getattr(module, name) - if self._module_match(sub_module) and hasattr(sub_module, - "enable_cuda_graph"): + if self._module_match(sub_module) and hasattr(sub_module, "enable_cuda_graph"): sub_module_cuda_graph = True return sub_module_cuda_graph @@ -546,13 +565,11 @@ class InferenceEngine(Module): **kwargs: variable length keyword arguments """ start = None - if self.model_profile_enabled and get_accelerator().device_name( - ) == 'cuda' and self._config.enable_cuda_graph: + if self.model_profile_enabled and get_accelerator().device_name() == 'cuda' and self._config.enable_cuda_graph: get_accelerator().synchronize() start = time.time() - if get_accelerator().device_name( - ) == 'cuda' and self._config.enable_cuda_graph and not self.local_cuda_graph: + if get_accelerator().device_name() == 'cuda' and self._config.enable_cuda_graph and not self.local_cuda_graph: if self.cuda_graph_created: outputs = self._graph_replay(*inputs, **kwargs) else: @@ -580,9 +597,7 @@ class InferenceEngine(Module): num_beams = kwargs["num_beams"] if num_beams > 1: - raise NotImplementedError( - "DeepSpeed does not support `num_beams` > 1, if this is important to you please " - "add your request to: https://github.com/microsoft/DeepSpeed/issues/2506" - ) + raise NotImplementedError("DeepSpeed does not support `num_beams` > 1, if this is important to you please " + "add your request to: https://github.com/microsoft/DeepSpeed/issues/2506") return self.module.generate(*inputs, **kwargs) diff --git a/deepspeed/launcher/__init__.py b/deepspeed/launcher/__init__.py index fcb45ab2b68516814a4bfbffebf2e01cbfefd527..6c5067f71c8faf166bc78e88f9b62e8627dda7c7 100644 --- a/deepspeed/launcher/__init__.py +++ b/deepspeed/launcher/__init__.py @@ -1 +1,5 @@ +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team '''Copyright The Microsoft DeepSpeed Team''' diff --git a/deepspeed/launcher/constants.py b/deepspeed/launcher/constants.py index 09dd1bdd1def7e7866104edd21d0d4e9ae19fd8b..838dd66fcb30b6cb8bf12dab39a67f859fa32bb0 100644 --- a/deepspeed/launcher/constants.py +++ b/deepspeed/launcher/constants.py @@ -1,4 +1,7 @@ -# Copyright 2020 The Microsoft DeepSpeed Team +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team PDSH_LAUNCHER = 'pdsh' PDSH_MAX_FAN_OUT = 1024 diff --git a/deepspeed/launcher/launch.py b/deepspeed/launcher/launch.py index d9e6fda504b4b5cb360997aa526b2381d890156c..d84fe9586b967f9d0b3660782f315f549c2ced23 100755 --- a/deepspeed/launcher/launch.py +++ b/deepspeed/launcher/launch.py @@ -1,4 +1,7 @@ -# Copyright 2020 The Microsoft DeepSpeed Team +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team """ DeepSpeed launcher, this is similar to torch's distributed.launch but supports additional features such as arbitrary gpu exclusion. 
@@ -16,6 +19,7 @@ import base64 import time import signal import psutil +import distutils from collections import defaultdict from typing import Dict from argparse import ArgumentParser, REMAINDER @@ -52,10 +56,7 @@ def parse_args(): help="Master node (rank 0)'s free port that needs to " "be used for communication during distributed " "training") - parser.add_argument("--world_info", - default="None", - type=str, - help="world info base64 encoded dictionary") + parser.add_argument("--world_info", default="None", type=str, help="world info base64 encoded dictionary") parser.add_argument("--module", action="store_true", @@ -68,19 +69,11 @@ def parse_args(): help="Skip prepending the training script with " "'python' - just execute it directly.") - parser.add_argument("--enable_elastic_training", - action="store_true", - help="Enable elastic training support.") + parser.add_argument("--enable_elastic_training", action="store_true", help="Enable elastic training support.") - parser.add_argument("--min_elastic_nodes", - type=int, - default=-1, - help="Min number of nodes in elastic training.") + parser.add_argument("--min_elastic_nodes", type=int, default=-1, help="Min number of nodes in elastic training.") - parser.add_argument("--max_elastic_nodes", - type=int, - default=-1, - help="Max number of nodes in elastic training.") + parser.add_argument("--max_elastic_nodes", type=int, default=-1, help="Max number of nodes in elastic training.") parser.add_argument("--no_local_rank", action="store_true", @@ -92,11 +85,22 @@ def parse_args(): default=0, help="main launching process pid, for internal pid tracking") - parser.add_argument( - "--enable_each_rank_log", - default="None", - type=str, - help="redirect the stdout and stderr from each rank into different log files") + parser.add_argument("--enable_each_rank_log", + default="None", + type=str, + help="redirect the stdout and stderr from each rank into different log files") + + parser.add_argument("--bind_cores_to_rank", + action="store_true", + help="Bind each rank to different cores of the host. " + "This improves host efficiency especially for CPU backend") + + parser.add_argument("--bind_core_list", + type=str, + default=None, + help="List of cores to bind to with comma separated list of " + "numbers and range. i.e. 1,3-5,7 => [1,3,4,5,7]. When not " + "specified, all cores on system would be used rank binding") # positional parser.add_argument("training_script", @@ -126,6 +130,89 @@ def terminate_process_tree(pid): p.kill() +def parse_range(rng): + try: + value = int(rng) + return range(value, value + 1) + except ValueError: + # value is not a single number + parts = rng.split('-') + if len(parts) != 2: + raise ValueError("Bad range: '%s', range must be either a number or two number separated by dash" % + (rng, )) + start = int(parts[0]) + end = int(parts[1]) + if start > end: + raise ValueError("Bad range: '%s', range end must larger than or equal to start" % (rng, )) + return range(start, end + 1) + + +# parse comma and dash separated range list into list +# i.e. "0,2-4,6" --> [0, 2, 3, 4, 6] +# rules: +# 1. Range list numser be comma sepeaated, each item are either a single number, +# or a range marked by two numbers (both number are included in the range) +# 2. Sub ranges must be in ascend order and not overlap with each other +# 3. 
No space in the range expression +def parse_range_list(range_str): + number_list = [] + last = -1 + range_list = range_str.split(',') + for sub_range in range_list: + sub_number_list = parse_range(sub_range) + if sub_number_list[0] <= last: + raise ValueError( + "Bad range: '%s', sub ranges must not overlap with each other and should be in ascend order" % + (range_str, )) + last = sub_number_list[-1] + number_list.extend(sub_number_list) + return number_list + + +# return a list of list for cores to numa mapping +# [ +# [ cores for numa 0 ] +# [ cores belong to numa 1 ] +# ... +# ] +def get_numa_cores(): + ret = [] + output = subprocess.check_output(['numactl', '--hardware']).decode("utf-8") + lines = output.split('\n') + for line in lines: + if line.startswith('available:'): + num_numas = int(line.split(' ')[1]) + break + for numa in range(num_numas): + for line in lines: + if line.startswith(f'node {numa} cpus:'): + cores = line.split(' ')[3:] + ret.append([int(core) for core in cores]) + return ret + + +def check_for_numactl_pkg(): + libs = dict( + dpkg=["-l", "numactl", "apt"], + pacman=["-Q", "numactl", "pacman"], + rpm=["-q", "numactl", "yum"], + ) + + found = False + for pkgmgr, data in libs.items(): + flag, lib, tool = data + path = distutils.spawn.find_executable(pkgmgr) + if path is not None: + cmd = f"{pkgmgr} {flag} {lib}" + result = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=True) + if result.wait() == 0: + found = True + else: + print(f"please install the {lib} package with {tool}") + break + return found + + def main(): args = parse_args() current_env = os.environ.copy() @@ -145,9 +232,7 @@ def main(): local_node = node_list[args.node_rank] local_gpu_ids = world_info[local_node] num_local_procs = len(local_gpu_ids) - logger.info( - f"nnodes={args.nnodes}, num_local_procs={num_local_procs}, node_rank={args.node_rank}" - ) + logger.info(f"nnodes={args.nnodes}, num_local_procs={num_local_procs}, node_rank={args.node_rank}") global_rank_mapping = defaultdict(list) curr_global_rank = 0 @@ -193,8 +278,7 @@ def main(): lines = file.readlines() lines = [line.rstrip() for line in lines] for line in lines: - if line.startswith('export FC_TASKROLE_NAME') or line.startswith( - 'export FC_TASK_INDEX'): + if line.startswith('export FC_TASKROLE_NAME') or line.startswith('export FC_TASK_INDEX'): key_val = line.split()[1] key, val = key_val.split('=') current_env[key] = val @@ -206,17 +290,13 @@ def main(): if args.enable_each_rank_log != "None": # prepare the log path and the file name prefix if os.path.isfile(args.enable_each_rank_log): - raise ValueError( - f"{args.enable_each_rank_log} should not be a file, it should be a directory." - ) + raise ValueError(f"{args.enable_each_rank_log} should not be a file, it should be a directory.") if not os.path.exists(args.enable_each_rank_log): try: os.makedirs(args.enable_each_rank_log) except Exception as e: print(e) - raise ValueError( - f"unable to create directory {args.enable_each_rank_log} for each rank log." 
- ) + raise ValueError(f"unable to create directory {args.enable_each_rank_log} for each rank log.") log_name_prefix = time.strftime("%Y%m%d%H%M%S", time.localtime()) for local_rank in range(0, num_local_procs): @@ -227,8 +307,43 @@ def main(): # spawn the processes cmd = [] + if args.bind_cores_to_rank: + check_for_numactl_pkg() + if 'KMP_AFFINITY' in os.environ.keys(): + raise ValueError("Environment variable KMP_AFFINITY conflicts with numactl " + "because it interfere with how many CPU cores numactl can set. " + "Unset KMP_AFFINITY before launching deepspeed.\n\n" + "\t$ unset KMP_AFFINITY\n" + "\t$ deepspeed ") + if args.bind_core_list != None: + core_list = parse_range_list(args.bind_core_list) + total_cores = len(core_list) + else: + total_cores = psutil.cpu_count(logical=False) + core_list = range(total_cores) + cores_per_rank = total_cores // num_local_procs + assert cores_per_rank >= 1, "At least one core needs to be assigned to each rank" + core_list_for_rank = core_list[cores_per_rank * local_rank:cores_per_rank * (local_rank + 1)] + current_env["OMP_NUM_THREADS"] = f"{cores_per_rank}" + cmd.append("numactl") + + # check if all cores belong to same numa, if true, bind process to that numa domain with -m parameter + numa_cores = get_numa_cores() + num_numas = len(numa_cores) + for i in range(num_numas): + if set(core_list_for_rank) <= set(numa_cores[i]): + cmd.append("-m") + cmd.append(f"{i}") + break + + cmd.append("-C") + core_list_str = f"{core_list_for_rank[0]}" + for core_id in core_list_for_rank[1:]: + core_list_str = f"{core_list_str},{core_id}" + cmd.append(f"{core_list_str}") if not args.no_python: - cmd = [sys.executable, "-u"] + cmd.append(sys.executable) + cmd.append("-u") if args.module: cmd.append("-m") else: @@ -242,13 +357,9 @@ def main(): cmd += args.training_script_args if args.enable_each_rank_log != "None": - log_file = os.path.join(args.enable_each_rank_log, - f"{log_name_prefix}_rank{dist_rank}.log") + log_file = os.path.join(args.enable_each_rank_log, f"{log_name_prefix}_rank{dist_rank}.log") log_fd = open(log_file, 'w') - process = subprocess.Popen(cmd, - env=current_env, - stdout=log_fd, - stderr=log_fd) + process = subprocess.Popen(cmd, env=current_env, stdout=log_fd, stderr=log_fd) else: process = subprocess.Popen(cmd, env=current_env) @@ -264,7 +375,7 @@ def main(): args.min_elastic_nodes = 1 if args.max_elastic_nodes == -1: args.max_elastic_nodes = args.nnodes - assert args.max_elastic_nodes > 0 and args.min_elastic_nodes > 0 , "Max and Min nodes should be positive" + assert args.max_elastic_nodes > 0 and args.min_elastic_nodes > 0, "Max and Min nodes should be positive" current_env["NCCL_ASYNC_ERROR_HANDLING"] = str(1) @@ -287,8 +398,7 @@ def main(): # Creating config for rendezvous class rdzv_parameters = RendezvousParameters(backend='c10d', - endpoint=args.master_addr + ":" + - str(args.master_port), + endpoint=args.master_addr + ":" + str(args.master_port), run_id=run_id, min_nodes=args.min_elastic_nodes, max_nodes=args.max_elastic_nodes, diff --git a/deepspeed/launcher/multinode_runner.py b/deepspeed/launcher/multinode_runner.py index 6027d1076e80ac5e9916008153a756857a94523b..86b58ea6f9b36e0afef3ab28f740cb12a0af76df 100644 --- a/deepspeed/launcher/multinode_runner.py +++ b/deepspeed/launcher/multinode_runner.py @@ -1,4 +1,7 @@ -'''Copyright The Microsoft DeepSpeed Team''' +# Copyright (c) Microsoft Corporation. 
+# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team import os import sys @@ -13,6 +16,7 @@ from .constants import PDSH_MAX_FAN_OUT, MVAPICH_TMP_HOSTFILE class MultiNodeRunner(ABC): + def __init__(self, args, world_info_base64): self.args = args self.validate_args() @@ -45,6 +49,7 @@ class MultiNodeRunner(ABC): class PDSHRunner(MultiNodeRunner): + def __init__(self, args, world_info_base64): super().__init__(args, world_info_base64) @@ -56,9 +61,7 @@ class PDSHRunner(MultiNodeRunner): return "pdsh" def parse_user_args(self): - return list( - map(lambda x: x if x.startswith("-") else f"'{x}'", - self.args.user_args)) + return list(map(lambda x: x if x.startswith("-") else f"'{x}'", self.args.user_args)) def get_cmd(self, environment, active_resources): environment['PDSH_RCMD_TYPE'] = 'ssh' @@ -68,14 +71,8 @@ class PDSHRunner(MultiNodeRunner): # PDSH flags for max node fan out and specific hosts to launch on # See https://linux.die.net/man/1/pdsh for flag details - pdsh_cmd_args = [ - 'pdsh', - '-S', - '-f', - str(PDSH_MAX_FAN_OUT), - '-w', - active_workers - ] + split(self.args.launcher_args) + pdsh_cmd_args = ['pdsh', '-S', '-f', str(PDSH_MAX_FAN_OUT), '-w', active_workers] + split( + self.args.launcher_args) exports = "" for key, val in self.exports.items(): @@ -84,15 +81,8 @@ class PDSHRunner(MultiNodeRunner): # https://linux.die.net/man/1/pdsh # %n will be replaced by pdsh command deepspeed_launch = [ - exports, - f"cd {os.path.abspath('.')};", - sys.executable, - "-u", - "-m", - "deepspeed.launcher.launch", - f'--world_info={self.world_info_base64}', - "--node_rank=%n", - f"--master_addr={self.args.master_addr}", + exports, f"cd {os.path.abspath('.')};", sys.executable, "-u", "-m", "deepspeed.launcher.launch", + f'--world_info={self.world_info_base64}', "--node_rank=%n", f"--master_addr={self.args.master_addr}", f"--master_port={self.args.master_port}" ] if self.args.no_python: @@ -111,11 +101,11 @@ class PDSHRunner(MultiNodeRunner): cmd_to_search = [i + "\\" for i in deepspeed_launch[2:6]] kill_command = pdsh_cmd_args + ["pkill -f ", " ".join(cmd_to_search)[:-2]] - return pdsh_cmd_args + deepspeed_launch + [self.user_script - ] + self.user_arguments, kill_command + return pdsh_cmd_args + deepspeed_launch + [self.user_script] + self.user_arguments, kill_command class OpenMPIRunner(MultiNodeRunner): + def __init__(self, args, world_info_base64, resource_pool): super().__init__(args, world_info_base64) self.resource_pool = resource_pool @@ -133,11 +123,9 @@ class OpenMPIRunner(MultiNodeRunner): super().validate_args() #TODO: Allow for include/exclude at node-level but not gpu-level if self.args.include != "" or self.args.exclude != "": - raise ValueError( - f"{self.name} backend does not support worker include/exclusion") + raise ValueError(f"{self.name} backend does not support worker include/exclusion") if self.args.num_nodes != -1 or self.args.num_gpus != -1: - raise ValueError( - f"{self.name} backend does not support limiting num nodes/gpus") + raise ValueError(f"{self.name} backend does not support limiting num nodes/gpus") def get_cmd(self, environment, active_resources): total_process_count = sum(self.resource_pool.values()) @@ -166,11 +154,11 @@ class OpenMPIRunner(MultiNodeRunner): if self.args.module: python_exec.append("-m") - return mpirun_cmd + export_cmd + python_exec + [self.user_script - ] + self.user_arguments + return mpirun_cmd + export_cmd + python_exec + [self.user_script] + self.user_arguments class MPICHRunner(MultiNodeRunner): + def __init__(self, args, 
world_info_base64, resource_pool): super().__init__(args, world_info_base64) self.resource_pool = resource_pool @@ -187,17 +175,22 @@ class MPICHRunner(MultiNodeRunner): super().validate_args() #TODO: Allow for include/exclude at node-level but not gpu-level if self.args.include != "" or self.args.exclude != "": - raise ValueError( - f"{self.name} backend does not support worker include/exclusion") + raise ValueError(f"{self.name} backend does not support worker include/exclusion") if self.args.num_nodes != -1 or self.args.num_gpus != -1: - raise ValueError( - f"{self.name} backend does not support limiting num nodes/gpus") + raise ValueError(f"{self.name} backend does not support limiting num nodes/gpus") def get_cmd(self, environment, active_resources): devices_per_node = self.resource_pool.values() total_process_count = sum(devices_per_node) process_per_node = list(devices_per_node)[0] + hosts = "" + + for i, host in enumerate(self.resource_pool.keys()): + if i == 0: + hosts = f"{host}" + else: + hosts += f",{host}" mpirun_cmd = [ 'mpirun', @@ -205,21 +198,24 @@ class MPICHRunner(MultiNodeRunner): f'{total_process_count}', '-ppn', f'{process_per_node}', + '-hosts', + f'{hosts}', ] + split(self.args.launcher_args) export_cmd = [] for k, v in self.exports.items(): - export_cmd += ['-x', "{}={}".format(k, v)] + export_cmd += ['-genv', "{}={}".format(k, v)] python_exec = [] if not self.args.no_python: python_exec = [sys.executable, "-u"] if self.args.module: python_exec.append("-m") - return mpirun_cmd + python_exec + [self.user_script] + self.user_arguments + return mpirun_cmd + export_cmd + python_exec + [self.user_script] + self.user_arguments class SlurmRunner(MultiNodeRunner): + def __init__(self, args, world_info_base64, resource_pool): super().__init__(args, world_info_base64) self.resource_pool = resource_pool @@ -232,7 +228,8 @@ class SlurmRunner(MultiNodeRunner): return 'slurm' def get_cmd(self, environment, active_resources): - assert not getattr(self.args, 'detect_nvlink_pairs', False), "slurm backend does not support remapping visible devices" + assert not getattr(self.args, 'detect_nvlink_pairs', + False), "slurm backend does not support remapping visible devices" total_process_count = sum(self.resource_pool.values()) srun_cmd = [ 'srun', @@ -261,12 +258,12 @@ class SlurmRunner(MultiNodeRunner): exports += f",{key}={val}" python_exec = [sys.executable, "-u"] - command = srun_cmd + [exports] + python_exec + [self.user_script - ] + self.user_arguments + command = srun_cmd + [exports] + python_exec + [self.user_script] + self.user_arguments return command class MVAPICHRunner(MultiNodeRunner): + def __init__(self, args, world_info_base64, resource_pool): super().__init__(args, world_info_base64) self.resource_pool = resource_pool @@ -303,9 +300,7 @@ class MVAPICHRunner(MultiNodeRunner): if "MVAPICH2-GDR" in mpiname_results: exists = True else: - warnings.warn( - f"Expected MVAPICH2-GDR as return for mpiname but received {mpiname_results}" - ) + warnings.warn(f"Expected MVAPICH2-GDR as return for mpiname but received {mpiname_results}") return exists @property @@ -316,11 +311,9 @@ class MVAPICHRunner(MultiNodeRunner): super().validate_args() #TODO: Allow for include/exclude at node-level but not gpu-level if self.args.include != "" or self.args.exclude != "": - raise ValueError( - f"{self.name} backend does not support worker include/exclusion") + raise ValueError(f"{self.name} backend does not support worker include/exclusion") if self.args.num_nodes != -1 or self.args.num_gpus != 
-1: - raise ValueError( - f"{self.name} backend does not support limiting num nodes/gpus") + raise ValueError(f"{self.name} backend does not support limiting num nodes/gpus") def get_cmd(self, environment, active_resources): devices_per_node = self.resource_pool.values() @@ -353,5 +346,4 @@ class MVAPICHRunner(MultiNodeRunner): if self.args.module: python_exec.append("-m") - return mpirun_cmd + export_cmd + python_exec + [self.user_script - ] + self.user_arguments + return mpirun_cmd + export_cmd + python_exec + [self.user_script] + self.user_arguments diff --git a/deepspeed/launcher/runner.py b/deepspeed/launcher/runner.py index 044f6ff03365628be3fa8f774d33462057fa9824..a28b4d1535161510e1d8d4faefcc4392da7f2186 100755 --- a/deepspeed/launcher/runner.py +++ b/deepspeed/launcher/runner.py @@ -1,4 +1,7 @@ -# Copyright 2020 The Microsoft DeepSpeed Team +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team """ DeepSpeed runner is the main front-end to launching multi-worker training jobs with DeepSpeed. By default this uses pdsh to parallel @@ -36,9 +39,9 @@ PDSH_MAX_FAN_OUT = 1024 def parse_args(args=None): - parser = argparse.ArgumentParser( - description="DeepSpeed runner to help launch distributed " - "multi-node/multi-gpu training jobs.") + parser = argparse.ArgumentParser(description="DeepSpeed runner to help launch distributed " + "multi-node/multi-gpu training jobs.", + formatter_class=argparse.ArgumentDefaultsHelpFormatter) parser.add_argument("-H", "--hostfile", @@ -109,12 +112,11 @@ def parse_args(args=None): help="(optional) IP address of node 0, will be " "inferred via 'hostname -I' if not specified.") - parser.add_argument( - "--launcher", - default=PDSH_LAUNCHER, - type=str, - help="(optional) choose launcher backend for multi-node " - "training. Options currently include PDSH, OpenMPI, MVAPICH, SLURM, MPICH.") + parser.add_argument("--launcher", + default=PDSH_LAUNCHER, + type=str, + help="(optional) choose launcher backend for multi-node " + "training. Options currently include PDSH, OpenMPI, MVAPICH, SLURM, MPICH.") parser.add_argument("--launcher_args", default="", @@ -147,37 +149,40 @@ def parse_args(args=None): help="Force multi-node launcher mode, helps in cases where user " "wants to launch on single remote node.") - parser.add_argument( - "--save_pid", - action="store_true", - help="Save file containing launcher process id (pid) at /tmp/.ds, " - "where is the pid of the first process that invoked `deepspeed`. " - "Useful when launching deepspeed processes programmatically.") - - parser.add_argument( - "--enable_each_rank_log", - default="None", - type=str, - help="redirect the stdout and stderr from each rank into different log files") - - parser.add_argument( - "--autotuning", - default="", - choices=["tune", - "run"], - type=str, - help="Run DeepSpeed autotuner to discover optimal configuration parameters " - "before running job.") + parser.add_argument("--save_pid", + action="store_true", + help="Save file containing launcher process id (pid) at /tmp/.ds, " + "where is the pid of the first process that invoked `deepspeed`. 
" + "Useful when launching deepspeed processes programmatically.") + + parser.add_argument("--enable_each_rank_log", + default="None", + type=str, + help="redirect the stdout and stderr from each rank into different log files") + + parser.add_argument("--autotuning", + default="", + choices=["tune", "run"], + type=str, + help="Run DeepSpeed autotuner to discover optimal configuration parameters " + "before running job.") parser.add_argument("--elastic_training", action="store_true", help="Enable elastic training support in DeepSpeed.") - parser.add_argument("user_script", - type=str, - help="User script to launch, followed by any required " + parser.add_argument("user_script", type=str, help="User script to launch, followed by any required " "arguments.") parser.add_argument('user_args', nargs=argparse.REMAINDER) + parser.add_argument("--bind_cores_to_rank", + action="store_true", + help="Bind each rank to different cores of the host") + parser.add_argument("--bind_core_list", + type=str, + default=None, + help="List of cores to bind to with comma separated list of " + "numbers and range. i.e. 1,3-5,7 => [1,3,4,5,7]. When not " + "specified, all cores on system would be used rank binding") return parser.parse_args(args=args) @@ -213,21 +218,15 @@ def _parse_hostfile(hostfile_lines): num_slots = int(match.group(2)) if host in resource_pool: logger.error(f"Bad hostfile text: {hostfile_lines}") - raise ValueError( - f"Hostfile contains multiple entries for {host}, unable to proceed with launching" - ) + raise ValueError(f"Hostfile contains multiple entries for {host}, unable to proceed with launching") resource_pool[host] = num_slots else: logger.error(f"Bad hostfile text: {hostfile_lines}") - raise ValueError( - "Hostfile contains a bad entry: {line}, unable to proceed with launching" - ) + raise ValueError("Hostfile contains a bad entry: {line}, unable to proceed with launching") if len(resource_pool) == 0: logger.error(f"Bad hostfile text: {hostfile_lines}") - raise ValueError( - "Hostfile is empty or not formatted correctly, unable to proceed with launching." - ) + raise ValueError("Hostfile is empty or not formatted correctly, unable to proceed with launching.") return resource_pool @@ -337,9 +336,7 @@ def parse_inclusion_exclusion(resource_pool, inclusion, exclusion): for hostname, slots in resource_pool.items(): active_resources[hostname] = list(range(slots)) - return parse_resource_filter(active_resources, - include_str=inclusion, - exclude_str=exclusion) + return parse_resource_filter(active_resources, include_str=inclusion, exclude_str=exclusion) def encode_world_info(world_info): @@ -389,8 +386,7 @@ def main(args=None): cuda_visible_devices = os.environ.get("CUDA_VISIBLE_DEVICES", "") if not resource_pool and len(cuda_visible_devices): detected_str = f"Detected CUDA_VISIBLE_DEVICES={cuda_visible_devices}" - if len(args.include) or len( - args.exclude) or args.num_nodes > 1 or args.num_gpus > 0: + if len(args.include) or len(args.exclude) or args.num_nodes > 1 or args.num_gpus > 0: print( f"{detected_str} but ignoring it because one or several of --include/--exclude/--num_gpus/--num_nodes cl args were used. If you want to use CUDA_VISIBLE_DEVICES don't pass any of these arguments to deepspeed." 
) @@ -416,20 +412,17 @@ def main(args=None): if not multi_node_exec and args.num_nodes > 1: raise ValueError("Num nodes is >1 but no extra nodes available via hostfile") - active_resources = parse_inclusion_exclusion(resource_pool, - args.include, - args.exclude) + active_resources = parse_inclusion_exclusion(resource_pool, args.include, args.exclude) env = os.environ.copy() # validate that passwordless-ssh is workly properly with this hostfile if multi_node_exec and not args.no_ssh_check: first_host = list(active_resources.keys())[0] try: - subprocess.check_call( - f'ssh -o PasswordAuthentication=no {first_host} hostname', - stderr=subprocess.DEVNULL, - stdout=subprocess.DEVNULL, - shell=True) + subprocess.check_call(f'ssh -o PasswordAuthentication=no {first_host} hostname', + stderr=subprocess.DEVNULL, + stdout=subprocess.DEVNULL, + shell=True) except subprocess.CalledProcessError: raise RuntimeError( f"Using hostfile at {args.hostfile} but host={first_host} was not reachable via ssh. If you are running with a single node please remove {args.hostfile} or setup passwordless ssh." @@ -481,13 +474,8 @@ def main(args=None): if not multi_node_exec: deepspeed_launch = [ - sys.executable, - "-u", - "-m", - "deepspeed.launcher.launch", - f"--world_info={world_info_base64}", - f"--master_addr={args.master_addr}", - f"--master_port={args.master_port}" + sys.executable, "-u", "-m", "deepspeed.launcher.launch", f"--world_info={world_info_base64}", + f"--master_addr={args.master_addr}", f"--master_port={args.master_port}" ] if args.no_python: deepspeed_launch.append("--no_python") @@ -498,12 +486,15 @@ def main(args=None): if args.save_pid: deepspeed_launch += ["--save_pid", f"{os.getpid()}"] if args.enable_each_rank_log: - deepspeed_launch.append( - f"--enable_each_rank_log={args.enable_each_rank_log}") + deepspeed_launch.append(f"--enable_each_rank_log={args.enable_each_rank_log}") if args.elastic_training: deepspeed_launch.append("--enable_elastic_training") deepspeed_launch.append(f"--max_elastic_nodes={args.max_elastic_nodes}") deepspeed_launch.append(f"--min_elastic_nodes={args.min_elastic_nodes}") + if args.bind_cores_to_rank: + deepspeed_launch.append("--bind_cores_to_rank") + if args.bind_core_list != None: + deepspeed_launch.append(f"--bind_core_list={args.bind_core_list}") cmd = deepspeed_launch + [args.user_script] + args.user_args else: args.launcher = args.launcher.lower() diff --git a/deepspeed/model_implementations/__init__.py b/deepspeed/model_implementations/__init__.py index 77d1c5b1e7cb7245c2bd7f55f2cc7548996e276e..ad95c58f76090188b5d9c73e1e3df4eb2b2c678f 100644 --- a/deepspeed/model_implementations/__init__.py +++ b/deepspeed/model_implementations/__init__.py @@ -1,4 +1,7 @@ -'''Copyright The Microsoft DeepSpeed Team''' +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team from .transformers.ds_transformer import DeepSpeedTransformerInference from .transformers.clip_encoder import DSClipEncoder diff --git a/deepspeed/model_implementations/diffusers/__init__.py b/deepspeed/model_implementations/diffusers/__init__.py index fcb45ab2b68516814a4bfbffebf2e01cbfefd527..6c5067f71c8faf166bc78e88f9b62e8627dda7c7 100644 --- a/deepspeed/model_implementations/diffusers/__init__.py +++ b/deepspeed/model_implementations/diffusers/__init__.py @@ -1 +1,5 @@ +# Copyright (c) Microsoft Corporation. 
+# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team '''Copyright The Microsoft DeepSpeed Team''' diff --git a/deepspeed/model_implementations/diffusers/unet.py b/deepspeed/model_implementations/diffusers/unet.py index 8ac97945ceb2ffd0eb456172efb07c4b886af876..6086d9fb98627b962f780f4b0c2d5e73e667597a 100644 --- a/deepspeed/model_implementations/diffusers/unet.py +++ b/deepspeed/model_implementations/diffusers/unet.py @@ -1,11 +1,14 @@ -''' -Copyright 2022 The Microsoft DeepSpeed Team -''' +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + import torch from ..features.cuda_graph import CUDAGraph class DSUNet(CUDAGraph, torch.nn.Module): + def __init__(self, unet, enable_cuda_graph=True): super().__init__(enable_cuda_graph=enable_cuda_graph) self.unet = unet @@ -59,5 +62,12 @@ class DSUNet(CUDAGraph, torch.nn.Module): self.cuda_graph_created = True - def _forward(self, sample, timestamp, encoder_hidden_states, return_dict=True): - return self.unet(sample, timestamp, encoder_hidden_states, return_dict) + def _forward(self, sample, timestamp, encoder_hidden_states, return_dict=True, cross_attention_kwargs=None): + if cross_attention_kwargs: + return self.unet(sample, + timestamp, + encoder_hidden_states, + return_dict, + cross_attention_kwargs=cross_attention_kwargs) + else: + return self.unet(sample, timestamp, encoder_hidden_states, return_dict) diff --git a/deepspeed/model_implementations/diffusers/vae.py b/deepspeed/model_implementations/diffusers/vae.py index 8f8d13cb2df2e73f1ab044755079225857fb5e52..445a9843921ad486750c801584c81f8930335d65 100644 --- a/deepspeed/model_implementations/diffusers/vae.py +++ b/deepspeed/model_implementations/diffusers/vae.py @@ -1,14 +1,18 @@ -''' -Copyright 2022 The Microsoft DeepSpeed Team -''' +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + import torch from ..features.cuda_graph import CUDAGraph class DSVAE(CUDAGraph, torch.nn.Module): + def __init__(self, vae, enable_cuda_graph=True): super().__init__(enable_cuda_graph=enable_cuda_graph) self.vae = vae + self.config = vae.config self.device = self.vae.device self.dtype = self.vae.dtype self.vae.requires_grad_(requires_grad=False) @@ -44,8 +48,7 @@ class DSVAE(CUDAGraph, torch.nn.Module): self.static_decoder_kwargs = kwargs with torch.cuda.graph(self._decoder_cuda_graph): - self.static_decoder_output = self._decode(*self.static_decoder_inputs, - **self.static_decoder_kwargs) + self.static_decoder_output = self._decode(*self.static_decoder_inputs, **self.static_decoder_kwargs) self.decoder_cuda_graph_created = True @@ -88,8 +91,7 @@ class DSVAE(CUDAGraph, torch.nn.Module): self.static_encoder_kwargs = kwargs with torch.cuda.graph(self._encoder_cuda_graph): - self.static_encoder_output = self._encode(*self.static_encoder_inputs, - **self.static_encoder_kwargs) + self.static_encoder_output = self._encode(*self.static_encoder_inputs, **self.static_encoder_kwargs) self.encoder_cuda_graph_created = True diff --git a/deepspeed/model_implementations/features/__init__.py b/deepspeed/model_implementations/features/__init__.py index fcb45ab2b68516814a4bfbffebf2e01cbfefd527..6c5067f71c8faf166bc78e88f9b62e8627dda7c7 100644 --- a/deepspeed/model_implementations/features/__init__.py +++ b/deepspeed/model_implementations/features/__init__.py @@ -1 +1,5 @@ +# Copyright (c) Microsoft Corporation. 
+# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team '''Copyright The Microsoft DeepSpeed Team''' diff --git a/deepspeed/model_implementations/features/cuda_graph.py b/deepspeed/model_implementations/features/cuda_graph.py index 3224f625c6bc73013170446636e5ed3fe0567e71..fc3a2951665426b5c98112ce307e395993644f8b 100644 --- a/deepspeed/model_implementations/features/cuda_graph.py +++ b/deepspeed/model_implementations/features/cuda_graph.py @@ -1,10 +1,13 @@ -''' -Copyright 2023 The Microsoft DeepSpeed Team -''' +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + from abc import ABC, abstractmethod class CUDAGraph(ABC): + def __init__(self, enable_cuda_graph=False): super().__init__() self.enable_cuda_graph = enable_cuda_graph diff --git a/deepspeed/model_implementations/transformers/__init__.py b/deepspeed/model_implementations/transformers/__init__.py index fcb45ab2b68516814a4bfbffebf2e01cbfefd527..6c5067f71c8faf166bc78e88f9b62e8627dda7c7 100644 --- a/deepspeed/model_implementations/transformers/__init__.py +++ b/deepspeed/model_implementations/transformers/__init__.py @@ -1 +1,5 @@ +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team '''Copyright The Microsoft DeepSpeed Team''' diff --git a/deepspeed/model_implementations/transformers/clip_encoder.py b/deepspeed/model_implementations/transformers/clip_encoder.py index efa282c4449f4225b563e3650bebf4e19df4c8f5..8d9291896986a42ab02d83012936ef16304cd18c 100644 --- a/deepspeed/model_implementations/transformers/clip_encoder.py +++ b/deepspeed/model_implementations/transformers/clip_encoder.py @@ -1,12 +1,15 @@ -''' -Copyright 2022 The Microsoft DeepSpeed Team -''' +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + import torch from deepspeed.accelerator import get_accelerator from ..features.cuda_graph import CUDAGraph class DSClipEncoder(CUDAGraph, torch.nn.Module): + def __init__(self, enc, enable_cuda_graph=False): super().__init__(enable_cuda_graph=enable_cuda_graph) enc.text_model._build_causal_attention_mask = self._build_causal_attention_mask @@ -22,11 +25,7 @@ class DSClipEncoder(CUDAGraph, torch.nn.Module): self.config = self.enc.config def _build_causal_attention_mask(self, bsz, seq_len, dtype): - mask = torch.empty(bsz, - seq_len, - seq_len, - dtype=dtype, - device=get_accelerator().current_device_name()) + mask = torch.empty(bsz, seq_len, seq_len, dtype=dtype, device=get_accelerator().current_device_name()) mask.fill_(torch.tensor(torch.finfo(dtype).min)) mask.triu_(1) mask = mask.unsqueeze(1) @@ -69,9 +68,8 @@ class DSClipEncoder(CUDAGraph, torch.nn.Module): self.static_kwargs[self.iter] = kwargs with torch.cuda.graph(self._cuda_graphs[self.iter]): - self.static_output[self.iter] = self._forward( - *self.static_inputs[self.iter], - **self.static_kwargs[self.iter]) + self.static_output[self.iter] = self._forward(*self.static_inputs[self.iter], + **self.static_kwargs[self.iter]) self.cuda_graph_created[self.iter] = True diff --git a/deepspeed/model_implementations/transformers/ds_base.py b/deepspeed/model_implementations/transformers/ds_base.py index 9a848eaae1ad18f42d4039331fc4ae12e3380163..b4b113904997fd124aecdfe51529aab6c9e1b25c 100644 --- a/deepspeed/model_implementations/transformers/ds_base.py +++ b/deepspeed/model_implementations/transformers/ds_base.py @@ -1,9 +1,13 @@ -'''Copyright The Microsoft DeepSpeed Team''' +# Copyright (c) Microsoft Corporation. 
+# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team import torch.nn as nn class DeepSpeedTransformerBase(nn.module): + def __init__(self): pass diff --git a/deepspeed/model_implementations/transformers/ds_bert.py b/deepspeed/model_implementations/transformers/ds_bert.py index 3e7a7b77fddf77ba0f41bf36f968746bb5c8749d..13075553ec8b6d818744cc5404ef4db31283ac8e 100644 --- a/deepspeed/model_implementations/transformers/ds_bert.py +++ b/deepspeed/model_implementations/transformers/ds_bert.py @@ -1,6 +1,7 @@ -''' -Copyright 2022 The Microsoft DeepSpeed Team -''' +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team from deepspeed.model_implementations.transformers.ds_transformer import DeepSpeedTransformerInference @@ -8,6 +9,7 @@ from deepspeed.model_implementations.transformers.ds_transformer import DeepSpee class DeepSpeedBERTInference(DeepSpeedTransformerInference): """Initialize the DeepSpeed BERT Transformer Layer. """ + def __init__(self, config, mp_group=None, @@ -15,9 +17,4 @@ class DeepSpeedBERTInference(DeepSpeedTransformerInference): quantize_groups=1, merge_count=1, mlp_extra_grouping=False): - super().__init__(config, - mp_group, - quantize_scales, - quantize_groups, - merge_count, - mlp_extra_grouping) + super().__init__(config, mp_group, quantize_scales, quantize_groups, merge_count, mlp_extra_grouping) diff --git a/deepspeed/model_implementations/transformers/ds_bloom.py b/deepspeed/model_implementations/transformers/ds_bloom.py index 386352f28a76de5020b7c1b111dae8e998ae1094..c48c7ed58ae5e2746c513a1acea509bbfd129784 100644 --- a/deepspeed/model_implementations/transformers/ds_bloom.py +++ b/deepspeed/model_implementations/transformers/ds_bloom.py @@ -1,6 +1,7 @@ -''' -Copyright 2022 The Microsoft DeepSpeed Team -''' +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team from deepspeed.model_implementations.transformers.ds_transformer import DeepSpeedTransformerInference @@ -8,6 +9,7 @@ from deepspeed.model_implementations.transformers.ds_transformer import DeepSpee class DeepSpeedBloomInference(DeepSpeedTransformerInference): """Initialize the DeepSpeed Bloom Transformer Layer. """ + def __init__(self, config, mp_group=None, @@ -15,9 +17,4 @@ class DeepSpeedBloomInference(DeepSpeedTransformerInference): quantize_groups=1, merge_count=1, mlp_extra_grouping=False): - super().__init__(config, - mp_group, - quantize_scales, - quantize_groups, - merge_count, - mlp_extra_grouping) + super().__init__(config, mp_group, quantize_scales, quantize_groups, merge_count, mlp_extra_grouping) diff --git a/deepspeed/model_implementations/transformers/ds_gpt.py b/deepspeed/model_implementations/transformers/ds_gpt.py index 86cc9fdc6715f776551888a6b94cf918b1681406..3c3baed1f6186a89ed2b852f27ad56a5ab00d9f6 100644 --- a/deepspeed/model_implementations/transformers/ds_gpt.py +++ b/deepspeed/model_implementations/transformers/ds_gpt.py @@ -1,6 +1,7 @@ -''' -Copyright 2022 The Microsoft DeepSpeed Team -''' +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team from deepspeed.model_implementations.transformers.ds_transformer import DeepSpeedTransformerInference @@ -8,6 +9,7 @@ from deepspeed.model_implementations.transformers.ds_transformer import DeepSpee class DeepSpeedGPTInference(DeepSpeedTransformerInference): """Initialize the DeepSpeed GPT Transformer Layer. 
""" + def __init__(self, config, mp_group=None, @@ -15,9 +17,4 @@ class DeepSpeedGPTInference(DeepSpeedTransformerInference): quantize_groups=1, merge_count=1, mlp_extra_grouping=False): - super().__init__(config, - mp_group, - quantize_scales, - quantize_groups, - merge_count, - mlp_extra_grouping) + super().__init__(config, mp_group, quantize_scales, quantize_groups, merge_count, mlp_extra_grouping) diff --git a/deepspeed/model_implementations/transformers/ds_megatron_gpt.py b/deepspeed/model_implementations/transformers/ds_megatron_gpt.py index aca6b809e5df12f03e2b3a83568611ed277a331d..055ed6d27d7db4cb39cf868d5c189ef517f4a08a 100644 --- a/deepspeed/model_implementations/transformers/ds_megatron_gpt.py +++ b/deepspeed/model_implementations/transformers/ds_megatron_gpt.py @@ -1,6 +1,7 @@ -''' -Copyright 2022 The Microsoft DeepSpeed Team -''' +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team from deepspeed.model_implementations.transformers.ds_transformer import DeepSpeedTransformerInference @@ -8,6 +9,7 @@ from deepspeed.model_implementations.transformers.ds_transformer import DeepSpee class DeepSpeedMegatronGPTInference(DeepSpeedTransformerInference): """Initialize the DeepSpeed Megatron GPT Transformer Layer. """ + def __init__(self, config, mp_group=None, @@ -15,9 +17,4 @@ class DeepSpeedMegatronGPTInference(DeepSpeedTransformerInference): quantize_groups=1, merge_count=1, mlp_extra_grouping=False): - super().__init__(config, - mp_group, - quantize_scales, - quantize_groups, - merge_count, - mlp_extra_grouping) + super().__init__(config, mp_group, quantize_scales, quantize_groups, merge_count, mlp_extra_grouping) diff --git a/deepspeed/model_implementations/transformers/ds_opt.py b/deepspeed/model_implementations/transformers/ds_opt.py index a5209a30f818bfab912568872070bba046b96d09..7bc5524d71c7155fad0b40eafeca490705f3e324 100644 --- a/deepspeed/model_implementations/transformers/ds_opt.py +++ b/deepspeed/model_implementations/transformers/ds_opt.py @@ -1,6 +1,7 @@ -''' -Copyright 2022 The Microsoft DeepSpeed Team -''' +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team from deepspeed.model_implementations.transformers.ds_transformer import DeepSpeedTransformerInference @@ -8,6 +9,7 @@ from deepspeed.model_implementations.transformers.ds_transformer import DeepSpee class DeepSpeedOPTInference(DeepSpeedTransformerInference): """Initialize the DeepSpeed OPT Transformer Layer. """ + def __init__(self, config, mp_group=None, @@ -15,9 +17,4 @@ class DeepSpeedOPTInference(DeepSpeedTransformerInference): quantize_groups=1, merge_count=1, mlp_extra_grouping=False): - super().__init__(config, - mp_group, - quantize_scales, - quantize_groups, - merge_count, - mlp_extra_grouping) + super().__init__(config, mp_group, quantize_scales, quantize_groups, merge_count, mlp_extra_grouping) diff --git a/deepspeed/model_implementations/transformers/ds_transformer.py b/deepspeed/model_implementations/transformers/ds_transformer.py index ee5a9bdf8763a743b8f233c0c48e7e6befade499..6ef838cea7410597280372fd684e5afbee068e88 100644 --- a/deepspeed/model_implementations/transformers/ds_transformer.py +++ b/deepspeed/model_implementations/transformers/ds_transformer.py @@ -1,6 +1,7 @@ -''' -Copyright 2022 The Microsoft DeepSpeed Team -''' +# Copyright (c) Microsoft Corporation. 
+# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team import torch import torch.nn as nn @@ -56,37 +57,26 @@ class DeepSpeedTransformerInference(nn.Module): log_dist(f"DeepSpeed-Inference config: {self.config.__dict__}", [0]) if self.config.bigscience_bloom: - self.attention = BloomSelfAttention(self.config, - mp_group, - quantize_scales, - quantize_groups, - merge_count) + self.attention = BloomSelfAttention(self.config, mp_group, quantize_scales, quantize_groups, merge_count) else: - self.attention = DeepSpeedSelfAttention(self.config, - mp_group, - quantize_scales, - quantize_groups, + self.attention = DeepSpeedSelfAttention(self.config, mp_group, quantize_scales, quantize_groups, merge_count) - self.mlp = DeepSpeedMLP(self.config, - mp_group, - quantize_scales, - quantize_groups, - merge_count, + self.mlp = DeepSpeedMLP(self.config, mp_group, quantize_scales, quantize_groups, merge_count, mlp_extra_grouping) - device = get_accelerator().current_device_name( - ) # if config.bigscience_bloom else 'cpu' - self.norm_w = nn.Parameter(torch.empty(self.config.hidden_size, - dtype=data_type, - device=device), - requires_grad=False) - self.norm_b = nn.Parameter(torch.empty(self.config.hidden_size, - dtype=data_type, - device=device), - requires_grad=False) + device = get_accelerator().current_device_name() # if config.bigscience_bloom else 'cpu' + if self.config.set_empty_params: + self.norm_w = None + self.norm_b = None + else: + self.norm_w = nn.Parameter(torch.empty(self.config.hidden_size, dtype=data_type, device=device), + requires_grad=False) + self.norm_b = nn.Parameter(torch.empty(self.config.hidden_size, dtype=data_type, device=device), + requires_grad=False) self.layer_past = None self.allocate_workspace = inference_cuda_module.allocate_workspace_fp32 if (not config.fp16) else \ inference_cuda_module.allocate_workspace_fp16 + self._alloc_workspace = True @classmethod def reset_cache(cls): @@ -114,25 +104,25 @@ class DeepSpeedTransformerInference(nn.Module): # TODO(arashb): 'layer_head_mask' and 'past_key_value' are only added to satisfy the OPT models API. # This needs to be redesigned later! 
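        # Note: the **kwargs parameter added below lets callers that pass hidden_states
        # (and other extra keyword arguments from newer HF forward signatures) still reach
        # this layer; hidden_states is read back out of kwargs in the body below.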
layer_head_mask=None, - past_key_value=None): + past_key_value=None, + **kwargs): if x is not None: input = x + if "hidden_states" in kwargs: + input = kwargs["hidden_states"] - input_mask = (input_mask if attn_mask is None else - attn_mask) if attention_mask is None else attention_mask + input_mask = (input_mask if attn_mask is None else attn_mask) if attention_mask is None else attention_mask # Allocate memory only on first layer forward - if self.config.layer_id == 0: - self.allocate_workspace(self.config.hidden_size, - self.config.heads, + if self.config.layer_id == 0 and self._alloc_workspace: + self.allocate_workspace(self.config.hidden_size, self.config.heads, input.size()[1], - input.size()[0], - DeepSpeedTransformerInference.layer_id, - self.config.mp_size, + input.size()[0], DeepSpeedTransformerInference.layer_id, self.config.mp_size, self.config.bigscience_bloom, - dist.get_rank() if dist.is_initialized() else 0, - self.config.max_out_tokens) + dist.get_rank() if dist.is_initialized() else 0, self.config.max_out_tokens, + self.config.min_out_tokens) + self._alloc_workspace = False get_present = (get_present or get_key_value or use_cache) input_mask = input_mask if attention_mask is None else attention_mask @@ -171,10 +161,7 @@ class DeepSpeedTransformerInference(nn.Module): output = self.mlp(attention_output, input, inp_norm, self.attention.attn_ob) if not self.config.pre_layer_norm: - output = inference_cuda_module.layer_norm(output, - self.norm_w, - self.norm_b, - self.config.epsilon) + output = inference_cuda_module.layer_norm(output, self.norm_w, self.norm_b, self.config.epsilon) output = output.to(input_type) if get_present: diff --git a/deepspeed/module_inject/__init__.py b/deepspeed/module_inject/__init__.py index aab302864051d55db567dfd5868dbe8874d841ee..4bdabf383b26ee23b286baa4387ff932b80d1614 100755 --- a/deepspeed/module_inject/__init__.py +++ b/deepspeed/module_inject/__init__.py @@ -1,4 +1,7 @@ -'''Copyright The Microsoft DeepSpeed Team''' +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team from .replace_module import replace_transformer_layer, revert_transformer_layer, ReplaceWithTensorSlicing, GroupQuantizer, generic_injection from .module_quantize import quantize_transformer_layer diff --git a/deepspeed/module_inject/auto_tp.py b/deepspeed/module_inject/auto_tp.py index a2c570f5781c724bca1a92a8c17a3744aad7f718..bf49df9781f56dcbacf8e4ef2888d9c5c437d6e4 100644 --- a/deepspeed/module_inject/auto_tp.py +++ b/deepspeed/module_inject/auto_tp.py @@ -1,4 +1,7 @@ -'''Copyright The Microsoft DeepSpeed Team''' +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team # Automatic Tensor Parallelism import re @@ -8,6 +11,7 @@ from .replace_policy import replace_policies class AutoTP(): + def in_module_list(module, module_list): for item in module_list: if type(item).__name__ == type(module).__name__: @@ -28,18 +32,7 @@ class AutoTP(): return mlist def supported(model): - unsupported = [ - 'bloom', - 'codegen', - 'deberta', - 'flaubert', - 'fsmt', - 'gpt2', - 'led', - 'longformer', - 'xlm', - 'xlnet' - ] + unsupported = ['codegen', 'deberta', 'flaubert', 'fsmt', 'gpt2', 'led', 'longformer', 'xlm', 'xlnet'] model = str(model) key = re.search(r": (.*?)Model", model) if key is None: @@ -56,8 +49,7 @@ class AutoTP(): for key, submodule in module._modules.items(): if isinstance(submodule, nn.Linear): layer_list = layer_list + [parent + "." 
+ key] - elif isinstance(submodule, - nn.LayerNorm) or key == 'LayerNorm' or key == 'layer_norm': + elif isinstance(submodule, nn.LayerNorm) or key == 'LayerNorm' or key == 'layer_norm': layer_list = layer_list + ["ln"] else: layer_list = layer_list + AutoTP.get_layers(key, submodule) @@ -102,9 +94,7 @@ class AutoTP(): for key, submodule in module._modules.items(): if isinstance(submodule, nn.Linear): layer_list = layer_list + ["." + key] - elif isinstance( - submodule, - nn.LayerNorm) or key == 'LayerNorm' or key == 'layer_norm': + elif isinstance(submodule, nn.LayerNorm) or key == 'LayerNorm' or key == 'layer_norm': layer_list = layer_list + ["ln"] else: layer_list = layer_list + AutoTP.get_layers(key, submodule) diff --git a/deepspeed/module_inject/containers/__init__.py b/deepspeed/module_inject/containers/__init__.py index 6abef517a500d477f3d049606e4365c0b3f4eda1..4655b29b5ba616cb3c04a49d090fdd8f220c901d 100644 --- a/deepspeed/module_inject/containers/__init__.py +++ b/deepspeed/module_inject/containers/__init__.py @@ -1,4 +1,7 @@ -'''Copyright The Microsoft DeepSpeed Team''' +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team from .bert import DS_BERTContainer, HFBertLayerPolicy from .bloom import DS_BloomContainer, BLOOMLayerPolicy, supported_models diff --git a/deepspeed/module_inject/containers/base.py b/deepspeed/module_inject/containers/base.py index 45faeb47795199bf1378b9bef918ac66f7b6a6ec..0d2d6fc99ef949fb6c35b5fc965aa7d984e99e76 100644 --- a/deepspeed/module_inject/containers/base.py +++ b/deepspeed/module_inject/containers/base.py @@ -1,4 +1,7 @@ -'''Copyright The Microsoft DeepSpeed Team''' +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team # Create a container object to save model-specific tensors using the policy file above. 
from abc import ABC @@ -15,6 +18,7 @@ class BaseConvolutionContainer(ABC): class BaseTransformerContainer(ABC): + def __init__(self, policy, config, model_config, layer_id, child): self.policy = policy self.config = config @@ -30,28 +34,22 @@ class BaseTransformerContainer(ABC): self.hidden_size = None self.num_attention_heads = None self.mp_size = self.config.tensor_parallel.tp_size - self.pre_layer_norm = self.policy.pre_attn_norm + self.pre_layer_norm = self.model_config.do_layer_norm_before if \ + hasattr(self.model_config, 'do_layer_norm_before') else self.policy.pre_attn_norm self.fp16 = False self.attn_linear_layer = self.policy.linear_layer self.mlp_linear_layer = self.policy.linear_layer - self.layer_norm_eps = self.model_config.layer_norm_eps if \ - hasattr(self.model_config, 'layer_norm_eps') else (self.model_config.layer_norm_epsilon if \ - hasattr(self.model_config, 'layer_norm_epsilon') else self.model_config.layernorm_epsilon if \ - hasattr(self.model_config, 'layernorm_epsilon') else 1.0e-12) self.return_tuple = self.config.return_tuple self.triangular_masking = True - self.local_attention = ((self.model_config.attention_layers[self.layer_id] - == "local") if hasattr(self.model_config, - 'attention_layers') else False) + self.local_attention = ((self.model_config.attention_layers[self.layer_id] == "local") if hasattr( + self.model_config, 'attention_layers') else False) self.window_size = getattr(self.model_config, "window_size", 1) self.mlp_act_func_type = self.policy.mlp_act_func_type self.training_mp_size = self.config.training_mp_size self.bigscience_bloom = False self.max_out_tokens = self.config.max_out_tokens - self.scale_attn_by_inverse_layer_idx = getattr( - self.config, - "scale_attn_by_inverse_layer_idx", - False) + self.min_out_tokens = self.config.min_out_tokens + self.scale_attn_by_inverse_layer_idx = getattr(self.config, "scale_attn_by_inverse_layer_idx", False) self.use_mup = self.policy.use_mup self.return_single_tuple = False self.rotary_dim = self.model_config.rotary_dim if hasattr(self.model_config, 'rotary_dim') \ @@ -75,6 +73,8 @@ class BaseTransformerContainer(ABC): self.input_nw = None self.input_nb = None + self.mp_group = None + def create_ds_model_config(self): self.set_hidden_heads(*self.policy.get_hidden_heads()) assert self.num_attention_heads % self.mp_size == 0,\ @@ -84,11 +84,11 @@ class BaseTransformerContainer(ABC): self.ds_model_config = DeepSpeedInferenceConfig( hidden_size=self.hidden_size, heads=self.num_attention_heads, - layer_norm_eps=self.layer_norm_eps, + layer_norm_eps=self.layernorm_epsilon, fp16=self.fp16, pre_layer_norm=self.pre_layer_norm, mp_size=self.mp_size, - q_int8=self.quantize, + q_int8=self.quantize if hasattr(self, 'quantize') else False, return_tuple=self.return_tuple, triangular_masking=self.triangular_masking, local_attention=self.local_attention, @@ -99,18 +99,24 @@ class BaseTransformerContainer(ABC): training_mp_size=self.training_mp_size, bigscience_bloom=self.bigscience_bloom, max_out_tokens=self.max_out_tokens, + min_out_tokens=self.min_out_tokens, scale_attn_by_inverse_layer_idx=self.scale_attn_by_inverse_layer_idx, use_mup=self.use_mup, return_single_tuple=self.return_single_tuple, - ) + set_empty_params=self.config.set_empty_params, + transposed_mode=self.config.transposed_mode) return self.ds_model_config - def initialize_tensors(self): + def initialize_tensors(self, enable_training=False): # Set the tensors from policy (user module) to container (DS module) - self.set_attention(*self.policy.attention()) 
+ self.set_attention(*self.policy.attention(enable_training=enable_training)) self.set_mlp(*self.policy.mlp()) self.set_layernorm(*self.policy.layernorm()) + self.set_lora_params(self.policy.get_lora_params()) + self.q_k_v = self.policy.get_q_k_v() + if self.q_k_v is not None: + self.set_q_k_v(*self.q_k_v) def convert_to_required_dtype(self, dtype): # Note: converting tensors to fp16 requires that we do it in-place using self.__dict__ and not make a list/dict copy @@ -138,9 +144,10 @@ class BaseTransformerContainer(ABC): self.quantize = quantize self.quantizer = quantizer - def set_hidden_heads(self, hidden_size, num_attention_heads): + def set_hidden_heads(self, hidden_size, num_attention_heads, epsilon): self.hidden_size = hidden_size self.num_attention_heads = num_attention_heads + self.layernorm_epsilon = epsilon def set_attention(self, qkvw, qkvb, dense_w, dense_b): self.qkvw = qkvw @@ -148,6 +155,17 @@ class BaseTransformerContainer(ABC): self.dense_w = dense_w self.dense_b = dense_b + def set_lora_params(self, lora_params): + self.lora_params = lora_params + + def set_q_k_v(self, qw, qb, kw, kb, vw, vb): + self.qw = qw + self.qb = qb + self.kw = kw + self.kb = kb + self.vw = vw + self.vb = vb + def set_mlp(self, _h4h_w, _h4h_b, _4hh_w, _4hh_b): self._h4h_w = _h4h_w self._h4h_b = _h4h_b @@ -168,63 +186,184 @@ class BaseTransformerContainer(ABC): self.mlp_quantization() def attention_quantization(self): - self.module.attention.attn_qkvw = self.quantizer.quantize( - self.module.attention.attn_qkvw) - self.module.attention.attn_ow = self.quantizer.quantize( - self.module.attention.attn_ow) + self.module.attention.attn_qkvw = self.quantizer.quantize(self.module.attention.attn_qkvw) + self.module.attention.attn_ow = self.quantizer.quantize(self.module.attention.attn_ow) def mlp_quantization(self): self.module.mlp.inter_w = self.quantizer.quantize(self.module.mlp.inter_w) self.module.mlp.output_w = self.quantizer.quantize(self.module.mlp.output_w) - def apply_tensor_parallelism(self, mp_replace): + def apply_tensor_parallelism(self, mp_replace=None, mp_group=None, tp_size=None): + reversed_dim = False + if mp_replace is None: + from deepspeed.module_inject import ReplaceWithTensorSlicing + mp_replace = ReplaceWithTensorSlicing(mp_group=mp_group, mp_size=tp_size, out_dim=0, in_dim=1) + reversed_dim = True # setup the new Attention module - self.attention_qkv_mp(mp_replace) - self.attention_o_mp(mp_replace) + if self.module.attention.attn_qkvw is None: + self.attention_q_k_v_mp(mp_replace, reversed_dim=reversed_dim) + else: + self.attention_qkv_mp(mp_replace, reversed_dim=reversed_dim) + self.attention_o_mp(mp_replace, reversed_dim=reversed_dim) # setup the new MLP module - self.mlp_inter_mp(mp_replace) - self.mlp_output_mp(mp_replace) + self.mlp_inter_mp(mp_replace, reversed_dim=reversed_dim) + self.mlp_output_mp(mp_replace, reversed_dim=reversed_dim) # Apply weight quantization - self.apply_weight_quantization() - - def attention_qkv_mp(self, mp_replace): - self.module.attention.attn_qkvw = mp_replace.qkv_copy( - self.module.attention.attn_qkvw, - self.qkvw) - self.module.attention.attn_qkvb = mp_replace.qkv_copy( - self.module.attention.attn_qkvb, - self.qkvb) - - def attention_o_mp(self, mp_replace): - self.module.attention.attn_ow = mp_replace.copy(self.module.attention.attn_ow, - self.dense_w) + #self.apply_weight_quantization() + + def attention_qkv_mp(self, mp_replace, reversed_dim=False): + if reversed_dim: + self.module.attention.attn_qkvw = mp_replace.qkv_copy( + 
self.module.attention.attn_qkvw[:self.qkvw.shape[0] // mp_replace.mp_size], + self.qkvw, + int8=reversed_dim) + self.module.attention.attn_qkvb = mp_replace.qkv_copy( + self.module.attention.attn_qkvb[:self.qkvw.shape[0] // mp_replace.mp_size], + self.qkvb, + int8=reversed_dim) + else: + self.module.attention.attn_qkvw = mp_replace.qkv_copy(self.module.attention.attn_qkvw, + self.qkvw, + int8=reversed_dim) + self.module.attention.attn_qkvb = mp_replace.qkv_copy(self.module.attention.attn_qkvb, + self.qkvb, + int8=reversed_dim) + + def attention_q_k_v_mp(self, mp_replace, reversed_dim=False): + self.module.attention.attn_qw = mp_replace.copy(self.module.attention.attn_qw[:self.qw.shape[0] // + mp_replace.mp_size], + self.qw, + int8=reversed_dim, + allocat_tensor=reversed_dim) + self.module.attention.attn_kw = mp_replace.copy(self.module.attention.attn_kw[:self.qw.shape[0] // + mp_replace.mp_size], + self.kw, + int8=reversed_dim, + allocat_tensor=reversed_dim) + self.module.attention.attn_vw = mp_replace.copy(self.module.attention.attn_vw[:self.qw.shape[0] // + mp_replace.mp_size], + self.vw, + int8=reversed_dim, + allocat_tensor=reversed_dim) + self.module.attention.attn_qb = mp_replace.copy( + self.module.attention.attn_qb[:self.qw.shape[0] // mp_replace.mp_size], + self.qb, + int8=reversed_dim, + allocat_tensor=reversed_dim) if self.module.attention.attn_qb is not None else None + self.module.attention.attn_kb = mp_replace.copy( + self.module.attention.attn_kb[:self.qw.shape[0] // mp_replace.mp_size], + self.kb, + int8=reversed_dim, + allocat_tensor=reversed_dim) if self.module.attention.attn_kb is not None else None + self.module.attention.attn_vb = mp_replace.copy( + self.module.attention.attn_vb[:self.qw.shape[0] // mp_replace.mp_size], + self.vb, + int8=reversed_dim, + allocat_tensor=reversed_dim) if self.module.attention.attn_vb is not None else None + + def attention_o_mp(self, mp_replace, reversed_dim=False): + if reversed_dim: + self.module.attention.attn_ow = mp_replace.copy(self.module.attention.attn_ow[:, :self.dense_w.shape[1] // + mp_replace.mp_size], + self.dense_w, + int8=reversed_dim, + allocat_tensor=reversed_dim) + else: + self.module.attention.attn_ow = mp_replace.copy(self.module.attention.attn_ow, + self.dense_w, + int8=reversed_dim) self.module.attention.attn_ob = mp_replace.copy(self.module.attention.attn_ob, - self.dense_b) - - def mlp_inter_mp(self, mp_replace): - self.module.mlp.inter_w = mp_replace.copy(self.module.mlp.inter_w, self._h4h_w) - self.module.mlp.inter_b = mp_replace.copy(self.module.mlp.inter_b, self._h4h_b) - - def mlp_output_mp(self, mp_replace): - self.module.mlp.output_w = mp_replace.copy(self.module.mlp.output_w, self._4hh_w) - self.module.mlp.output_b = mp_replace.copy(self.module.mlp.output_b, self._4hh_b) + self.dense_b, + int8=reversed_dim, + allocat_tensor=reversed_dim) + + def mlp_inter_mp(self, mp_replace, reversed_dim=False): + if reversed_dim: + self.module.mlp.inter_w = mp_replace.copy(self.module.mlp.inter_w[:self._h4h_w.shape[0] // + mp_replace.mp_size], + self._h4h_w, + int8=reversed_dim, + allocat_tensor=reversed_dim) + self.module.mlp.inter_b = mp_replace.copy( + self.module.mlp.inter_b[:self._h4h_w.shape[0] // mp_replace.mp_size], + self._h4h_b, + int8=reversed_dim, + allocat_tensor=reversed_dim) if self.module.mlp.inter_b is not None else None + else: + self.module.mlp.inter_w = mp_replace.copy(self.module.mlp.inter_w, self._h4h_w, int8=reversed_dim) + self.module.mlp.inter_b = mp_replace.copy(self.module.mlp.inter_b, 
self._h4h_b, int8=reversed_dim) + + def mlp_output_mp(self, mp_replace, reversed_dim=False): + if reversed_dim: + self.module.mlp.output_w = mp_replace.copy(self.module.mlp.output_w[:, :self._4hh_w.shape[1] // + mp_replace.mp_size], + self._4hh_w, + int8=reversed_dim, + allocat_tensor=reversed_dim) + else: + self.module.mlp.output_w = mp_replace.copy(self.module.mlp.output_w, self._4hh_w, int8=reversed_dim) + self.module.mlp.output_b = mp_replace.copy(self.module.mlp.output_b, + self._4hh_b, + int8=reversed_dim, + allocat_tensor=reversed_dim) + + def release_qkv(self): + del self.module.attention.attn_qkvw + del self.module.attention.attn_qkvb + self.module.attention.attn_qkvw = self.qkvw + self.module.attention.attn_qkvb = self.qkvb + if self.module.attention.attn_qw is not None: + qkv_data = [self.module.attention.attn_qw.data, \ + self.module.attention.attn_qb.data if self.module.attention.attn_qb is not None else None, \ + self.module.attention.attn_kw.data, \ + self.module.attention.attn_kb.data if self.module.attention.attn_kb is not None else None, \ + self.module.attention.attn_vw.data, \ + self.module.attention.attn_vb.data if self.module.attention.attn_vb is not None else None] + for data in qkv_data: + del data + + self.module.attention.attn_qw = self.qw + self.module.attention.attn_qb = self.qb + self.module.attention.attn_kw = self.kw + self.module.attention.attn_kb = self.kb + self.module.attention.attn_vw = self.vw + self.module.attention.attn_vb = self.vb + + def release_memory(self): + self.release_qkv() + del self.module.attention.attn_ow + del self.module.attention.attn_ob + self.module.attention.attn_ow = self.dense_w + self.module.attention.attn_ob = self.dense_b + del self.module.mlp.inter_w + del self.module.mlp.inter_b + del self.module.mlp.output_w + del self.module.mlp.output_b + self.module.mlp.inter_w = self._h4h_w + self.module.mlp.inter_b = self._h4h_b + self.module.mlp.output_w = self._4hh_w + self.module.mlp.output_b = self._4hh_b def copy_data_to_new_module(self): if self.attn_nw is None: self.module.mlp.attn_nw = self.attn_nw self.module.mlp.attn_nb = self.attn_nb else: - self.module.mlp.attn_nw.data.copy_( - self.attn_nw.to(get_accelerator().current_device_name())) - self.module.mlp.attn_nb.data.copy_( - self.attn_nb.to(get_accelerator().current_device_name())) + self.module.mlp.attn_nw.data.copy_(self.attn_nw.to(get_accelerator().current_device_name())) + self.module.mlp.attn_nb.data.copy_(self.attn_nb.to(get_accelerator().current_device_name())) - self.module.norm_w.data.copy_( - self.input_nw.to(get_accelerator().current_device_name())) - self.module.norm_b.data.copy_( - self.input_nb.to(get_accelerator().current_device_name())) + self.module.norm_w.data.copy_(self.input_nw.to(get_accelerator().current_device_name())) + self.module.norm_b.data.copy_(self.input_nb.to(get_accelerator().current_device_name())) + + def align_merged_qkv(self): + if hasattr(self, '_align_merged_qkv'): + self._align_merged_qkv() + + def partition_merged_qkv(self): + if hasattr(self, '_partition_merged_qkv'): + self._partition_merged_qkv() def transpose(self): self.transpose_attention() @@ -246,3 +385,110 @@ class BaseTransformerContainer(ABC): data = data.reshape(data.shape[-1], data.shape[-2]) data.to(get_accelerator().current_device_name()) return data + + def reset_qkv_experimental(self): + if self.module.attention.attn_qkvw is None: + self.module.attention.attn_qkvw = torch.empty(self.qw.shape[0] * 3, + self.qw.shape[0], + dtype=self.qw.dtype, + device=self.qw.device) + 
self.module.attention.attn_qkvb = torch.empty(self.qw.shape[0] * 3, + dtype=self.qw.dtype, + device=self.qw.device) + self.module.attention.attn_qkvw.data[:self.qw.shape[0]] = self.qw.data + self.module.attention.attn_qkvb.data[:self.qw.shape[0]] = self.qb.data + self.module.attention.attn_qkvw.data[self.qw.shape[0]:2 * self.qw.shape[0]] = self.kw.data + self.module.attention.attn_qkvb.data[self.qw.shape[0]:2 * self.qw.shape[0]] = self.kb.data + self.module.attention.attn_qkvw.data[2 * self.qw.shape[0]:] = self.vw.data + self.module.attention.attn_qkvb.data[2 * self.qw.shape[0]:] = self.vb.data + + qkv_data = [self.qw.data, \ + self.qb.data, \ + self.kw.data, \ + self.kb.data, \ + self.vw.data, \ + self.vb.data] + + self.qw.data = self.module.attention.attn_qkvw.data[:self.qw.shape[0]] + self.qb.data = self.module.attention.attn_qkvb.data[:self.qw.shape[0]] + self.kw.data = self.module.attention.attn_qkvw.data[self.qw.shape[0]:2 * self.qw.shape[0]] + self.kb.data = self.module.attention.attn_qkvb.data[self.qw.shape[0]:2 * self.qw.shape[0]] + self.vw.data = self.module.attention.attn_qkvw.data[2 * self.qw.shape[0]:] + self.vb.data = self.module.attention.attn_qkvb.data[2 * self.qw.shape[0]:] + + for data in qkv_data: + del data + + def reset_qkv(self): + self.qkvw.data[:self.qw.shape[0]] = self.qw.data + self.qkvw.data[self.qw.shape[0]:2 * self.qw.shape[0]] = self.kw.data + self.qkvw.data[2 * self.qw.shape[0]:] = self.vw.data + if self.qkvb is not None: + self.qkvb.data[:self.qw.shape[0]] = self.qb.data + self.qkvb.data[self.qw.shape[0]:2 * self.qw.shape[0]] = self.kb.data + self.qkvb.data[2 * self.qw.shape[0]:] = self.vb.data + + qkv_data = [self.qw.data, \ + self.qb.data if self.qb is not None else None, \ + self.kw.data, \ + self.kb.data if self.kb is not None else None, \ + self.vw.data, \ + self.vb.data if self.vb is not None else None] + + self.qw.data = self.qkvw.data[:self.qw.shape[0]] + self.kw.data = self.qkvw.data[self.qw.shape[0]:2 * self.qw.shape[0]] + self.vw.data = self.qkvw.data[2 * self.qw.shape[0]:] + + if self.qkvb is not None: + self.qb.data = self.qkvb.data[:self.qw.shape[0]] + self.kb.data = self.qkvb.data[self.qw.shape[0]:2 * self.qw.shape[0]] + self.vb.data = self.qkvb.data[2 * self.qw.shape[0]:] + + for data in qkv_data: + del data + + def set_params_wo_copy(self, Z3_enabled=False): + self.module.mlp.attn_nw = self.attn_nw + self.module.mlp.attn_nb = self.attn_nb + self.module.norm_w = self.input_nw + self.module.norm_b = self.input_nb + self.module.mlp.inter_w = self._h4h_w + self.module.mlp.inter_b = self._h4h_b + self.module.mlp.output_w = self._4hh_w + self.module.mlp.output_b = self._4hh_b + self.module.attention.attn_ow = self.dense_w + self.module.attention.attn_ob = self.dense_b + if not Z3_enabled or self.q_k_v is None: + self.module.attention.attn_qkvw = self.qkvw + self.module.attention.attn_qkvb = self.qkvb + if self.q_k_v is not None: + if Z3_enabled: + self.module.attention.attn_qw = self.qw + self.module.attention.attn_qb = self.qb + self.module.attention.attn_kw = self.kw + self.module.attention.attn_kb = self.kb + self.module.attention.attn_vw = self.vw + self.module.attention.attn_vb = self.vb + else: + self.qw.data = self.qkvw[:self.qw.shape[0], :] + self.kw.data = self.qkvw[self.qw.shape[0]:2 * self.qw.shape[0], :] + self.vw.data = self.qkvw[self.qw.shape[0] * 2:, :] + if self.qkvb is not None: + self.qb.data = self.qkvb[:self.qw.shape[0]] + self.kb.data = self.qkvb[self.qw.shape[0]:2 * self.qw.shape[0]] + self.vb.data = 
self.qkvb[self.qw.shape[0] * 2:] + + def get_lora_params(self): + return self.lora_params + + def get_all_params(self): + if self.q_k_v is not None: + return [ + self.attn_nw, self.attn_nb, self.input_nw, self.input_nb, self._h4h_w, self._h4h_b, self._4hh_w, + self._4hh_b, self.qw, self.qb, self.kw, self.kb, self.vw, self.vb, self.dense_w, self.dense_b + ] + else: + return [ + self.attn_nw, self.attn_nb, self.input_nw, self.input_nb, self._h4h_w, self._h4h_b, self._4hh_w, + self._4hh_b, self.qkvw, self.qkvb, self.dense_w, self.dense_b + ] diff --git a/deepspeed/module_inject/containers/base_moe.py b/deepspeed/module_inject/containers/base_moe.py index 4139b08d90916709db302b4531f913d3350200aa..4be1b849ba70da04b6b08ea011c27bbbcf96b8bf 100644 --- a/deepspeed/module_inject/containers/base_moe.py +++ b/deepspeed/module_inject/containers/base_moe.py @@ -1,4 +1,7 @@ -'''Copyright The Microsoft DeepSpeed Team''' +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team # Create a container object to save model-specific tensors using the policy file above. from .base import * @@ -8,6 +11,7 @@ from deepspeed.accelerator import get_accelerator class BaseTransformerMoEContainer(BaseTransformerContainer): + def __init__(self, **kwargs): # Call the init function of the parent class to initialize the tensors and configs from parent class super().__init__(**kwargs) @@ -16,9 +20,7 @@ class BaseTransformerMoEContainer(BaseTransformerContainer): self.ep_world_size = dist.get_world_size() self.local_ep_size = 1 if self.num_experts < self.ep_world_size else self.num_experts // self.ep_world_size - self.layer_norm_eps = self.config.layer_norm_eps if hasattr( - self.config, - 'layer_norm_eps') else 1e-12, + self.layer_norm_eps = self.config.layer_norm_eps if hasattr(self.config, 'layer_norm_eps') else 1e-12, # MoE models will have a list of mlp related tensors self._h4h_w = [] @@ -102,40 +104,27 @@ class BaseTransformerMoEContainer(BaseTransformerContainer): gpu_index = dist.get_rank() for ep_index in range(self.local_ep_size): # mlp inter - self.module.mlp[ep_index].inter_w.data = self._h4h_w[ - gpu_index * self.local_ep_size + ep_index].to( - get_accelerator().current_device_name()) - self.module.mlp[ep_index].inter_b.data = self._h4h_b[ - gpu_index * self.local_ep_size + ep_index].to( - get_accelerator().current_device_name()) + self.module.mlp[ep_index].inter_w.data = self._h4h_w[gpu_index * self.local_ep_size + ep_index].to( + get_accelerator().current_device_name()) + self.module.mlp[ep_index].inter_b.data = self._h4h_b[gpu_index * self.local_ep_size + ep_index].to( + get_accelerator().current_device_name()) # mlp output - self.module.mlp[ep_index].output_w.data = self._4hh_w[ - gpu_index * self.local_ep_size + ep_index].to( - get_accelerator().current_device_name()) - self.module.mlp[ep_index].output_b.data = self._4hh_b[ - gpu_index * self.local_ep_size + ep_index].to( - get_accelerator().current_device_name()) + self.module.mlp[ep_index].output_w.data = self._4hh_w[gpu_index * self.local_ep_size + ep_index].to( + get_accelerator().current_device_name()) + self.module.mlp[ep_index].output_b.data = self._4hh_b[gpu_index * self.local_ep_size + ep_index].to( + get_accelerator().current_device_name()) def copy_data_to_new_module(self): - self.module.attn_nw.data = self.attn_nw.to( - get_accelerator().current_device_name()) - self.module.attn_nb.data = self.attn_nb.to( - get_accelerator().current_device_name()) + self.module.attn_nw.data = 
self.attn_nw.to(get_accelerator().current_device_name()) + self.module.attn_nb.data = self.attn_nb.to(get_accelerator().current_device_name()) - self.module.norm_w.data.copy_( - self.input_nw.to(get_accelerator().current_device_name())) - self.module.norm_b.data.copy_( - self.input_nb.to(get_accelerator().current_device_name())) + self.module.norm_w.data.copy_(self.input_nw.to(get_accelerator().current_device_name())) + self.module.norm_b.data.copy_(self.input_nb.to(get_accelerator().current_device_name())) if self.config.moe.type == 'residual': - self.module.res_mlp.inter_w.data = self._res_h4h_w.to( - get_accelerator().current_device_name()) - self.module.res_mlp.inter_b.data = self._res_h4h_b.to( - get_accelerator().current_device_name()) - self.module.res_mlp.output_w.data = self._res_4hh_w.to( - get_accelerator().current_device_name()) - self.module.res_mlp.output_b.data = self._res_4hh_b.to( - get_accelerator().current_device_name()) - self.module.res_coef.data = self._res_coef.to( - get_accelerator().current_device_name()) + self.module.res_mlp.inter_w.data = self._res_h4h_w.to(get_accelerator().current_device_name()) + self.module.res_mlp.inter_b.data = self._res_h4h_b.to(get_accelerator().current_device_name()) + self.module.res_mlp.output_w.data = self._res_4hh_w.to(get_accelerator().current_device_name()) + self.module.res_mlp.output_b.data = self._res_4hh_b.to(get_accelerator().current_device_name()) + self.module.res_coef.data = self._res_coef.to(get_accelerator().current_device_name()) diff --git a/deepspeed/module_inject/containers/bert.py b/deepspeed/module_inject/containers/bert.py index 95d8b485a9d602a41a61088aa79b50e991ea888e..f8070655283e1fee939d11161a24e46da4c45d40 100644 --- a/deepspeed/module_inject/containers/bert.py +++ b/deepspeed/module_inject/containers/bert.py @@ -1,4 +1,7 @@ -'''Copyright The Microsoft DeepSpeed Team''' +# Copyright (c) Microsoft Corporation. 
+# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team from .base import * from deepspeed.model_implementations.transformers.ds_bert import DeepSpeedBERTInference @@ -8,6 +11,7 @@ from ..policy import TransformerPolicy class DS_BERTContainer(BaseTransformerContainer): + def __init__(self, **kwargs): super().__init__(**kwargs) @@ -23,6 +27,7 @@ class DS_BERTContainer(BaseTransformerContainer): class HFBertLayerPolicy(TransformerPolicy): + def __init__(self, client_module, inference=False): super().__init__(inference, pre_attn_norm=False) self.client_module = client_module @@ -39,10 +44,18 @@ class HFBertLayerPolicy(TransformerPolicy): HFBertLayerPolicy._orig_layer_class = None def get_hidden_heads(self): + if self.pre_attn_norm: + attention_layernorm = self.client_module.PostAttentionLayerNorm + else: + attention_layernorm = self.client_module.attention.output.LayerNorm return self.client_module.attention.self.query.weight.shape[1], \ - self.client_module.attention.self.num_attention_heads + self.client_module.attention.self.num_attention_heads, \ + attention_layernorm.eps - def attention(self): + def get_q_k_v(self): + return None + + def attention(self, enable_training=False): qw = self.client_module.attention.self.query.weight qb = self.client_module.attention.self.query.bias kw = self.client_module.attention.self.key.weight @@ -50,8 +63,8 @@ class HFBertLayerPolicy(TransformerPolicy): vw = self.client_module.attention.self.value.weight vb = self.client_module.attention.self.value.bias - qkvw = Parameter(torch.cat((qw, kw, vw), dim=0), requires_grad=False) - qkvb = Parameter(torch.cat((qb, kb, vb), dim=0), requires_grad=False) + qkvw = Parameter(torch.cat((qw, kw, vw), dim=0), requires_grad=enable_training) + qkvb = Parameter(torch.cat((qb, kb, vb), dim=0), requires_grad=enable_training) return qkvw, \ qkvb, \ @@ -79,3 +92,6 @@ class HFBertLayerPolicy(TransformerPolicy): attention_layernorm.bias, \ transformer_layernorm.weight, \ transformer_layernorm.bias + + def get_lora_params(self): + return [] diff --git a/deepspeed/module_inject/containers/bloom.py b/deepspeed/module_inject/containers/bloom.py index eedf85144fddb7ce59d5a23f69dda1b65ade9fb9..136971583cf7a2189a542b60382e361bf4650b7a 100644 --- a/deepspeed/module_inject/containers/bloom.py +++ b/deepspeed/module_inject/containers/bloom.py @@ -1,4 +1,7 @@ -'''Copyright The Microsoft DeepSpeed Team''' +# Copyright (c) Microsoft Corporation. 
+# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team from .base import * from .features.meta_tensor import MetaTensorContainer @@ -7,10 +10,13 @@ from ..policy import TransformerPolicy from ..policy import transformer_param_names from ..policy import maybe_copy +from ..policy import maybe_get_lora + supported_models = {None} class DS_BloomContainer(MetaTensorContainer, BaseTransformerContainer): + def __init__(self, **kwargs): super().__init__(**kwargs) @@ -24,13 +30,9 @@ class DS_BloomContainer(MetaTensorContainer, BaseTransformerContainer): self.module.config.scale_attention = self.scale_attention return self.module - def attention_qkv_mp(self, mp_replace): - self.module.attention.attn_qkvw = mp_replace.copy( - self.module.attention.attn_qkvw, - self.qkvw) - self.module.attention.attn_qkvb = mp_replace.copy( - self.module.attention.attn_qkvb, - self.qkvb) + def attention_qkv_mp(self, mp_replace, reversed_dim=False): + self.module.attention.attn_qkvw = mp_replace.copy(self.module.attention.attn_qkvw, self.qkvw) + self.module.attention.attn_qkvb = mp_replace.copy(self.module.attention.attn_qkvb, self.qkvb) def load_params(self, module, sd, weight_quantizer, mp_replace, prefix): param_names = ( @@ -58,58 +60,39 @@ class DS_BloomContainer(MetaTensorContainer, BaseTransformerContainer): megatron_v2=self.policy.is_megatron_v2, split_qkv=self.policy.split_qkv) for i in range(2, 4): - maybe_copy(module.attention, - sd, - weight_quantizer, - mp_replace, - transformer_param_names[i], + maybe_copy(module.attention, sd, weight_quantizer, mp_replace, transformer_param_names[i], prefix + param_names[i]) for i in range(4, 10): - maybe_copy(module.mlp, - sd, - weight_quantizer, - mp_replace, - transformer_param_names[i], + maybe_copy(module.mlp, sd, weight_quantizer, mp_replace, transformer_param_names[i], prefix + param_names[i]) for i in range(10, 12): - maybe_copy(module, - sd, - weight_quantizer, - mp_replace, - transformer_param_names[i], - prefix + param_names[i]) + maybe_copy(module, sd, weight_quantizer, mp_replace, transformer_param_names[i], prefix + param_names[i]) class BLOOMLayerPolicy(TransformerPolicy): _orig_layer_class = None - def __init__(self, - client_module, - inference=True, - use_load_prefix=True, - split_qkv=False): - super().__init__(inference, - linear_layer=True, - use_load_prefix=use_load_prefix, - split_qkv=split_qkv) + def __init__(self, client_module, inference=True, use_load_prefix=True, split_qkv=False): + super().__init__(inference, linear_layer=True, use_load_prefix=use_load_prefix, split_qkv=split_qkv) self.client_module = client_module try: import transformers BLOOMLayerPolicy._orig_layer_class = transformers.models.bloom.modeling_bloom.BloomBlock global supported_models - supported_models.update( - {transformers.models.bloom.modeling_bloom.BloomModel}) + supported_models.update({transformers.models.bloom.modeling_bloom.BloomModel}) except Exception as e: - print( - f"WARNING! Setting BLOOMLayerPolicy._orig_layer_class to None due to Exception: {e}" - ) + print(f"WARNING! 
Setting BLOOMLayerPolicy._orig_layer_class to None due to Exception: {e}") BLOOMLayerPolicy._orig_layer_class = None def get_hidden_heads(self): return self.client_module.self_attention.hidden_size, \ - self.client_module.self_attention.num_heads + self.client_module.self_attention.num_heads, \ + self.client_module.input_layernorm.eps - def attention(self): + def get_q_k_v(self): + return None + + def attention(self, enable_training=False): return self.client_module.self_attention.query_key_value.weight, \ self.client_module.self_attention.query_key_value.bias, \ self.client_module.self_attention.dense.weight, \ @@ -126,3 +109,14 @@ class BLOOMLayerPolicy(TransformerPolicy): self.client_module.post_attention_layernorm.bias, \ self.client_module.input_layernorm.weight, \ self.client_module.input_layernorm.bias + + def get_lora_params(self): + all_lora_params = [] + for p in [ + self.client_module.mlp.dense_h_to_4h, \ + self.client_module.mlp.dense_4h_to_h, \ + self.client_module.self_attention.query_key_value, \ + self.client_module.self_attention.dense + ]: + all_lora_params.append(maybe_get_lora(p)) + return all_lora_params diff --git a/deepspeed/module_inject/containers/clip.py b/deepspeed/module_inject/containers/clip.py index 8e6975091c034804175facd9faeb3cc757d85f7b..144f1b823a1a14aea9fb67bee4c1c6e1f02f359b 100644 --- a/deepspeed/module_inject/containers/clip.py +++ b/deepspeed/module_inject/containers/clip.py @@ -1,4 +1,7 @@ -'''Copyright The Microsoft DeepSpeed Team''' +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team from .base import * from deepspeed.model_implementations.transformers.ds_gpt import DeepSpeedGPTInference @@ -8,6 +11,7 @@ from ..policy import TransformerPolicy class DS_CLIPContainer(BaseTransformerContainer): + def __init__(self, **kwargs): super().__init__(**kwargs) @@ -21,6 +25,7 @@ class DS_CLIPContainer(BaseTransformerContainer): class HFCLIPLayerPolicy(TransformerPolicy): + def __init__(self, client_module, inference=False): super().__init__(inference, pre_attn_norm=True, scale_attention=True) self.client_module = client_module @@ -35,7 +40,11 @@ class HFCLIPLayerPolicy(TransformerPolicy): def get_hidden_heads(self): return self.client_module.self_attn.q_proj.weight.shape[1], \ - self.client_module.self_attn.num_heads + self.client_module.self_attn.num_heads, \ + self.client_module.layer_norm1.eps + + def get_q_k_v(self): + return None def attention(self): qw = self.client_module.self_attn.q_proj.weight @@ -64,3 +73,6 @@ class HFCLIPLayerPolicy(TransformerPolicy): self.client_module.layer_norm2.bias, \ self.client_module.layer_norm1.weight, \ self.client_module.layer_norm1.bias + + def get_lora_params(self): + return [] diff --git a/deepspeed/module_inject/containers/distil_bert.py b/deepspeed/module_inject/containers/distil_bert.py index 71f46dc8ff12f99d3f9666ed93b6c6e0aa9abe78..792b965399e20b2134cedd3dd37747c0894024c9 100644 --- a/deepspeed/module_inject/containers/distil_bert.py +++ b/deepspeed/module_inject/containers/distil_bert.py @@ -1,4 +1,7 @@ -'''Copyright The Microsoft DeepSpeed Team''' +# Copyright (c) Microsoft Corporation. 
+# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team from .base import * from deepspeed.model_implementations.transformers.ds_bert import DeepSpeedBERTInference @@ -8,6 +11,7 @@ from ..policy import TransformerPolicy class DS_DistilBERTContainer(BaseTransformerContainer): + def __init__(self, **kwargs): super().__init__(**kwargs) @@ -41,9 +45,13 @@ class HFDistilBertLayerPolicy(TransformerPolicy): def get_hidden_heads(self): return self.client_module.attention.q_lin.weight.shape[1], \ - self.client_module.attention.n_heads + self.client_module.attention.n_heads, \ + self.client_module.sa_layer_norm.eps - def attention(self): + def get_q_k_v(self): + return None + + def attention(self, enable_training=False): qw = self.client_module.attention.q_lin.weight qb = self.client_module.attention.q_lin.bias kw = self.client_module.attention.k_lin.weight @@ -51,8 +59,8 @@ class HFDistilBertLayerPolicy(TransformerPolicy): vw = self.client_module.attention.v_lin.weight vb = self.client_module.attention.v_lin.bias - qkvw = Parameter(torch.cat((qw, kw, vw), dim=0)) - qkvb = Parameter(torch.cat((qb, kb, vb), dim=0)) + qkvw = Parameter(torch.cat((qw, kw, vw), dim=0), requires_grad=enable_training) + qkvb = Parameter(torch.cat((qb, kb, vb), dim=0), requires_grad=enable_training) return qkvw, \ qkvb, \ @@ -73,3 +81,6 @@ class HFDistilBertLayerPolicy(TransformerPolicy): attention_layernorm.bias, \ transformer_layernorm.weight, \ transformer_layernorm.bias + + def get_lora_params(self): + return [] diff --git a/deepspeed/module_inject/containers/features/__init__.py b/deepspeed/module_inject/containers/features/__init__.py index 0bd29647d48a3032c6af9304ed3dac2d72eaa376..9bf65591925d488bfa92bf60c2cc82c20ec8942f 100644 --- a/deepspeed/module_inject/containers/features/__init__.py +++ b/deepspeed/module_inject/containers/features/__init__.py @@ -1,4 +1,7 @@ -'''Copyright The Microsoft DeepSpeed Team''' +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team from .megatron import MegatronContainer from .meta_tensor import MetaTensorContainer diff --git a/deepspeed/module_inject/containers/features/megatron.py b/deepspeed/module_inject/containers/features/megatron.py index 45a013be913e5bc44d4a14fe3f35462c6eab49c5..cce106fa7e9acdda8c94e7546faf5d30c7815419 100644 --- a/deepspeed/module_inject/containers/features/megatron.py +++ b/deepspeed/module_inject/containers/features/megatron.py @@ -1,37 +1,82 @@ -'''Copyright The Microsoft DeepSpeed Team''' +# Copyright (c) Microsoft Corporation. 
+# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team import torch from abc import ABC class MegatronContainer(ABC): + def __init__(self, **kwargs): super().__init__(**kwargs) self.megatron_v2 = self.policy.is_megatron_v2 - def transpose_qkv_alignment(self, x): + def _align_qkv_transposed(self, x): attention_head_size = x.shape[-1] // self.num_attention_heads new_x_shape = x.size()[:-1] + (self.num_attention_heads, attention_head_size) x_1 = x.view(*new_x_shape) (q, k, v) = torch.split(x_1, (x_1.shape[-1] // 3), dim=(x_1.dim() - 1)) if len(q.shape) > 2: - return torch.cat((q.reshape(q.shape[0], - -1), - k.reshape(q.shape[0], - -1), - v.reshape(q.shape[0], - -1)), + return torch.cat((q.reshape(q.shape[0], -1), k.reshape(q.shape[0], -1), v.reshape(q.shape[0], -1)), dim=-1).reshape(x.shape) else: - return torch.cat((q.reshape(-1), - k.reshape(-1), - v.reshape(-1)), - dim=-1).reshape(x.shape) + return torch.cat((q.reshape(-1), k.reshape(-1), v.reshape(-1)), dim=-1).reshape(x.shape) + + def _align_qkv(self, x): + attention_head_size = x.shape[0] // self.num_attention_heads + new_x_shape = (self.num_attention_heads, attention_head_size) + x.size()[1:] + x_1 = x.view(*new_x_shape) + div_dim = len(x_1.size()) - 2 if len(x.shape) == 2 else -1 + (q, k, v) = torch.split(x_1, (x_1.shape[div_dim] // 3), dim=div_dim) + if len(q.shape) > 2: + x.data.copy_( + torch.cat((q.reshape(-1, q.shape[-1]), k.reshape(-1, q.shape[-1]), v.reshape(-1, q.shape[-1])), + dim=0).reshape(x.shape)) + else: + x.data.copy_(torch.cat((q.reshape(-1), k.reshape(-1), v.reshape(-1)), dim=-1).reshape(x.shape)) + + def _align_merged_qkv(self): + if hasattr(self.qkvw, 'ds_id'): + from deepspeed.runtime.zero import GatheredParameters + from deepspeed.runtime.zero.partition_parameters import ZeroParamStatus + param_list = [self.qkvw, self.qkvb] + non_active_params = [param for param in param_list if (hasattr(param, 'ds_id') and \ + param.ds_status == ZeroParamStatus.NOT_AVAILABLE)] + with GatheredParameters(non_active_params): + self._align_qkv(self.qkvw) + self._align_qkv(self.qkvb) + else: + self._align_qkv(self.qkvw) + self._align_qkv(self.qkvb) + + def _partition_qkv(self, x): + q_k_v = torch.split(x, (x.shape[0] // 3), dim=0) + attention_head_size = q_k_v[0].shape[0] // self.num_attention_heads + new_x_shape = (self.num_attention_heads, attention_head_size) + x.size()[1:] + q, k, v = [data.view(*new_x_shape) for data in q_k_v] + if len(q.shape) > 2: + x.data.copy_(torch.cat((q, k, v), dim=-2).reshape(-1, q.shape[-1])) + else: + x.data.copy_(torch.cat((q, k, v), dim=-1).reshape(-1)) + + def _partition_merged_qkv(self): + if hasattr(self.qkvw, 'ds_id'): + from deepspeed.runtime.zero import GatheredParameters + from deepspeed.runtime.zero.partition_parameters import ZeroParamStatus + param_list = [self.qkvw, self.qkvb] + non_active_params = [param for param in param_list if (hasattr(param, 'ds_id') and \ + param.ds_status == ZeroParamStatus.NOT_AVAILABLE)] + with GatheredParameters(non_active_params): + self._partition_qkv(self.qkvw) + self._partition_qkv(self.qkvb) + else: + self._partition_qkv(self.qkvw) + self._partition_qkv(self.qkvb) def transpose(self): super().transpose() if self.megatron_v2: - self.qkvw = torch.nn.parameter.Parameter( - self.transpose_qkv_alignment(self.qkvw).contiguous()) - self.qkvb = torch.nn.parameter.Parameter( - self.transpose_qkv_alignment(self.qkvb).contiguous()) + self.qkvw = torch.nn.parameter.Parameter(self._align_qkv_transposed(self.qkvw).contiguous()) + self.qkvb = 
torch.nn.parameter.Parameter(self._align_qkv_transposed(self.qkvb).contiguous()) diff --git a/deepspeed/module_inject/containers/features/meta_tensor.py b/deepspeed/module_inject/containers/features/meta_tensor.py index 5b63c5cf5fbddec829027ca349006835b150581e..7aa507ca2e444ad5e5ac9145ff8c705ba9b7c64f 100644 --- a/deepspeed/module_inject/containers/features/meta_tensor.py +++ b/deepspeed/module_inject/containers/features/meta_tensor.py @@ -1,26 +1,30 @@ -'''Copyright The Microsoft DeepSpeed Team''' +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team from abc import ABC, abstractmethod class MetaTensorContainer(ABC): + def __init__(self, **kwargs): super().__init__(**kwargs) self.is_meta = False self.ckpt_load_enabled = True - def initialize_tensors(self): - super().initialize_tensors() + def initialize_tensors(self, enable_training=False): + super().initialize_tensors(enable_training=enable_training) self.is_meta = self.qkvw.is_meta - def apply_tensor_parallelism(self, mp_replace): + def apply_tensor_parallelism(self, mp_replace=None, mp_group=None, tp_size=None): if self.is_meta: if self.qkvb is None: self.module.attention.attn_qkvb = None if self.dense_b is None: self.module.attention.attn_ob = None else: - super().apply_tensor_parallelism(mp_replace) + super().apply_tensor_parallelism(mp_replace, mp_group, tp_size) def copy_data_to_new_module(self): if self.is_meta: @@ -53,6 +57,5 @@ class MetaTensorContainer(ABC): of q, k, and v are stored together and needs to split in the DeepSpeed-Inference API. """ - raise NotImplementedError( - "A load_params() function must be defined in the model container \ + raise NotImplementedError("A load_params() function must be defined in the model container \ when inheriting the MetaTensorContainer feature") diff --git a/deepspeed/module_inject/containers/gpt2.py b/deepspeed/module_inject/containers/gpt2.py index dc194d71d45997b2c0b7f8f7966611f67e5ea4e6..3f6373897c58ebccd918211b7bdc9b875e324677 100644 --- a/deepspeed/module_inject/containers/gpt2.py +++ b/deepspeed/module_inject/containers/gpt2.py @@ -1,4 +1,7 @@ -'''Copyright The Microsoft DeepSpeed Team''' +# Copyright (c) Microsoft Corporation. 
+# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team from .base import * from deepspeed.model_implementations.transformers.ds_gpt import DeepSpeedGPTInference @@ -6,6 +9,7 @@ from ..policy import TransformerPolicy class DS_GPT2Container(BaseTransformerContainer): + def __init__(self, **kwargs): super().__init__(**kwargs) @@ -33,9 +37,13 @@ class HFGPT2LayerPolicy(TransformerPolicy): def get_hidden_heads(self): return self.client_module.attn.embed_dim, \ - self.client_module.attn.num_heads + self.client_module.attn.num_heads, \ + self.client_module.ln_1.eps - def attention(self): + def get_q_k_v(self): + return None + + def attention(self, enable_training=False): return self.client_module.attn.c_attn.weight, \ self.client_module.attn.c_attn.bias, \ self.client_module.attn.c_proj.weight, \ @@ -52,3 +60,6 @@ class HFGPT2LayerPolicy(TransformerPolicy): self.client_module.ln_2.bias, \ self.client_module.ln_1.weight, \ self.client_module.ln_1.bias + + def get_lora_params(self): + return [] diff --git a/deepspeed/module_inject/containers/gptj.py b/deepspeed/module_inject/containers/gptj.py index 35472c1007080f50c0fd7bdecf814b9ccca30de9..6929cd2f71467c4d8656e70e0785c952eb555f88 100644 --- a/deepspeed/module_inject/containers/gptj.py +++ b/deepspeed/module_inject/containers/gptj.py @@ -1,4 +1,7 @@ -'''Copyright The Microsoft DeepSpeed Team''' +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team from .base import * from .features.meta_tensor import MetaTensorContainer @@ -10,8 +13,11 @@ from ..policy import transformer_param_names from ..policy import maybe_copy from ..policy import maybe_copy_qkv +from ..policy import maybe_get_lora + class DS_GPTJContainer(MetaTensorContainer, BaseTransformerContainer): + def __init__(self, **kwargs): super().__init__(**kwargs) @@ -36,36 +42,20 @@ class DS_GPTJContainer(MetaTensorContainer, BaseTransformerContainer): 'ln_1.weight', \ 'ln_1.bias' ) - maybe_copy_qkv( - module.attention, - sd, - weight_quantizer, - mp_replace, - 'attn_qkvw', - [prefix + param_names[0], - prefix + param_names[1], - prefix + param_names[2]], - split_qkv=self.policy.split_qkv) - for i in range(3, 4): - maybe_copy(module.attention, + maybe_copy_qkv(module.attention, sd, weight_quantizer, mp_replace, - transformer_param_names[i - 1], + 'attn_qkvw', [prefix + param_names[0], prefix + param_names[1], prefix + param_names[2]], + split_qkv=self.policy.split_qkv) + for i in range(3, 4): + maybe_copy(module.attention, sd, weight_quantizer, mp_replace, transformer_param_names[i - 1], prefix + param_names[i]) for i in range(4, 8): - maybe_copy(module.mlp, - sd, - weight_quantizer, - mp_replace, - transformer_param_names[i], + maybe_copy(module.mlp, sd, weight_quantizer, mp_replace, transformer_param_names[i], prefix + param_names[i]) for i in range(8, 10): - maybe_copy(module, - sd, - weight_quantizer, - mp_replace, - transformer_param_names[i + 2], + maybe_copy(module, sd, weight_quantizer, mp_replace, transformer_param_names[i + 2], prefix + param_names[i]) @@ -82,15 +72,24 @@ class HFGPTJLayerPolicy(TransformerPolicy): HFGPTJLayerPolicy._orig_layer_class = None def get_hidden_heads(self): - return self.client_module.attn.q_proj.weight.shape[1], \ - self.client_module.attn.num_attention_heads + return self.client_module.attn.embed_dim, \ + self.client_module.attn.num_attention_heads, \ + self.client_module.ln_1.eps - def attention(self): + def get_q_k_v(self): + return self.client_module.attn.q_proj.weight, \ + None, \ + 
self.client_module.attn.k_proj.weight, \ + None, \ + self.client_module.attn.v_proj.weight, \ + None + + def attention(self, enable_training=False): qw = self.client_module.attn.q_proj.weight kw = self.client_module.attn.k_proj.weight vw = self.client_module.attn.v_proj.weight - qkvw = Parameter(torch.cat((qw, kw, vw), dim=0), requires_grad=False) + qkvw = Parameter(torch.cat((qw, kw, vw), dim=0), requires_grad=enable_training) return qkvw, \ None, \ @@ -108,3 +107,16 @@ class HFGPTJLayerPolicy(TransformerPolicy): None, \ self.client_module.ln_1.weight, \ self.client_module.ln_1.bias + + def get_lora_params(self): + all_lora_params = [] + for p in [ + self.client_module.mlp.fc_in, \ + self.client_module.mlp.fc_out, \ + self.client_module.attn.q_proj, \ + self.client_module.attn.k_proj, \ + self.client_module.attn.v_proj, \ + self.client_module.attn.out_proj, \ + ]: + all_lora_params.append(maybe_get_lora(p)) + return all_lora_params diff --git a/deepspeed/module_inject/containers/gptneo.py b/deepspeed/module_inject/containers/gptneo.py index a8f206f516dc69dc6414daa9699a157718e98345..146383733ccce845c5f996d8d00758a023195b9b 100644 --- a/deepspeed/module_inject/containers/gptneo.py +++ b/deepspeed/module_inject/containers/gptneo.py @@ -1,4 +1,7 @@ -'''Copyright The Microsoft DeepSpeed Team''' +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team from .base import * from .features.meta_tensor import MetaTensorContainer @@ -10,8 +13,11 @@ from ..policy import transformer_param_names from ..policy import maybe_copy from ..policy import maybe_copy_qkv +from ..policy import maybe_get_lora + class DS_GPTNEOContainer(MetaTensorContainer, BaseTransformerContainer): + def __init__(self, **kwargs): super().__init__(**kwargs) @@ -39,40 +45,25 @@ class DS_GPTNEOContainer(MetaTensorContainer, BaseTransformerContainer): 'ln_1.weight', \ 'ln_1.bias' ) - maybe_copy_qkv( - module.attention, - sd, - weight_quantizer, - mp_replace, - 'attn_qkvw', - [prefix + param_names[0], - prefix + param_names[1], - prefix + param_names[2]], - split_qkv=self.policy.split_qkv) - for i in range(3, 5): - maybe_copy(module.attention, + maybe_copy_qkv(module.attention, sd, weight_quantizer, mp_replace, - transformer_param_names[i - 1], + 'attn_qkvw', [prefix + param_names[0], prefix + param_names[1], prefix + param_names[2]], + split_qkv=self.policy.split_qkv) + for i in range(3, 5): + maybe_copy(module.attention, sd, weight_quantizer, mp_replace, transformer_param_names[i - 1], prefix + param_names[i]) for i in range(5, 11): - maybe_copy(module.mlp, - sd, - weight_quantizer, - mp_replace, - transformer_param_names[i - 1], + maybe_copy(module.mlp, sd, weight_quantizer, mp_replace, transformer_param_names[i - 1], prefix + param_names[i]) for i in range(11, 13): - maybe_copy(module, - sd, - weight_quantizer, - mp_replace, - transformer_param_names[i - 1], + maybe_copy(module, sd, weight_quantizer, mp_replace, transformer_param_names[i - 1], prefix + param_names[i]) class HFGPTNEOLayerPolicy(TransformerPolicy): + def __init__(self, client_module, inference=True): super().__init__(inference, scale_attention=False) self.client_module = client_module @@ -83,15 +74,24 @@ class HFGPTNEOLayerPolicy(TransformerPolicy): HFGPTNEOLayerPolicy._orig_layer_class = None def get_hidden_heads(self): - return self.client_module.attn.attention.q_proj.weight.shape[1], \ - self.client_module.attn.attention.num_heads + return self.client_module.attn.attention.embed_dim, \ + 
self.client_module.attn.attention.num_heads, \ + self.client_module.ln_1.eps - def attention(self): + def get_q_k_v(self): + return self.client_module.attn.attention.q_proj.weight, \ + None, \ + self.client_module.attn.attention.k_proj.weight, \ + None, \ + self.client_module.attn.attention.v_proj.weight, \ + None + + def attention(self, enable_training=False): qw = self.client_module.attn.attention.q_proj.weight kw = self.client_module.attn.attention.k_proj.weight vw = self.client_module.attn.attention.v_proj.weight - qkvw = Parameter(torch.cat((qw, kw, vw), dim=0), requires_grad=False) + qkvw = Parameter(torch.cat((qw, kw, vw), dim=0), requires_grad=enable_training) return qkvw, \ None, \ @@ -109,3 +109,16 @@ class HFGPTNEOLayerPolicy(TransformerPolicy): self.client_module.ln_2.bias, \ self.client_module.ln_1.weight, \ self.client_module.ln_1.bias + + def get_lora_params(self): + all_lora_params = [] + for p in [ + self.client_module.mlp.c_fc, \ + self.client_module.mlp.c_proj, \ + self.client_module.attn.attention.q_proj, \ + self.client_module.attn.attention.k_proj, \ + self.client_module.attn.attention.v_proj, \ + self.client_module.attn.attention.out_proj, \ + ]: + all_lora_params.append(maybe_get_lora(p)) + return all_lora_params diff --git a/deepspeed/module_inject/containers/gptneox.py b/deepspeed/module_inject/containers/gptneox.py index ebf2db0251cca1ded5adf30bbee4ee1a04787313..249e57d2d9c282271f1cacf00780faee33112433 100644 --- a/deepspeed/module_inject/containers/gptneox.py +++ b/deepspeed/module_inject/containers/gptneox.py @@ -1,4 +1,7 @@ -'''Copyright The Microsoft DeepSpeed Team''' +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team from .base import * from .features.meta_tensor import MetaTensorContainer @@ -10,10 +13,11 @@ from ..policy import transformer_param_names from ..policy import maybe_copy from packaging import version as pkg_version +from ..policy import maybe_get_lora + + +class DS_GPTNEOXContainer(MetaTensorContainer, MegatronContainer, BaseTransformerContainer): -class DS_GPTNEOXContainer(MetaTensorContainer, - MegatronContainer, - BaseTransformerContainer): def __init__(self, **kwargs): super().__init__(**kwargs) @@ -57,26 +61,13 @@ class DS_GPTNEOXContainer(MetaTensorContainer, split_qkv=self.policy.split_qkv, heads=self.policy.client_module.attention.num_attention_heads) for i in range(2, 4): - maybe_copy(module.attention, - sd, - weight_quantizer, - mp_replace, - transformer_param_names[i], + maybe_copy(module.attention, sd, weight_quantizer, mp_replace, transformer_param_names[i], prefix + param_names[i]) for i in range(4, 10): - maybe_copy(module.mlp, - sd, - weight_quantizer, - mp_replace, - transformer_param_names[i], + maybe_copy(module.mlp, sd, weight_quantizer, mp_replace, transformer_param_names[i], prefix + param_names[i]) for i in range(10, 12): - maybe_copy(module, - sd, - weight_quantizer, - mp_replace, - transformer_param_names[i], - prefix + param_names[i]) + maybe_copy(module, sd, weight_quantizer, mp_replace, transformer_param_names[i], prefix + param_names[i]) class GPTNEOXLayerPolicy(TransformerPolicy): @@ -102,10 +93,14 @@ class GPTNEOXLayerPolicy(TransformerPolicy): else: attention = self.client_module.self_attention - return self.client_module.attention.query_key_value.weight.shape[1], \ - self.client_module.attention.num_attention_heads + return self.client_module.attention.hidden_size, \ + self.client_module.attention.num_attention_heads, \ + self.client_module.input_layernorm.eps + + 
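
Editor's note: the policies touched in this patch now return a third element, the layernorm epsilon, from get_hidden_heads(), and the base container's set_hidden_heads() consumes it. Below is a minimal, hedged sketch of that contract only; ExamplePolicy, ExampleContainer, and the SimpleNamespace stand-in module are illustrative and not classes from this PR.

from types import SimpleNamespace


class ExamplePolicy:
    def __init__(self, client_module):
        self.client_module = client_module

    def get_hidden_heads(self):
        # After this change the tuple also carries the layernorm epsilon.
        return (self.client_module.attn.embed_dim,
                self.client_module.attn.num_heads,
                self.client_module.ln_1.eps)


class ExampleContainer:
    def set_hidden_heads(self, hidden_size, num_attention_heads, epsilon):
        self.hidden_size = hidden_size
        self.num_attention_heads = num_attention_heads
        self.layernorm_epsilon = epsilon


# Illustrative wiring: the container unpacks the policy's 3-tuple directly.
fake_layer = SimpleNamespace(attn=SimpleNamespace(embed_dim=768, num_heads=12),
                             ln_1=SimpleNamespace(eps=1e-5))
container = ExampleContainer()
container.set_hidden_heads(*ExamplePolicy(fake_layer).get_hidden_heads())
assert container.layernorm_epsilon == 1e-5
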
def get_q_k_v(self): + return None - def attention(self): + def attention(self, enable_training=False): if GPTNEOXLayerPolicy.version == 0: attention = self.client_module.attention else: @@ -127,3 +122,19 @@ class GPTNEOXLayerPolicy(TransformerPolicy): self.client_module.post_attention_layernorm.bias, \ self.client_module.input_layernorm.weight, \ self.client_module.input_layernorm.bias + + def get_lora_params(self): + if GPTNEOXLayerPolicy.version == 0: + attention = self.client_module.attention + else: + attention = self.client_module.self_attention + + all_lora_params = [] + for p in [ + self.client_module.mlp.dense_h_to_4h, \ + self.client_module.mlp.dense_4h_to_h, \ + attention.query_key_value, \ + attention.dense + ]: + all_lora_params.append(maybe_get_lora(p)) + return all_lora_params diff --git a/deepspeed/module_inject/containers/megatron_gpt.py b/deepspeed/module_inject/containers/megatron_gpt.py index 7a8db9108f38c42d1929886108838f9b248a4b22..28b7df5b48d5ed5d46fe981f3ab9acffcd0394f5 100644 --- a/deepspeed/module_inject/containers/megatron_gpt.py +++ b/deepspeed/module_inject/containers/megatron_gpt.py @@ -1,4 +1,7 @@ -'''Copyright The Microsoft DeepSpeed Team''' +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team from .base import * from .features.megatron import MegatronContainer @@ -9,6 +12,7 @@ from packaging import version as pkg_version class DS_MegatronGPTContainer(MegatronContainer, BaseTransformerContainer): + def __init__(self, **kwargs): super().__init__(**kwargs) @@ -36,9 +40,7 @@ class MegatronLayerPolicy(TransformerPolicy): use_mup = False def __init__(self, client_module, inference=True): - super().__init__(inference, - megatron_v2=MegatronLayerPolicy.megatron_v2, - use_mup=MegatronLayerPolicy.use_mup) + super().__init__(inference, megatron_v2=MegatronLayerPolicy.megatron_v2, use_mup=MegatronLayerPolicy.use_mup) self.client_module = client_module # we use megatron version to differentiate between the old and new # megatron-lm source code @@ -54,9 +56,13 @@ class MegatronLayerPolicy(TransformerPolicy): def get_hidden_heads(self): return self.client_module.attention.query_key_value.weight.shape[1], \ - self.client_module.attention.num_attention_heads + self.client_module.attention.num_attention_heads, \ + self.client_module.input_layernorm.eps - def attention(self): + def get_q_k_v(self): + return None + + def attention(self, enable_training=False): if self.inference: if MegatronLayerPolicy.version == 0: attention = self.client_module.attention @@ -104,3 +110,6 @@ class MegatronLayerPolicy(TransformerPolicy): self.client_module.post_attention_layernorm.bias, \ self.client_module.input_layernorm.weight, \ self.client_module.input_layernorm.bias + + def get_lora_params(self): + return [] diff --git a/deepspeed/module_inject/containers/megatron_gpt_moe.py b/deepspeed/module_inject/containers/megatron_gpt_moe.py index 2968161305c453a2611bc91e2a7b4dbbad27aa91..0d5248d8d4d45b5336c9dcb573a1aa0c41dd95e4 100644 --- a/deepspeed/module_inject/containers/megatron_gpt_moe.py +++ b/deepspeed/module_inject/containers/megatron_gpt_moe.py @@ -1,4 +1,7 @@ -'''Copyright The Microsoft DeepSpeed Team''' +# Copyright (c) Microsoft Corporation. 
+# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team from .base import * from .base_moe import * @@ -10,6 +13,7 @@ from packaging import version as pkg_version class DS_MegatronGPTMoEContainer(MegatronContainer, BaseTransformerMoEContainer): + def __init__(self, policy, config, model_config, layer_id): super().__init__(policy, config, model_config, layer_id) diff --git a/deepspeed/module_inject/containers/opt.py b/deepspeed/module_inject/containers/opt.py index 8f9c30bcac8c60beca8dffb388f947a55774e2fe..ff0cad57aa5cf8f118a152b12dd371bcb387e99f 100644 --- a/deepspeed/module_inject/containers/opt.py +++ b/deepspeed/module_inject/containers/opt.py @@ -1,4 +1,7 @@ -'''Copyright The Microsoft DeepSpeed Team''' +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team from .base import * from .features.meta_tensor import MetaTensorContainer @@ -9,10 +12,12 @@ from ..policy import TransformerPolicy from ..policy import transformer_param_names from ..policy import maybe_copy from ..policy import maybe_copy_qkv +from ..policy import maybe_get_lora from deepspeed.utils.types import ActivationFuncType class DS_OPTContainer(MetaTensorContainer, BaseTransformerContainer): + def __init__(self, **kwargs): super().__init__(**kwargs) @@ -50,32 +55,16 @@ class DS_OPTContainer(MetaTensorContainer, BaseTransformerContainer): weight_quantizer, mp_replace, transformer_param_names[i // 3], - [ - prefix + param_names[i], - prefix + param_names[i + 1], - prefix + param_names[i + 2] - ], + [prefix + param_names[i], prefix + param_names[i + 1], prefix + param_names[i + 2]], split_qkv=self.policy.split_qkv) for i in range(6, 8): - maybe_copy(module.attention, - sd, - weight_quantizer, - mp_replace, - transformer_param_names[i - 4], + maybe_copy(module.attention, sd, weight_quantizer, mp_replace, transformer_param_names[i - 4], prefix + param_names[i]) for i in range(8, 14): - maybe_copy(module.mlp, - sd, - weight_quantizer, - mp_replace, - transformer_param_names[i - 4], + maybe_copy(module.mlp, sd, weight_quantizer, mp_replace, transformer_param_names[i - 4], prefix + param_names[i]) for i in range(14, 16): - maybe_copy(module, - sd, - weight_quantizer, - mp_replace, - transformer_param_names[i - 4], + maybe_copy(module, sd, weight_quantizer, mp_replace, transformer_param_names[i - 4], prefix + param_names[i]) @@ -83,27 +72,40 @@ class HFOPTLayerPolicy(TransformerPolicy): _orig_layer_class = None def __init__(self, client_module, inference=True, use_load_prefix=True): - super().__init__(inference, - linear_layer=True, - mlp_act_func_type=ActivationFuncType.ReLU, - pre_attn_norm=True, - use_load_prefix=use_load_prefix) + super().__init__(inference, linear_layer=True, pre_attn_norm=True, use_load_prefix=use_load_prefix) self.client_module = client_module - try: import transformers HFOPTLayerPolicy._orig_layer_class = transformers.models.opt.modeling_opt.OPTDecoderLayer - if isinstance(TransformerPolicy.hf_model_config, - transformers.models.opt.configuration_opt.OPTConfig): - self.pre_attn_norm = TransformerPolicy.hf_model_config.do_layer_norm_before except: HFOPTLayerPolicy._orig_layer_class = None + if hasattr(TransformerPolicy, "hf_model_config") and hasattr(TransformerPolicy.hf_model_config, + "activation_function"): + if TransformerPolicy.hf_model_config.activation_function == "relu": + self.mlp_act_func_type = ActivationFuncType.ReLU + elif TransformerPolicy.hf_model_config.activation_function in ["gelu", "gelu_new"]: + self.mlp_act_func_type = 
ActivationFuncType.GELU + else: + raise ValueError("Unsupported activation function: {}".format( + TransformerPolicy.hf_model_config.activation_function)) + else: + self.mlp_act_func_type = ActivationFuncType.ReLU # default + def get_hidden_heads(self): return self.client_module.self_attn.embed_dim, \ - self.client_module.self_attn.num_heads - - def attention(self): + self.client_module.self_attn.num_heads, \ + self.client_module.self_attn_layer_norm.eps + + def get_q_k_v(self): + return self.client_module.self_attn.q_proj.weight, \ + self.client_module.self_attn.q_proj.bias, \ + self.client_module.self_attn.k_proj.weight, \ + self.client_module.self_attn.k_proj.bias, \ + self.client_module.self_attn.v_proj.weight, \ + self.client_module.self_attn.v_proj.bias + + def attention(self, enable_training=False): qw = self.client_module.self_attn.q_proj.weight qb = self.client_module.self_attn.q_proj.bias @@ -113,9 +115,8 @@ class HFOPTLayerPolicy(TransformerPolicy): vw = self.client_module.self_attn.v_proj.weight vb = self.client_module.self_attn.v_proj.bias - qkvw = Parameter(torch.cat((qw, kw, vw), dim=0), requires_grad=False) - qkvb = Parameter(torch.cat((qb, kb, vb), dim=0), requires_grad=False) - + qkvw = Parameter(torch.cat((qw, kw, vw), dim=0), requires_grad=enable_training) + qkvb = Parameter(torch.cat((qb, kb, vb), dim=0), requires_grad=enable_training) return qkvw, \ qkvb, \ self.client_module.self_attn.out_proj.weight, \ @@ -132,3 +133,16 @@ class HFOPTLayerPolicy(TransformerPolicy): self.client_module.final_layer_norm.bias, \ self.client_module.self_attn_layer_norm.weight, \ self.client_module.self_attn_layer_norm.bias + + def get_lora_params(self): + all_lora_params = [] + for p in [ + self.client_module.fc1, \ + self.client_module.fc2, \ + self.client_module.self_attn.q_proj, \ + self.client_module.self_attn.k_proj, \ + self.client_module.self_attn.v_proj, \ + self.client_module.self_attn.out_proj, \ + ]: + all_lora_params.append(maybe_get_lora(p)) + return all_lora_params diff --git a/deepspeed/module_inject/containers/unet.py b/deepspeed/module_inject/containers/unet.py index 461ca12bf36ac436cdb860b653476c6fa8d0c7a9..4e15699dc5a115831d8f0df722861aaced26af7a 100644 --- a/deepspeed/module_inject/containers/unet.py +++ b/deepspeed/module_inject/containers/unet.py @@ -1,6 +1,8 @@ -''' -Copyright 2022 The Microsoft DeepSpeed Team -''' +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + import torch from torch.nn.parameter import Parameter @@ -9,6 +11,7 @@ from ...model_implementations.diffusers.unet import DSUNet class UNetPolicy(DSPolicy): + def __init__(self): super().__init__() try: diff --git a/deepspeed/module_inject/containers/vae.py b/deepspeed/module_inject/containers/vae.py index c873f9768a8795b6b5869692855e1d1dba7908a1..016e42c3dbb43a301a11c1f0545cf479382fb76f 100644 --- a/deepspeed/module_inject/containers/vae.py +++ b/deepspeed/module_inject/containers/vae.py @@ -1,11 +1,14 @@ -''' -Copyright 2022 The Microsoft DeepSpeed Team -''' +# Copyright (c) Microsoft Corporation. 
+# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + from ..policy import DSPolicy from ...model_implementations.diffusers.vae import DSVAE class VAEPolicy(DSPolicy): + def __init__(self): super().__init__() try: diff --git a/deepspeed/module_inject/inject.py b/deepspeed/module_inject/inject.py index 384bb7279fdf02d346e2db14f4590e599b00e992..401da1bd6ef76b268f65e48b77c30a8ed65bfc64 100755 --- a/deepspeed/module_inject/inject.py +++ b/deepspeed/module_inject/inject.py @@ -1,34 +1,29 @@ -'''Copyright The Microsoft DeepSpeed Team''' +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team import copy import torch from deepspeed.ops.transformer import DeepSpeedTransformerLayer, DeepSpeedTransformerConfig -def module_inject(layer_obj, - model, - config, - micro_batch_size, - max_seq_length, - seed, - preln, - fp16=True): +def module_inject(layer_obj, model, config, micro_batch_size, max_seq_length, seed, preln, fp16=True): for name, child in model.named_children(): if isinstance(child, layer_obj): print('REPLACING BertLayer') - cuda_config = DeepSpeedTransformerConfig( - batch_size=micro_batch_size, - max_seq_length=max_seq_length, - hidden_size=config.hidden_size, - heads=config.num_attention_heads, - attn_dropout_ratio=config.attention_probs_dropout_prob, - hidden_dropout_ratio=config.hidden_dropout_prob, - num_hidden_layers=config.num_hidden_layers, - initializer_range=config.initializer_range, - seed=seed, - fp16=fp16, - pre_layer_norm=preln) + cuda_config = DeepSpeedTransformerConfig(batch_size=micro_batch_size, + max_seq_length=max_seq_length, + hidden_size=config.hidden_size, + heads=config.num_attention_heads, + attn_dropout_ratio=config.attention_probs_dropout_prob, + hidden_dropout_ratio=config.hidden_dropout_prob, + num_hidden_layers=config.num_hidden_layers, + initializer_range=config.initializer_range, + seed=seed, + fp16=fp16, + pre_layer_norm=preln) new_module = DeepSpeedTransformerLayer(cuda_config) @@ -71,14 +66,7 @@ def module_inject(layer_obj, setattr(model, name, copy.deepcopy(new_module)) else: - module_inject(layer_obj, - child, - config, - micro_batch_size, - max_seq_length, - seed, - preln, - fp16) + module_inject(layer_obj, child, config, micro_batch_size, max_seq_length, seed, preln, fp16) return model diff --git a/deepspeed/module_inject/layers.py b/deepspeed/module_inject/layers.py index 0636e89289d549f3666e19e27dc20d58a5ab9873..70dd1a3af0e1345e5d63edaecce4aedd3e6dac8f 100644 --- a/deepspeed/module_inject/layers.py +++ b/deepspeed/module_inject/layers.py @@ -1,4 +1,7 @@ -'''Copyright The Microsoft DeepSpeed Team''' +# Copyright (c) Microsoft Corporation. 
+# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team import torch from deepspeed import comm as dist @@ -10,6 +13,7 @@ from deepspeed.accelerator import get_accelerator class LinearAllreduce(nn.Module): + def __init__(self, weight, bias=None, mp_group=None): super(LinearAllreduce, self).__init__() self.weight = weight @@ -26,6 +30,7 @@ class LinearAllreduce(nn.Module): class LinearLayer(nn.Module): + def __init__(self, weight_shape=None, dtype=torch.half, weight=None, bias=None): super(LinearLayer, self).__init__() if weight is not None: @@ -33,9 +38,7 @@ class LinearLayer(nn.Module): self.bias = bias else: self.weight = Parameter( - torch.empty(weight_shape, - dtype=dtype, - device=get_accelerator().current_device_name())) + torch.empty(weight_shape, dtype=dtype, device=get_accelerator().current_device_name())) self.bias = Parameter( torch.empty(weight_shape[0], @@ -51,26 +54,35 @@ class LinearLayer(nn.Module): class Normalize(nn.Module): - def __init__(self, dim, dtype=torch.float, eps=1e-5): + + def __init__(self, dim=None, dtype=torch.float, eps=1e-5, weight=None, bias=None): super(Normalize, self).__init__() - self.norm = nn.LayerNorm(dim, - eps=eps).to(dtype).to( - get_accelerator().current_device_name()) - self.weight = self.norm.weight - self.bias = self.norm.bias + if weight is not None: + self.weight = weight + self.bias = bias + else: + self.norm = nn.LayerNorm(dim, eps=eps).to(dtype).to(get_accelerator().current_device_name()) + self.weight = self.norm.weight + self.bias = self.norm.bias + + self.eps = eps def forward(self, input): - return self.norm(input) + return nn.functional.layer_norm(input, input.shape[-1:], self.weight, self.bias, eps=self.eps) class EmbeddingLayer(nn.Module): - def __init__(self, weight_shape, dtype=torch.half): + + def __init__(self, weight_shape=None, dtype=torch.half, weight=None, bias=None): super(EmbeddingLayer, self).__init__() - self.weight = Parameter( - torch.empty(weight_shape[0], - weight_shape[1], - dtype=dtype, - device=get_accelerator().current_device_name())) + if weight is None: + self.weight = Parameter( + torch.empty(weight_shape[0], + weight_shape[1], + dtype=dtype, + device=get_accelerator().current_device_name())) + else: + self.weight = weight def forward(self, input): return F.embedding(input, self.weight) @@ -80,20 +92,19 @@ class OPTEmbedding(EmbeddingLayer): """ This module learns positional embeddings up to a fixed maximum size. """ - def __init__(self, weight_shape): + + def __init__(self, weight_shape=None, weight=None, bias=None): # OPT is set up so that if padding_idx is specified then offset the embedding ids by 2 # and adjust num_embeddings appropriately. 
Other models don't have this hack self.offset = 2 - super().__init__(weight_shape) + super().__init__(weight_shape, weight=weight) def forward(self, attention_mask: torch.LongTensor, past_key_values_length: int = 0): """`input_ids_shape` is expected to be [bsz x seqlen].""" attention_mask = attention_mask.long() # create positions depending on attention_mask - positions = (torch.cumsum(attention_mask, - dim=1).type_as(attention_mask) * - attention_mask).long() - 1 + positions = (torch.cumsum(attention_mask, dim=1).type_as(attention_mask) * attention_mask).long() - 1 # cut positions if `past_key_values_length` is > 0 positions = positions[:, past_key_values_length:] diff --git a/deepspeed/module_inject/load_checkpoint.py b/deepspeed/module_inject/load_checkpoint.py index ff8f454d7247e0b10746fca759ea0729ee76fee0..aee47e77bbe9fb4f0c10f33ceb9095ea9d730e20 100644 --- a/deepspeed/module_inject/load_checkpoint.py +++ b/deepspeed/module_inject/load_checkpoint.py @@ -1,4 +1,7 @@ -'''Copyright The Microsoft DeepSpeed Team''' +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team from torch import nn from deepspeed.model_implementations.transformers.ds_bloom import DeepSpeedBloomInference @@ -12,6 +15,7 @@ from .layers import LinearLayer, Normalize, EmbeddingLayer, OPTEmbedding import torch import gc from deepspeed.accelerator import get_accelerator +import re def load_model_with_checkpoint(r_module, @@ -24,6 +28,15 @@ def load_model_with_checkpoint(r_module, container=None): error_msgs = [] + def prefix_check(): + # if keys start with 'model.', don't skip level 0 prefix + for key in sd[0].keys(): + if re.match("^model[.]", key): + return False + return True + + skip_level_0_prefix = prefix_check() and container.policy.use_load_prefix + def transpose(data): with torch.no_grad(): data = data.contiguous() @@ -40,10 +53,8 @@ def load_model_with_checkpoint(r_module, if prefix + 'bias' in sd[0].keys(): if module.bias.data.is_meta: # meta tensor cannot be casted or copied to, so we need to replace it with a normal tensor here - module.bias = torch.nn.parameter.Parameter( - data=torch.empty_like(module.bias.data, - device="cpu"), - requires_grad=module.bias.data.requires_grad) + module.bias = torch.nn.parameter.Parameter(data=torch.empty_like(module.bias.data, device="cpu"), + requires_grad=module.bias.data.requires_grad) module.bias = mp_replace.copy(module.bias.data, sd[0][prefix + 'bias']) args = None gc.collect() @@ -61,86 +72,62 @@ def load_model_with_checkpoint(r_module, # set the quantizer number of groups using the checkpoint scale shape weight_quantizer.num_groups = scale.shape[0] else: - tmp_data = sd[0][prefix + n].to( - get_accelerator().current_device_name()) + tmp_data = sd[0][prefix + n].to(get_accelerator().current_device_name()) scale = None src_shape = tmp_data.shape dst_shape = p.shape inner_dim = 1 if tmp_data.dtype == torch.int8 else 0 outer_dim = 0 if tmp_data.dtype == torch.int8 else 1 if (len(src_shape) == 2 and len(dst_shape) == 2): - if (src_shape[inner_dim] == dst_shape[0] - and src_shape[outer_dim] == dst_shape[1]): + if (src_shape[inner_dim] == dst_shape[0] and src_shape[outer_dim] == dst_shape[1]): if tmp_data.dtype != torch.int8: p = weight_quantizer.quantize( - transpose(tmp_data) if weight_quantizer. 
- q_int8 else tmp_data) + transpose(tmp_data) if weight_quantizer.q_int8 else tmp_data) else: - p = torch.nn.parameter.Parameter(tmp_data, - requires_grad=False) + p = torch.nn.parameter.Parameter(tmp_data, requires_grad=False) p.scale = scale setattr(module, n, p) else: - dim = inner_dim if src_shape[inner_dim] != dst_shape[ - 0] else outer_dim + dim = inner_dim if src_shape[inner_dim] != dst_shape[0] else outer_dim dim1 = 0 if src_shape[inner_dim] != dst_shape[0] else 1 if src_shape[dim] > dst_shape[dim1]: - weight_partition = torch.split( - tmp_data, - dst_shape[dim1], - dim=dim)[rank].to( - get_accelerator().current_device_name()) + weight_partition = torch.split(tmp_data, dst_shape[dim1], dim=dim)[rank].to( + get_accelerator().current_device_name()) assert tmp_data.dtype != torch.int8 or scale.numel() > weight_quantizer.num_groups * (rank+1), \ '''ERROR: We require the quantization scales for larger TP-size when loading INT8 checkpoint!\ Please use the FP16 checkpoint to generate INT8 checkpoint with the sharding parameters!''' - scale = scale.view( - -1)[weight_quantizer.num_groups * - (rank + 1):].reshape( - weight_quantizer.num_groups, - -1).contiguous() + scale = scale.view(-1)[weight_quantizer.num_groups * (rank + 1):].reshape( + weight_quantizer.num_groups, -1).contiguous() else: assert tmp_data.dtype != torch.int8, \ '''Merging of the checkpoints are not supported when using INT8 checkpoint! \ Please use a as many GPUs as TP-size for the checkpoint''' all_data = [ - sd[j][prefix + - n] if type(sd[j][prefix + n]) is list else - sd[j][prefix + n].to( - get_accelerator().current_device_name()) - for j in range(len(sd)) + sd[j][prefix + n] if type(sd[j][prefix + n]) is list else sd[j][prefix + n].to( + get_accelerator().current_device_name()) for j in range(len(sd)) ] # Check if the weight tensor is for the QKV parameter - if src_shape[1] == (3 * - src_shape[0]) // ckpt_mp_size: + if src_shape[1] == (3 * src_shape[0]) // ckpt_mp_size: qkv_size = src_shape[outer_dim] // 3 src_split = [ - torch.split(src[0].data, - qkv_size, - dim=outer_dim) - for src in all_data + torch.split(src[0].data, qkv_size, dim=outer_dim) for src in all_data ] weight_partition = torch.cat([ - torch.cat([qkv_s[i] for qkv_s in src_split], - axis=outer_dim) + torch.cat([qkv_s[i] for qkv_s in src_split], axis=outer_dim) for i in range(len(src_split[0])) ], dim=dim) else: weight_partition = torch.cat([ - ad[0].to( - get_accelerator().current_device_name()) - if type(ad) is list else ad - for ad in all_data + ad[0].to(get_accelerator().current_device_name()) + if type(ad) is list else ad for ad in all_data ], dim=dim) if tmp_data.dtype == torch.int8: - scale = torch.cat([ - ad[1].to( - get_accelerator().current_device_name()) - for ad in all_data - ], - dim=dim) + scale = torch.cat( + [ad[1].to(get_accelerator().current_device_name()) for ad in all_data], + dim=dim) if tmp_data.dtype != torch.int8: weight_partition = weight_quantizer.quantize( @@ -148,9 +135,8 @@ def load_model_with_checkpoint(r_module, parallel_dim=(0 if dim == 1 else 1)) if weight_quantizer.q_int8 else \ weight_quantizer.quantize(weight_partition) else: - weight_partition = torch.nn.parameter.Parameter( - weight_partition, - requires_grad=False) + weight_partition = torch.nn.parameter.Parameter(weight_partition, + requires_grad=False) weight_partition.scale = scale setattr(module, n, weight_partition) else: @@ -158,42 +144,27 @@ def load_model_with_checkpoint(r_module, p.data.copy_(tmp_data) else: if src_shape[0] > dst_shape[0]: - bias_split = 
torch.split( - tmp_data, - dst_shape[-1])[rank].to(get_accelerator( - ).current_device_name()).contiguous() + bias_split = torch.split(tmp_data, dst_shape[-1])[rank].to( + get_accelerator().current_device_name()).contiguous() p.data.copy_(bias_split) else: # Check if the weight tensor is for the QKV parameter - if src_shape[0] == (3 * r_module.config.hidden_size - ) // ckpt_mp_size: + if src_shape[0] == (3 * r_module.config.hidden_size) // ckpt_mp_size: qkv_size = src_shape[0] // 3 src_split = [ - torch.split(sd[j][prefix + n], - qkv_size, - dim=0) for j in range(len(sd)) + torch.split(sd[j][prefix + n], qkv_size, dim=0) for j in range(len(sd)) ] p.data.copy_( - torch.cat( - [ - torch.cat([ - qkv_s[i] for qkv_s in src_split - ], - axis=0) - for i in range(len(src_split[0])) - ], - dim=0).to(get_accelerator( - ).current_device_name()).contiguous()) + torch.cat([ + torch.cat([qkv_s[i] for qkv_s in src_split], axis=0) + for i in range(len(src_split[0])) + ], + dim=0).to(get_accelerator().current_device_name()).contiguous()) else: p.data.copy_( - torch.cat( - [ - sd[j][prefix + n] - for j in range(len(sd)) - ], - dim=0).to(get_accelerator( - ).current_device_name()).contiguous()) + torch.cat([sd[j][prefix + n] for j in range(len(sd))], + dim=0).to(get_accelerator().current_device_name()).contiguous()) load_parameters(module, prefix) for n, child in module.named_children(): @@ -239,20 +210,16 @@ def load_model_with_checkpoint(r_module, setattr(module, name, child) continue child_params = list(child.parameters()) - if len(child_params) > 0 and (child_params[0].numel() == 0 - or child_params[0].is_meta): + if len(child_params) > 0 and (child_params[0].numel() == 0 or child_params[0].is_meta): if child.weight.is_meta: ds_shape = child.weight.shape else: ds_shape = child.weight.ds_shape if child.__class__ is nn.LayerNorm: - child = Normalize(dim=ds_shape[-1], - dtype=child.weight.dtype, - eps=child.eps) + child = Normalize(dim=ds_shape[-1], dtype=child.weight.dtype, eps=child.eps) setattr(module, name, child) elif child.__class__ is nn.Linear: - child = LinearLayer(weight_shape=child.weight.shape, - bias=child.bias) + child = LinearLayer(weight_shape=child.weight.shape, bias=child.bias) setattr(module, name, child) elif child.__class__ is OPTLearnedPositionalEmbedding: child = OPTEmbedding(weight_shape=ds_shape) @@ -261,8 +228,7 @@ def load_model_with_checkpoint(r_module, ds_id = None if hasattr(child.weight, 'ds_id'): ds_id = child.weight.ds_id - child = EmbeddingLayer(weight_shape=ds_shape, - dtype=child.weight.dtype) + child = EmbeddingLayer(weight_shape=ds_shape, dtype=child.weight.dtype) if ds_id is not None: all_ds_ids[ds_id] = child.weight setattr(module, name, child) @@ -270,7 +236,7 @@ def load_model_with_checkpoint(r_module, else: load_module_recursive( child, - prefix if (level == 0 and ckpt_type == 'pp') and container.policy.use_load_prefix else \ + prefix if (level == 0 and ckpt_type == 'pp') and skip_level_0_prefix else \ prefix + name + '.', level + 1) diff --git a/deepspeed/module_inject/module_quantize.py b/deepspeed/module_inject/module_quantize.py index 4123a1214f7b29458ce2f0920f936653c46f04b5..765a7e96bd54f31c7d9ded3698655cc746149327 100755 --- a/deepspeed/module_inject/module_quantize.py +++ b/deepspeed/module_inject/module_quantize.py @@ -1,4 +1,7 @@ -'''Copyright The Microsoft DeepSpeed Team''' +# Copyright (c) Microsoft Corporation. 
+# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team import torch @@ -18,34 +21,25 @@ def quantize_transformer_layer(orig_layer_impl, model, megatron=False, preln=Fal Returns: Updated nn.module with quantized transformer layers """ + def quantize_weight(weight): return weight.to(torch.int8) def megatron_layer_quantize(layer): - layer.attention.query_key_value.weight.data = quantize_weight( - layer.attention.query_key_value.weight.data) - layer.attention.dense.weight.data = quantize_weight( - layer.attention.dense.weight.data) - layer.mlp.dense_h_to_4h.weight.data = quantize_weight( - layer.mlp.dense_h_to_4h.weight.data) - layer.mlp.dense_4h_to_h.weight.data = quantize_weight( - layer.mlp.dense_4h_to_h.weight.data) + layer.attention.query_key_value.weight.data = quantize_weight(layer.attention.query_key_value.weight.data) + layer.attention.dense.weight.data = quantize_weight(layer.attention.dense.weight.data) + layer.mlp.dense_h_to_4h.weight.data = quantize_weight(layer.mlp.dense_h_to_4h.weight.data) + layer.mlp.dense_4h_to_h.weight.data = quantize_weight(layer.mlp.dense_4h_to_h.weight.data) def bert_layer_quantize(layer): - layer.attention.self.query.weight.data = quantize_weight( - layer.attention.self.query.weight.data) - layer.attention.self.key.weight.data = quantize_weight( - layer.attention.self.key.weight.data) - layer.attention.self.value.weight.data = quantize_weight( - layer.attention.self.value.weight.data) - layer.attention.output.dense.weight.data = quantize_weight( - layer.attention.output.dense.weight.data) + layer.attention.self.query.weight.data = quantize_weight(layer.attention.self.query.weight.data) + layer.attention.self.key.weight.data = quantize_weight(layer.attention.self.key.weight.data) + layer.attention.self.value.weight.data = quantize_weight(layer.attention.self.value.weight.data) + layer.attention.output.dense.weight.data = quantize_weight(layer.attention.output.dense.weight.data) if preln: - layer.intermediate.dense_act.weight.data = quantize_weight( - layer.intermediate.dense_act.weight.data) + layer.intermediate.dense_act.weight.data = quantize_weight(layer.intermediate.dense_act.weight.data) else: - layer.intermediate.dense.weight.data = quantize_weight( - layer.intermediate.dense.weight.data) + layer.intermediate.dense.weight.data = quantize_weight(layer.intermediate.dense.weight.data) layer.output.dense.weight.data = quantize_weight(layer.output.dense.weight.data) def quantize_fn(child): @@ -58,9 +52,7 @@ def quantize_transformer_layer(orig_layer_impl, model, megatron=False, preln=Fal return child - return quantize_module(model=model, - orig_class=orig_layer_impl, - quantize_fn=quantize_fn) + return quantize_module(model=model, orig_class=orig_layer_impl, quantize_fn=quantize_fn) def quantize_module(model, orig_class, quantize_fn): diff --git a/deepspeed/module_inject/policy.py b/deepspeed/module_inject/policy.py index dfd3343e12a34de23fe6660b68dac0257586d1d5..87b34e5aab5aaaf952e2379255ecfff9648828ac 100644 --- a/deepspeed/module_inject/policy.py +++ b/deepspeed/module_inject/policy.py @@ -1,6 +1,8 @@ -''' -Copyright 2022 The Microsoft DeepSpeed Team -''' +# Copyright (c) Microsoft Corporation. 
+# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + from abc import ABC, abstractmethod from deepspeed.utils.types import ActivationFuncType import torch @@ -70,7 +72,7 @@ class TransformerPolicy(DSPolicy): self.split_qkv = split_qkv @abstractmethod - def attention(self): + def attention(self, enable_training=False): """ Returns attention qkv and dense parameters weight: (3*hidden, hidden) and (hidden, hidden) @@ -78,6 +80,13 @@ class TransformerPolicy(DSPolicy): """ raise NotImplementedError + @abstractmethod + def get_q_k_v(self): + """ + return all q,k,v parameters without merging them together + """ + raise NotImplementedError + @abstractmethod def get_hidden_heads(self): """ @@ -103,6 +112,14 @@ class TransformerPolicy(DSPolicy): """ raise NotImplementedError + @abstractmethod + def get_lora_params(self): + """ + Returns lora parameters used in transformer layer + + """ + raise NotImplementedError + # TODO (lekurile): This function exists in base container as well, consolidate as some point def transpose(data): @@ -124,15 +141,10 @@ def _transpose(x, heads=1, mp_replace=None): (q, k, v) = torch.split(x_1, (x_1.shape[-1] // 3), dim=-1) if len(q.shape) > 2: new_shape = (q.shape[0], ) + (-1, ) - return torch.cat((q.reshape(new_shape), - k.reshape(new_shape), - v.reshape(new_shape)), + return torch.cat((q.reshape(new_shape), k.reshape(new_shape), v.reshape(new_shape)), dim=outer_dim).reshape(x.shape) else: - return torch.cat((q.reshape(-1), - k.reshape(-1), - v.reshape(-1)), - dim=-1).reshape(x.shape) + return torch.cat((q.reshape(-1), k.reshape(-1), v.reshape(-1)), dim=-1).reshape(x.shape) # This checks if the parameter exits in the checkpoint file and maybe copies it into the corresponding destination tensor. @@ -156,19 +168,14 @@ def maybe_copy(module, else: dst = mp_replace.copy(dst, tmp) if qkv and megatron_v2: - dst = torch.nn.parameter.Parameter( - _transpose(dst, - heads=heads, - mp_replace=mp_replace).contiguous()) + dst = torch.nn.parameter.Parameter(_transpose(dst, heads=heads, mp_replace=mp_replace).contiguous()) else: if split_qkv: dst = mp_replace.qkv_copy(dst, weight_quantizer.quantize(tmp if weight_quantizer.q_int8 else \ (transpose(tmp).contiguous())), int8=weight_quantizer.q_int8) else: if qkv and megatron_v2: - tmp = _transpose(transpose(tmp), - heads=heads, - mp_replace=mp_replace).contiguous() + tmp = _transpose(transpose(tmp), heads=heads, mp_replace=mp_replace).contiguous() if weight_quantizer.q_int8: tmp = transpose(tmp) dst = mp_replace.copy(dst, weight_quantizer.quantize(tmp if weight_quantizer.q_int8 else \ @@ -177,13 +184,7 @@ def maybe_copy(module, # Extending the maybe_copy function for when the q, k, and v are in separate parameters! 
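The helper introduced next fuses the separate q, k and v tensors into one destination parameter and hands it to mp_replace.qkv_copy, which keeps each rank's slice of every component rather than a plain contiguous slice of the fused tensor. A minimal sketch of that idea in plain PyTorch, with hypothetical shapes and without the DeepSpeed helpers themselves:

import torch

hidden = 8               # hypothetical hidden size
tp_size, rank = 2, 0     # hypothetical tensor-parallel world size and this rank's index

q = torch.randn(hidden, hidden)
k = torch.randn(hidden, hidden)
v = torch.randn(hidden, hidden)

# Fused parameter as stored in many checkpoints: (3*hidden, hidden)
qkv = torch.cat([q, k, v], dim=0)

# Each rank must keep a contiguous slice of q, of k and of v -- not simply the
# top third of the fused tensor -- so shard each component first, then re-fuse
# this rank's pieces.
shards = [torch.chunk(w, tp_size, dim=0)[rank] for w in (q, k, v)]
qkv_shard = torch.cat(shards, dim=0)

assert qkv_shard.shape == (3 * hidden // tp_size, hidden)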
-def maybe_copy_qkv(module, - sd, - weight_quantizer, - mp_replace, - dst_name, - src_names, - split_qkv=False): +def maybe_copy_qkv(module, sd, weight_quantizer, mp_replace, dst_name, src_names, split_qkv=False): if src_names[0] in sd: q = sd[src_names[0]] k = sd[src_names[1]] @@ -203,3 +204,19 @@ def maybe_copy_qkv(module, dst = mp_replace.copy(dst, weight_quantizer.quantize(qkv_data.to(get_accelerator().device_name()) if weight_quantizer.q_int8 else \ transpose(qkv_data)), int8=weight_quantizer.q_int8) setattr(module, dst_name, dst) + + +def pack_lora_weights(p): + return [ + p.lora_right_weight, \ + p.lora_left_weight, \ + p.lora_scaling + ] + + +def maybe_get_lora(p): + if hasattr(p, 'lora_right_weight'): + lora_param = pack_lora_weights(p) + else: + lora_param = [] + return lora_param diff --git a/deepspeed/module_inject/replace_module.py b/deepspeed/module_inject/replace_module.py index c9032f9bb76453f23ea7c2677ef298b4f47bb6db..51309ba2afc18e93220703b070eb761848544036 100644 --- a/deepspeed/module_inject/replace_module.py +++ b/deepspeed/module_inject/replace_module.py @@ -1,4 +1,7 @@ -'''Copyright The Microsoft DeepSpeed Team''' +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team import os import torch @@ -23,6 +26,7 @@ from .utils import policy_to_ds_container class ReplaceWithTensorSlicing: + def __init__(self, mp_group=None, mp_size=1, out_dim=1, in_dim=0): if mp_group is not None: self.gpu_index = dist.get_rank(group=mp_group) @@ -38,7 +42,7 @@ class ReplaceWithTensorSlicing: for merging your checkpoints before replacing the transformer layer with\ inference-kernels' - def qkv_copy(self, dst, src, int8=False): + def qkv_copy(self, dst, src, int8=False, allocat_tensor=False): if src is None: return src src_shape = src.shape @@ -47,6 +51,9 @@ class ReplaceWithTensorSlicing: outer_dim = 0 if int8 else -1 inner_dim = -1 if int8 else 0 + if allocat_tensor: + dst = torch.empty_like(dst) + src_split = torch.split(src.data, src.shape[outer_dim] // 3, dim=outer_dim) if (len(src_shape) == 2 and len(dst_shape) == 2): if src_shape[outer_dim] == dst_shape[self.out_dim]: @@ -55,82 +62,59 @@ class ReplaceWithTensorSlicing: if hasattr(src, 'scale'): dst.scale = src.scale return dst - if self.out_dim == 1: - self.merge_assert(src_shape[outer_dim], dst_shape[self.out_dim]) - qkv_size = dst_shape[self.out_dim] // 3 - qkv_split = [ - torch.split(src_s, - qkv_size, - dim=outer_dim) for src_s in src_split - ] - - weight_split = [ - torch.cat([qkv_s[i] for qkv_s in qkv_split], - axis=outer_dim) for i in range(len(qkv_split[0])) - ] - dst = dst.reshape(-1).data.copy_( - weight_split[self.gpu_index].contiguous().reshape(-1)).reshape( - weight_split[self.gpu_index].shape) - else: - dst.data.copy_(src_split[self.gpu_index].to( - get_accelerator().current_device_name()).contiguous()) + self.merge_assert(src_shape[outer_dim], dst_shape[self.out_dim]) + qkv_size = dst_shape[self.out_dim] // 3 + qkv_split = [torch.split(src_s, qkv_size, dim=outer_dim) for src_s in src_split] + weight_split = [ + torch.cat([qkv_s[i] for qkv_s in qkv_split], axis=outer_dim) for i in range(len(qkv_split[0])) + ] + dst = dst.reshape(-1).data.copy_(weight_split[self.gpu_index].contiguous().reshape(-1)).reshape( + weight_split[self.gpu_index].shape) else: if src_shape[0] == dst_shape[0]: return torch.nn.parameter.Parameter(src) - if self.out_dim == 1: - qkv_size = dst_shape[0] // 3 - qkv_split = [torch.split(src_s, qkv_size, dim=0) for src_s in src_split] - bias_split = [ - 
torch.cat([qkv_s[i] for qkv_s in qkv_split], - axis=0) for i in range(len(qkv_split[0])) - ] - dst.data.copy_(bias_split[self.gpu_index].contiguous()) - else: - dst.data.copy_(src_split[self.gpu_index].contiguous()) + qkv_size = dst_shape[0] // 3 + qkv_split = [torch.split(src_s, qkv_size, dim=0) for src_s in src_split] + bias_split = [torch.cat([qkv_s[i] for qkv_s in qkv_split], axis=0) for i in range(len(qkv_split[0]))] + dst.data.copy_(bias_split[self.gpu_index].contiguous()) dst = torch.nn.parameter.Parameter(dst, requires_grad=False) if hasattr(src, 'scale'): dst.scale = src.scale return dst - def copy(self, dst, src, int8=False): + def copy(self, dst, src, int8=False, allocat_tensor=False): if src is None: return src assert not dst.data.is_meta # the torch.Tensor.copy_ method used below will silently fail on meta tensors + if allocat_tensor: + dst = torch.empty_like(dst) outer_dim = 0 if int8 else 1 inner_dim = 1 if int8 else 0 src_shape = src.shape dst_shape = dst.shape if (len(src_shape) == 2 and len(dst_shape) == 2): - if src_shape[inner_dim] == dst_shape[ - self.in_dim] and src_shape[outer_dim] == dst_shape[self.out_dim]: + if src_shape[inner_dim] == dst_shape[self.in_dim] and src_shape[outer_dim] == dst_shape[self.out_dim]: dst = dst.reshape(-1).data.copy_(src.data.reshape(-1)).reshape(src.shape) else: if src_shape[inner_dim] != dst_shape[self.in_dim]: self.merge_assert(src_shape[inner_dim], dst_shape[self.in_dim]) - weight_split = torch.split( - src, - dst_shape[self.in_dim], - dim=inner_dim)[self.gpu_index].contiguous() + dst.data.copy_(src[:, self.gpu_index * dst_shape[self.in_dim]: (self.gpu_index + 1) * dst_shape[self.in_dim]] if inner_dim == 1 else \ + src[self.gpu_index * dst_shape[self.in_dim]: (self.gpu_index + 1) * dst_shape[self.in_dim], :]) else: self.merge_assert(src_shape[outer_dim], dst_shape[self.out_dim]) - weight_split = torch.split( - src.data, - dst_shape[self.out_dim], - dim=outer_dim)[self.gpu_index].contiguous() - dst = dst.reshape(-1).data.copy_(weight_split.reshape(-1)).reshape( - weight_split.shape) + dst.data.copy_(src[:, self.gpu_index * dst_shape[self.out_dim]: (self.gpu_index + 1) * dst_shape[self.out_dim]] if outer_dim == 1 else \ + src[self.gpu_index * dst_shape[self.out_dim]: (self.gpu_index + 1) * dst_shape[self.out_dim], :]) else: if src_shape[0] == dst_shape[0]: - dst.data.copy_(src) + dst = src else: - bias_split = torch.split(src.data, - dst_shape[-1])[self.gpu_index].contiguous() - dst.data.copy_(bias_split) + dst.data.copy_(src[self.gpu_index * dst_shape[-1]:(self.gpu_index + 1) * dst_shape[-1]]) dst = torch.nn.parameter.Parameter(dst, requires_grad=False) if hasattr(src, 'scale'): dst.scale = src.scale + return dst @@ -150,6 +134,7 @@ def get_transformer_name(replaced_module): class GroupQuantizer: + def __init__(self, q_int8=True, group_size=1, num_bits=8, num_groups=0): self.group_size = group_size self.num_bits = num_bits @@ -163,8 +148,7 @@ class GroupQuantizer: inputs.scale = torch.empty(1) return inputs q_range = 2**self.num_bits - num_groups = self.num_groups if self.num_groups > 0 else inputs.shape[ - 0] // self.group_size + num_groups = self.num_groups if self.num_groups > 0 else inputs.shape[0] // self.group_size inputs = inputs.to(get_accelerator().current_device_name()) input_flat = inputs.reshape(num_groups, -1).contiguous() input_min = torch.min(input_flat, dim=1, keepdim=True)[0].float() @@ -174,31 +158,14 @@ class GroupQuantizer: inputs_q = input_flat.reshape(inputs.shape).to(torch.int8).contiguous() out = 
torch.nn.Parameter(inputs_q, requires_grad=False) inputs_split = inputs.split(inputs.shape[parallel_dim] // 2, dim=parallel_dim) - input_flat = [ - inputs_split[i].reshape(num_groups, - -1).contiguous() for i in range(2) - ] - input_min = [ - torch.min(input_flat[i], - dim=1, - keepdim=True)[0].float() for i in range(2) - ] - input_max = [ - torch.max(input_flat[i], - dim=1, - keepdim=True)[0].float() for i in range(2) - ] - scale1 = [ - (torch.max(input_min[i].abs(), - input_max[i].abs()) * 2.0 / (q_range)).squeeze().unsqueeze(0) - for i in range(2) - ] - - out.scale = torch.cat([scale.squeeze().unsqueeze(0), - scale1[0], - scale1[1]], - dim=0).reshape(num_groups, - -1).contiguous() + input_flat = [inputs_split[i].reshape(num_groups, -1).contiguous() for i in range(2)] + input_min = [torch.min(input_flat[i], dim=1, keepdim=True)[0].float() for i in range(2)] + input_max = [torch.max(input_flat[i], dim=1, keepdim=True)[0].float() for i in range(2)] + scale1 = [(torch.max(input_min[i].abs(), input_max[i].abs()) * 2.0 / (q_range)).squeeze().unsqueeze(0) + for i in range(2)] + + out.scale = torch.cat([scale.squeeze().unsqueeze(0), scale1[0], scale1[1]], dim=0).reshape(num_groups, + -1).contiguous() return out @@ -211,6 +178,7 @@ def _module_match(module): def generic_injection(module, fp16=False, enable_cuda_graph=True): + def replace_attn(child, policy): policy_attn = policy.attention(child) if policy_attn is None: @@ -246,8 +214,7 @@ def generic_injection(module, fp16=False, enable_cuda_graph=True): attn_module.attn_qkvb = None attn_module.attn_ow.data = transpose(attn_ow.data) - attn_module.attn_ob.data.copy_( - attn_ob.data.to(get_accelerator().current_device_name())) + attn_module.attn_ob.data.copy_(attn_ob.data.to(get_accelerator().current_device_name())) return attn_module def replace_attn_block(child, policy): @@ -262,7 +229,10 @@ def generic_injection(module, fp16=False, enable_cuda_graph=True): try: import diffusers - cross_attention = diffusers.models.attention.CrossAttention + if hasattr(diffusers.models.attention, 'CrossAttention'): + cross_attention = diffusers.models.attention.CrossAttention + else: + cross_attention = diffusers.models.attention_processor.Attention attention_block = diffusers.models.attention.BasicTransformerBlock new_policies = { cross_attention: replace_attn, @@ -278,8 +248,7 @@ def generic_injection(module, fp16=False, enable_cuda_graph=True): # triangular_masking=True, # max_out_tokens=8192) from ..model_implementations.transformers.clip_encoder import DSClipEncoder - cg_encoder = DSClipEncoder(module.text_encoder, - enable_cuda_graph=enable_cuda_graph) + cg_encoder = DSClipEncoder(module.text_encoder, enable_cuda_graph=enable_cuda_graph) setattr(module, 'text_encoder', cg_encoder) for name in module.__dict__.keys(): sub_module = getattr(module, name) @@ -291,13 +260,11 @@ def generic_injection(module, fp16=False, enable_cuda_graph=True): for name, child in module.named_children(): _replace_module(child, policy) if child.__class__ in new_policies: - replaced_module = new_policies[child.__class__](child, - policy) + replaced_module = new_policies[child.__class__](child, policy) setattr(module, name, replaced_module) _replace_module(sub_module, policy) - new_module = policy.apply(sub_module, - enable_cuda_graph=enable_cuda_graph) + new_module = policy.apply(sub_module, enable_cuda_graph=enable_cuda_graph) print(f"**** found and replaced {name} w. 
{type(new_module)}") setattr(module, name, new_module) @@ -305,11 +272,7 @@ def generic_injection(module, fp16=False, enable_cuda_graph=True): container_g = None -def replace_transformer_layer(orig_layer_impl, - model, - checkpoint_dict, - config, - model_config): +def replace_transformer_layer(orig_layer_impl, model, checkpoint_dict, config, model_config): """ Replace bert-style transformer layers with DeepSpeed's transformer layer Arguments: orig_layer_impl (torch.nn.Module): the original transformer layer implementation to look for, @@ -334,15 +297,10 @@ def replace_transformer_layer(orig_layer_impl, seed = -1 local_rank = -1 - mp_replace = ReplaceWithTensorSlicing( - mp_group=config.tensor_parallel.tp_group, - mp_size=config.tensor_parallel.tp_size) #, out_dim=0, in_dim=1) + mp_replace = ReplaceWithTensorSlicing(mp_group=config.tensor_parallel.tp_group, + mp_size=config.tensor_parallel.tp_size) #, out_dim=0, in_dim=1) - def replace_with_policy(child, - policy_cls, - triangular_masking, - inference=False, - layer_id=0): + def replace_with_policy(child, policy_cls, triangular_masking, inference=False, layer_id=0): policy = policy_cls(child, inference=inference) if not policy.cuda_graph_supported: # policy says cuda graph is not supported raise an error if set @@ -364,8 +322,7 @@ def replace_transformer_layer(orig_layer_impl, _container.set_moe(moe) # 2. Set the tensor parallelism config - _container.set_tensor_parallel_config(config.tensor_parallel.tp_size, - config.tensor_parallel.tp_group) + _container.set_tensor_parallel_config(config.tensor_parallel.tp_size, config.tensor_parallel.tp_group) # 3. Initialize tensors _container.initialize_tensors() @@ -411,25 +368,21 @@ def replace_transformer_layer(orig_layer_impl, if name in all_reduce_linears: new_weight = torch.empty(( weight_shape[1] if conv_linear_layer else weight_shape[0], - (weight_shape[0] if conv_linear_layer else weight_shape[1]) // - mp_size, + (weight_shape[0] if conv_linear_layer else weight_shape[1]) // mp_size, ), device=child.weight.device, dtype=child.weight.dtype) if conv_linear_layer: child.weight.data = child.weight.data.transpose(-1, -2).contiguous() data = mp_replace.copy(new_weight, child.weight.data) - new_bias = torch.empty((weight_shape[0]), - device=child.weight.device, - dtype=child.weight.dtype) + new_bias = torch.empty((weight_shape[0]), device=child.weight.device, dtype=child.weight.dtype) if child.bias is not None: new_bias.data.copy_(child.bias.data) return LinearAllreduce(data, child.bias if child.bias is None else \ torch.nn.parameter.Parameter(new_bias.to(get_accelerator().current_device_name())), mp_group) else: new_weight = torch.empty(( - (weight_shape[1] if conv_linear_layer else weight_shape[0]) // - mp_size, + (weight_shape[1] if conv_linear_layer else weight_shape[0]) // mp_size, weight_shape[0] // mp_size if conv_linear_layer else weight_shape[1], ), device=child.weight.device, @@ -441,43 +394,54 @@ def replace_transformer_layer(orig_layer_impl, new_bias = torch.empty((weight_shape[0] // mp_size), device=child.weight.device, dtype=child.weight.dtype) - bias_data = None if child.bias is None else mp_replace.copy( - new_bias, - child.bias.data).to(get_accelerator().current_device_name()) - return LinearLayer(weight=data.to( - get_accelerator().current_device_name()), - bias=bias_data) + bias_data = None if child.bias is None else mp_replace.copy(new_bias, child.bias.data).to( + get_accelerator().current_device_name()) + return 
LinearLayer(weight=data.to(get_accelerator().current_device_name()), bias=bias_data) def _slice_embedding(child, name, conv_linear_layer): mp_replace = ReplaceWithTensorSlicing(mp_group=mp_group) - new_weight = torch.empty((child.weight.shape[0], - child.weight.shape[1] // mp_size), + new_weight = torch.empty((child.weight.shape[0], child.weight.shape[1] // mp_size), device=child.weight.device, dtype=child.weight.dtype) data = mp_replace.copy(new_weight, child.weight.ds_tensor.data if hasattr(child.weight, 'ds_tensor') else \ child.weight.data) - new_embedding = nn.Embedding(child.weight.shape[0], - child.weight.shape[1] // mp_size) + new_embedding = nn.Embedding(child.weight.shape[0], child.weight.shape[1] // mp_size) new_embedding.weight.data.copy_(data) return new_embedding def update_mp_params(child): if hasattr(child, 'n_heads'): + assert child.n_heads % mp_size == 0, "n_heads ({}) must be divisible by mp_size ({})".format( + child.n_heads, mp_size) child.n_heads = child.n_heads // mp_size if hasattr(child, 'inner_dim'): + assert child.inner_dim % mp_size == 0, "inner_dim ({}) must be divisible by mp_size ({})".format( + child.inner_dim, mp_size) child.inner_dim = child.inner_dim // mp_size if hasattr(child, 'num_heads'): + assert child.num_heads % mp_size == 0, "num_heads ({}) must be divisible by mp_size ({})".format( + child.num_heads, mp_size) child.num_heads = child.num_heads // mp_size if hasattr(child, 'num_attention_heads'): + assert child.num_attention_heads % mp_size == 0, "num_attention_heads ({}) must be divisible by mp_size ({})".format( + child.num_attention_heads, mp_size) child.num_attention_heads = child.num_attention_heads // mp_size if hasattr(child, 'num_attn_heads'): + assert child.num_attn_heads % mp_size == 0, "num_attn_heads ({}) must be divisible by mp_size ({})".format( + child.num_attn_heads, mp_size) child.num_attn_heads = child.num_attn_heads // mp_size if hasattr(child, 'all_head_size'): + assert child.all_head_size % mp_size == 0, "all_head_size ({}) must be divisible by mp_size ({})".format( + child.all_head_size, mp_size) child.all_head_size = child.all_head_size // mp_size if hasattr(child, 'embed_dim'): + assert child.embed_dim % mp_size == 0, "embed_dim must ({}) be divisible by mp_size ({})".format( + child.embed_dim, mp_size) child.embed_dim = child.embed_dim // mp_size if hasattr(child, 'hidden_size'): + assert child.hidden_size % mp_size == 0, "hidden_size ({}) must be divisible by mp_size ({})".format( + child.hidden_size, mp_size) child.hidden_size = child.hidden_size // mp_size conv_linear_layer = False @@ -499,12 +463,8 @@ def replace_transformer_layer(orig_layer_impl, def _replace_module(r_module, prev_name=''): for name, child in r_module.named_children(): if child.__class__ in linear_policies: - setattr( - r_module, - name, - linear_policies[child.__class__](child, - prev_name + '.' + name, - conv_linear_layer)) + setattr(r_module, name, linear_policies[child.__class__](child, prev_name + '.' 
+ name, + conv_linear_layer)) else: update_mp_params(child) _replace_module(child, name) @@ -551,15 +511,10 @@ def replace_transformer_layer(orig_layer_impl, base_dir1 = checkpoint_dict.get('base_dir', config.base_dir) if ckpt_type == 'pp' and type(checkpoint) is list: - pbar = tqdm.tqdm(total=len(checkpoint), - desc=f"Loading {len(checkpoint)} checkpoint shards") + pbar = tqdm.tqdm(total=len(checkpoint), desc=f"Loading {len(checkpoint)} checkpoint shards") for i in range(len(checkpoint)): - sd = [ - torch.load(os.path.join(base_dir1, - checkpoint[i]), - map_location='cpu') - ] + sd = [torch.load(os.path.join(base_dir1, checkpoint[i]), map_location='cpu')] load_model_with_checkpoint(replaced_module, sd, mp_replace, @@ -574,22 +529,15 @@ def replace_transformer_layer(orig_layer_impl, tp_split_size = (world_size / ckpt_mp_size) sd_offset = int(rank / tp_split_size) sd_count = int((rank + max(1, tp_split_size)) / tp_split_size) - sd_offset - pbar = tqdm.tqdm(total=num_checkpoints, - desc=f"Loading {num_checkpoints} checkpoint shards") + pbar = tqdm.tqdm(total=num_checkpoints, desc=f"Loading {num_checkpoints} checkpoint shards") for i in range(num_checkpoints): pbar.update(1) ckpt_index = i * ckpt_mp_size + sd_offset ckpt_files = [ - os.path.join(base_dir1, - ckpt_list[ckpt_index + - j]) if base_dir1 else ckpt_list[ckpt_index + - j] + os.path.join(base_dir1, ckpt_list[ckpt_index + j]) if base_dir1 else ckpt_list[ckpt_index + j] for j in range(sd_count) ] - sds = [ - torch.load(ckpt_file, - map_location='cpu') for ckpt_file in ckpt_files - ] + sds = [torch.load(ckpt_file, map_location='cpu') for ckpt_file in ckpt_files] load_model_with_checkpoint(replaced_module, sds, mp_replace, @@ -602,15 +550,13 @@ def replace_transformer_layer(orig_layer_impl, gc.collect() if "non_tp" in checkpoint: - pbar = tqdm.tqdm( - total=len(checkpoint["non_tp"]), - desc=f"Loading {len(checkpoint['non_tp'])} checkpoint shards") + pbar = tqdm.tqdm(total=len(checkpoint["non_tp"]), + desc=f"Loading {len(checkpoint['non_tp'])} checkpoint shards") for i in range(len(checkpoint["non_tp"])): pbar.update(1) ckpt_file = os.path.join(base_dir1, - checkpoint["non_tp"][i] - ) if base_dir1 else checkpoint["non_tp"][i] + checkpoint["non_tp"][i]) if base_dir1 else checkpoint["non_tp"][i] sds = [torch.load(ckpt_file, map_location='cpu')] load_model_with_checkpoint(replaced_module, sds, @@ -649,37 +595,22 @@ def replace_transformer_layer(orig_layer_impl, if not dist.is_initialized() or dist.get_rank() == 0: print("Saving tp-sharded checkpoints") torch.save( - OrderedDict({ - k: v - for k, - v in dict(replaced_module.state_dict()).items() - if transformer_name not in k - }), - f'{config.save_mp_checkpoint_path}/{non_tp_ckpt_name}') + OrderedDict({k: v + for k, v in dict(replaced_module.state_dict()).items() + if transformer_name not in k}), f'{config.save_mp_checkpoint_path}/{non_tp_ckpt_name}') ckpt_config = json.dumps({ - 'type': - ckpt_name, - 'base_dir': - f'{config.save_mp_checkpoint_path}', + 'type': ckpt_name, + 'base_dir': f'{config.save_mp_checkpoint_path}', 'checkpoints': { - "non_tp": - ckpt_files, - "tp": [ - f'tp_{r:0>2d}_{m:0>2d}.pt' for m in range(num_partitions) - for r in range(world_size) - ] + "non_tp": ckpt_files, + "tp": [f'tp_{r:0>2d}_{m:0>2d}.pt' for m in range(num_partitions) for r in range(world_size)] }, - 'version': - 1.0, - 'parallelization': - 'tp', - 'tp_size': - world_size, - 'dtype': - 'int8' if quantize else ('float16' if fp16 else 'float32') + 'version': 1.0, + 'parallelization': 'tp', + 'tp_size': 
world_size, + 'dtype': 'int8' if quantize else ('float16' if fp16 else 'float32') }) - with open(f"{config.save_mp_checkpoint_path}/ds_inference_config.json", - "w") as cfg: + with open(f"{config.save_mp_checkpoint_path}/ds_inference_config.json", "w") as cfg: cfg.write(ckpt_config) rep_sd = replaced_module.state_dict() @@ -691,13 +622,9 @@ def replace_transformer_layer(orig_layer_impl, for m in range(num_partitions): torch.save( OrderedDict({ - k: [rep_sd[k], - rep_sd[k].scale] if hasattr(rep_sd[k], - 'scale') else rep_sd[k] - for k in keys[m * partition_size:(m + 1) * partition_size] - if transformer_name in k - }), - f'{config.save_mp_checkpoint_path}/tp_{rank:0>2d}_{m:0>2d}.pt') + k: [rep_sd[k], rep_sd[k].scale] if hasattr(rep_sd[k], 'scale') else rep_sd[k] + for k in keys[m * partition_size:(m + 1) * partition_size] if transformer_name in k + }), f'{config.save_mp_checkpoint_path}/tp_{rank:0>2d}_{m:0>2d}.pt') return replaced_module @@ -712,6 +639,7 @@ def revert_transformer_layer(orig_layer_impl, model, config, preln=False): Returns: Updated nn.module with original bert-style transformer layers """ + def replace_fn(child, _replace_policy, layer_id): #from turing.nvidia_modelingpreln import BertLayer orig_module = orig_layer_impl(config) @@ -813,9 +741,7 @@ def _replace_module(model, policies, layer_id=0): """ for name, child in model.named_children(): if child.__class__ in policies: - replaced_module = policies[child.__class__][0](child, - policies[child.__class__][-1], - layer_id) + replaced_module = policies[child.__class__][0](child, policies[child.__class__][-1], layer_id) setattr(model, name, replaced_module) if isinstance(model, PipelineModule): assert hasattr(model, 'forward_funcs'),\ diff --git a/deepspeed/module_inject/replace_policy.py b/deepspeed/module_inject/replace_policy.py index 65dadcc182cf042e17e57043861291c634c376e4..af58d3d8d2d7c1fef3db258d4de5a335e77c3658 100755 --- a/deepspeed/module_inject/replace_policy.py +++ b/deepspeed/module_inject/replace_policy.py @@ -1,6 +1,8 @@ -''' -Copyright 2020 The Microsoft DeepSpeed Team -''' +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + from .containers import HFGPT2LayerPolicy from .containers import HFBertLayerPolicy from .containers import BLOOMLayerPolicy @@ -16,16 +18,8 @@ from .containers import VAEPolicy # transformer-based policies replace_policies = [ - HFBertLayerPolicy, - HFGPTNEOLayerPolicy, - GPTNEOXLayerPolicy, - HFGPTJLayerPolicy, - MegatronLayerPolicy, - HFGPT2LayerPolicy, - BLOOMLayerPolicy, - HFOPTLayerPolicy, - HFCLIPLayerPolicy, - HFDistilBertLayerPolicy + HFBertLayerPolicy, HFGPTNEOLayerPolicy, GPTNEOXLayerPolicy, HFGPTJLayerPolicy, MegatronLayerPolicy, + HFGPT2LayerPolicy, BLOOMLayerPolicy, HFOPTLayerPolicy, HFCLIPLayerPolicy, HFDistilBertLayerPolicy ] # non-transformer-based policies diff --git a/deepspeed/module_inject/utils.py b/deepspeed/module_inject/utils.py index 7ebd797ec31cbc43c0e26127b0b74faa8798aa25..ad60e225fcea24610365cef06bd9a9946c480b02 100644 --- a/deepspeed/module_inject/utils.py +++ b/deepspeed/module_inject/utils.py @@ -1,4 +1,7 @@ -'''Copyright The Microsoft DeepSpeed Team''' +# Copyright (c) Microsoft Corporation. 
+# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team from deepspeed.utils import log_dist diff --git a/deepspeed/moe/__init__.py b/deepspeed/moe/__init__.py index fcb45ab2b68516814a4bfbffebf2e01cbfefd527..6c5067f71c8faf166bc78e88f9b62e8627dda7c7 100644 --- a/deepspeed/moe/__init__.py +++ b/deepspeed/moe/__init__.py @@ -1 +1,5 @@ +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team '''Copyright The Microsoft DeepSpeed Team''' diff --git a/deepspeed/moe/experts.py b/deepspeed/moe/experts.py index 0fa440c2883a91b20254315f0cfffee11b7e5e38..8cadb0c387fa941983fe071dc1dd92d2a08926a2 100644 --- a/deepspeed/moe/experts.py +++ b/deepspeed/moe/experts.py @@ -1,17 +1,18 @@ -''' -Copyright 2020 The Microsoft DeepSpeed Team -''' +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team import torch import copy class Experts(torch.nn.Module): + def __init__(self, expert, num_local_experts=1, expert_group_name=None): super(Experts, self).__init__() - self.deepspeed_experts = torch.nn.ModuleList( - [copy.deepcopy(expert) for i in range(num_local_experts)]) + self.deepspeed_experts = torch.nn.ModuleList([copy.deepcopy(expert) for i in range(num_local_experts)]) self.num_local_experts = num_local_experts # TODO: revisit allreduce for moe.gate... diff --git a/deepspeed/moe/layer.py b/deepspeed/moe/layer.py index 6b4a07642cc8692113ab778d9f83d3d2f42b909f..89fe2bb46c3ca4a9b617d24177471493134848da 100644 --- a/deepspeed/moe/layer.py +++ b/deepspeed/moe/layer.py @@ -1,6 +1,7 @@ -''' -Copyright 2020 The Microsoft DeepSpeed Team -''' +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team import torch @@ -31,6 +32,7 @@ class MoE(torch.nn.Module): use_tutel (bool, optional): default=False, whether to use Tutel optimizations (if installed). 
enable_expert_tensor_parallelism (bool, optional): default=False, whether to use tensor parallelism for experts """ + def __init__(self, hidden_size, expert, @@ -65,15 +67,8 @@ class MoE(torch.nn.Module): 'Unsupported noisy_gate_policy: ' + noisy_gate_policy experts = Experts(expert, self.num_local_experts, self.expert_group_name) - self.deepspeed_moe = MOELayer(TopKGate(hidden_size, - num_experts, - k, - capacity_factor, - eval_capacity_factor, - min_capacity, - noisy_gate_policy, - drop_tokens, - use_rts), + self.deepspeed_moe = MOELayer(TopKGate(hidden_size, num_experts, k, capacity_factor, eval_capacity_factor, + min_capacity, noisy_gate_policy, drop_tokens, use_rts), experts, self.expert_group_name, self.ep_size, @@ -90,20 +85,16 @@ class MoE(torch.nn.Module): def _create_process_groups(self): # Create process group for a layer if needed if self.expert_group_name not in groups._get_expert_parallel_group_dict(): - print( - f"No existing process group found, creating a new group named: {self.expert_group_name}" - ) + print(f"No existing process group found, creating a new group named: {self.expert_group_name}") if (groups.mpu is None) or (not self.enable_expert_tensor_parallelism): # Condition 1 - no groups.mpu means no tensor parallelism # Condition 2 - disabling expert tensor parallelism on purpose groups._create_expert_and_data_parallel(self.ep_size) else: # expert tensor parallelism is enabled - groups._create_expert_data_and_model_parallel(self.ep_size, - mpu=groups.mpu) + groups._create_expert_data_and_model_parallel(self.ep_size, mpu=groups.mpu) # Set the group handle for the MOELayer (deepspeed_moe) object - self.deepspeed_moe._set_ep_group( - groups._get_expert_parallel_group(self.expert_group_name)) + self.deepspeed_moe._set_ep_group(groups._get_expert_parallel_group(self.expert_group_name)) def forward(self, hidden_states, used_token=None): """ MoE forward diff --git a/deepspeed/moe/mappings.py b/deepspeed/moe/mappings.py index 38f1630a67034990f8bb4e8cc75670e7f6a5f552..6c501ea6503a9a4d9dc2ed6bf70388ec5443e75c 100644 --- a/deepspeed/moe/mappings.py +++ b/deepspeed/moe/mappings.py @@ -1,6 +1,7 @@ -''' -Copyright 2022 The Microsoft DeepSpeed Team -''' +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team # The file has been adapted from the following Megatron-LM file: # https://github.com/NVIDIA/Megatron-LM/blob/main/megatron/mpu/mappings.py @@ -32,14 +33,9 @@ def _gather_tokens(input_, dim=0): # Size and dimension. rank = mpu.get_tensor_model_parallel_rank() - tensor_list = [ - torch.empty_like(input_) - for _ in range(mpu.get_tensor_model_parallel_world_size()) - ] + tensor_list = [torch.empty_like(input_) for _ in range(mpu.get_tensor_model_parallel_world_size())] tensor_list[rank] = input_ - deepspeed.comm.all_gather(tensor_list, - input_, - group=mpu.get_tensor_model_parallel_group()) + deepspeed.comm.all_gather(tensor_list, input_, group=mpu.get_tensor_model_parallel_group()) # Note: torch.cat already creates a contiguous tensor. 
output = torch.cat(tensor_list, dim=dim).contiguous() @@ -53,7 +49,8 @@ def _drop_tokens(input_, dim=0): total_chunks = mpu.get_tensor_model_parallel_world_size() this_chunk = mpu.get_tensor_model_parallel_rank() - assert input_.shape[dim] % total_chunks == 0, f"input dimension {dim} ({input_.shape[dim]}) is not divisible by tensor parallel world size ({total_chunks})" + assert input_.shape[ + dim] % total_chunks == 0, f"input dimension {dim} ({input_.shape[dim]}) is not divisible by tensor parallel world size ({total_chunks})" chunk_size = input_.shape[dim] // total_chunks return torch.narrow(input_, dim, this_chunk * chunk_size, chunk_size) @@ -61,6 +58,7 @@ def _drop_tokens(input_, dim=0): class _GatherTokens(torch.autograd.Function): """All gather tokens among the tensor parallel ranks""" + @staticmethod def symbolic(graph, input_, dim): return _gather_tokens(input_, dim) diff --git a/deepspeed/moe/sharded_moe.py b/deepspeed/moe/sharded_moe.py index 211b2127b026db48cc63092f9bdaf72e50681e20..93dff21ea70204d05a5260615ccbfb29db13c0ea 100644 --- a/deepspeed/moe/sharded_moe.py +++ b/deepspeed/moe/sharded_moe.py @@ -1,11 +1,14 @@ -''' -Copyright 2021 The Microsoft DeepSpeed Team -''' -# The file has been adapted from two fairscale files: -# (1) https://github.com/facebookresearch/fairscale/blob/master/fairscale/nn/moe/moe_layer.py -# (2) https://github.com/facebookresearch/fairscale/blob/master/fairscale/nn/moe/top2gate.py -# Git commit hash: 34df606902a240567a0d898037ece55c2f1336cf -# We retain the following license from the original files: +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team +""" +The file has been adapted from two fairscale files: + (1) https://github.com/facebookresearch/fairscale/blob/master/fairscale/nn/moe/moe_layer.py + (2) https://github.com/facebookresearch/fairscale/blob/master/fairscale/nn/moe/top2gate.py + Git commit hash: 34df606902a240567a0d898037ece55c2f1336cf + We retain the following license from the original files: +""" # Copyright (c) Facebook, Inc. and its affiliates. All rights reserved. 
# @@ -60,11 +63,9 @@ def multiplicative_jitter(x, device: torch.device, epsilon=1e-2): return x uniform = uniform_map.get(device) if uniform is None: - uniform = torch.distributions.uniform.Uniform( - low=torch.tensor(1.0 - epsilon, - device=device), - high=torch.tensor(1.0 + epsilon, - device=device)).rsample # type: ignore + uniform = torch.distributions.uniform.Uniform(low=torch.tensor(1.0 - epsilon, device=device), + high=torch.tensor(1.0 + epsilon, + device=device)).rsample # type: ignore uniform_map[device] = uniform return x * uniform(x.shape) @@ -87,6 +88,7 @@ from deepspeed import comm as dist # Based on https://github.com/pytorch/pytorch/pull/40762 class _AllToAll(torch.autograd.Function): + @staticmethod def forward( ctx: Any, @@ -181,25 +183,18 @@ def top1gating(logits: Tensor, noisy_gate_policy: Optional[str] = None, drop_tokens: bool = True, use_rts: bool = True, - use_tutel: bool = False) -> Tuple[Tensor, - Tensor, - Tensor, - Tensor]: + use_tutel: bool = False) -> Tuple[Tensor, Tensor, Tensor, Tensor]: """Implements Top1Gating on logits.""" if noisy_gate_policy == 'RSample': logits_w_noise = logits + gumbel_rsample(logits.shape, device=logits.device) # everything is in fp32 in this function gates = F.softmax(logits, dim=1) - capacity = _capacity(gates, - torch.tensor(capacity_factor), - torch.tensor(min_capacity)) + capacity = _capacity(gates, torch.tensor(capacity_factor), torch.tensor(min_capacity)) # Create a mask for 1st's expert per token # noisy gating - indices1_s = torch.argmax( - logits_w_noise if noisy_gate_policy == 'RSample' else gates, - dim=1) + indices1_s = torch.argmax(logits_w_noise if noisy_gate_policy == 'RSample' else gates, dim=1) num_experts = int(gates.shape[1]) mask1 = F.one_hot(indices1_s, num_classes=num_experts) @@ -225,18 +220,16 @@ def top1gating(logits: Tensor, if use_rts: uniform = exp_selection_uniform_map.get(logits.device) if uniform is None: - uniform = torch.distributions.uniform.Uniform( - low=torch.tensor(0.0, - device=logits.device), - high=torch.tensor(1.0, - device=logits.device)).rsample + uniform = torch.distributions.uniform.Uniform(low=torch.tensor(0.0, device=logits.device), + high=torch.tensor(1.0, device=logits.device)).rsample exp_selection_uniform_map[logits.device] = uniform mask1_rand = mask1 * uniform(mask1.shape) else: mask1_rand = mask1 - assert logits.shape[0] >= min_capacity, "No. of tokens (batch-size) should be greater than min_capacity. Either set min_capacity to 0 or increase your batch size." + assert logits.shape[ + 0] >= min_capacity, "No. of tokens (batch-size) should be greater than min_capacity. Either set min_capacity to 0 or increase your batch size." 
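For context on the capacity value used by the _top_idx call that follows and by the min_capacity assert above: in the usual top-1 gating formulation, capacity is the per-expert token budget, i.e. tokens-per-expert scaled by the capacity factor and floored at min_capacity. The _capacity helper itself is not shown in this diff, so the following is an assumed, illustrative computation rather than a quote of it:

import math

num_tokens, num_experts = 8, 4        # hypothetical: 8 tokens routed across 4 experts
capacity_factor, min_capacity = 1.0, 4

# tokens-per-expert, scaled by the capacity factor, floored at min_capacity
capacity = max(math.ceil(num_tokens / num_experts * capacity_factor), min_capacity)
assert capacity == 4  # each expert accepts at most 4 tokens; extras are dropped when drop_tokens=True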
top_idx = _top_idx(mask1_rand, capacity) @@ -258,7 +251,13 @@ def top1gating(logits: Tensor, if use_tutel: gates1_s = (gates * mask1).sum(dim=1) locations1_s = torch.sum(locations1 * mask1, dim=1) - return l_aux, capacity, num_experts, [indices1_s,], [locations1_s,], [gates1_s,], exp_counts + return l_aux, capacity, num_experts, [ + indices1_s, + ], [ + locations1_s, + ], [ + gates1_s, + ], exp_counts # Store the capacity location for each token locations1_s = torch.sum(locations1 * mask1, dim=1) @@ -275,19 +274,12 @@ def top1gating(logits: Tensor, return l_aux, combine_weights, dispatch_mask, exp_counts -def top2gating(logits: Tensor, - capacity_factor: float, - min_capacity: int) -> Tuple[Tensor, - Tensor, - Tensor, - Tensor]: +def top2gating(logits: Tensor, capacity_factor: float, min_capacity: int) -> Tuple[Tensor, Tensor, Tensor, Tensor]: """Implements Top2Gating on logits.""" # everything is in fp32 in this function gates = F.softmax(logits, dim=1) - capacity = _capacity(gates, - torch.tensor(capacity_factor * 2), - torch.tensor(min_capacity)) + capacity = _capacity(gates, torch.tensor(capacity_factor * 2), torch.tensor(min_capacity)) # Create a mask for 1st's expert per token indices1_s = torch.argmax(gates, dim=1) @@ -393,13 +385,10 @@ class TopKGate(Module): self.drop_tokens = drop_tokens self.use_rts = use_rts - def forward( - self, - input: torch.Tensor, - used_token: torch.Tensor = None, - use_tutel: bool = False) -> Tuple[Tensor, - Tensor, - Tensor]: # type: ignore + def forward(self, + input: torch.Tensor, + used_token: torch.Tensor = None, + use_tutel: bool = False) -> Tuple[Tensor, Tensor, Tensor]: # type: ignore if self.wall_clock_breakdown: self.timers('TopKGate').start() @@ -413,21 +402,13 @@ class TopKGate(Module): logits = self.wg(input_fp32) if self.k == 1: - gate_output = top1gating( - logits, - self.capacity_factor if self.training else self.eval_capacity_factor, - self.min_capacity, - used_token, - self.noisy_gate_policy if self.training else None, - self.drop_tokens, - self.use_rts, - use_tutel) + gate_output = top1gating(logits, self.capacity_factor if self.training else self.eval_capacity_factor, + self.min_capacity, used_token, self.noisy_gate_policy if self.training else None, + self.drop_tokens, self.use_rts, use_tutel) else: - gate_output = top2gating( - logits, - self.capacity_factor if self.training else self.eval_capacity_factor, - self.min_capacity) + gate_output = top2gating(logits, self.capacity_factor if self.training else self.eval_capacity_factor, + self.min_capacity) if self.wall_clock_breakdown: self.timers('TopKGate').stop() @@ -453,6 +434,7 @@ class MOELayer(Base): expert (torch.nn.Module): expert network """ + def __init__(self, gate: Module, experts: Module, @@ -481,9 +463,8 @@ class MOELayer(Base): logger.warning("Tutel optimization requested but not installed. " "Proceeding without Tutel.") elif use_tutel and TUTEL_INSTALLED and gate.k != 1: - logger.warning( - "To enable Tutel optimization, use top-1 instead of top-2 gate. " - "Proceeding without Tutel.") + logger.warning("To enable Tutel optimization, use top-1 instead of top-2 gate. 
" + "Proceeding without Tutel.") def _set_ep_group(self, ep_group): self.ep_group = ep_group @@ -506,18 +487,12 @@ class MOELayer(Base): S, M = reshaped_input.size(0), reshaped_input.size(1) if not hasattr(self, '_tutel_dispatcher'): - self._tutel_dispatcher = tutel_moe.fast_dispatcher( - E, - C, - M, - dispatch_dtype=reshaped_input.dtype) + self._tutel_dispatcher = tutel_moe.fast_dispatcher(E, C, M, dispatch_dtype=reshaped_input.dtype) self._tutel_dispatcher.update(indices_, locations_, gates_, capacity=C) dispatched_input = self._tutel_dispatcher.encode(reshaped_input) else: self.l_aux, combine_weights, dispatch_mask, self.exp_counts = self.gate(reshaped_input, input[1]) - dispatched_input = einsum("sec,sm->ecm", - dispatch_mask.type_as(input[0]), - reshaped_input) + dispatched_input = einsum("sec,sm->ecm", dispatch_mask.type_as(input[0]), reshaped_input) if self.wall_clock_breakdown: self.timers('falltoall').start() @@ -538,10 +513,7 @@ class MOELayer(Base): self.time_falltoall = self.timers('falltoall').elapsed(reset=False) # Re-shape after all-to-all: ecm -> gecm - dispatched_input = dispatched_input.reshape(self.ep_size, - self.num_local_experts, - -1, - d_model) + dispatched_input = dispatched_input.reshape(self.ep_size, self.num_local_experts, -1, d_model) expert_output = self.experts(dispatched_input) @@ -555,9 +527,7 @@ class MOELayer(Base): self.time_salltoall = self.timers('salltoall').elapsed(reset=False) # Re-shape back: gecm -> ecm - expert_output = expert_output.reshape(self.ep_size * self.num_local_experts, - -1, - d_model) + expert_output = expert_output.reshape(self.ep_size * self.num_local_experts, -1, d_model) if groups._get_expert_model_parallel_world_size() == 1: # the dropped duplicate tokens need to be gathered on each @@ -568,9 +538,7 @@ class MOELayer(Base): if self.use_tutel: combined_output = self._tutel_dispatcher.decode(expert_output.view(E * C, M)) else: - combined_output = einsum("sec,ecm->sm", - combine_weights.type_as(input[0]), - expert_output) + combined_output = einsum("sec,ecm->sm", combine_weights.type_as(input[0]), expert_output) a = combined_output.reshape(input[0].shape) diff --git a/deepspeed/moe/utils.py b/deepspeed/moe/utils.py index 1bf52795195b50dfab0e69e9c79f737802c1456c..da31f550aabc42a75b2c3b6edf16336fa3290bb1 100644 --- a/deepspeed/moe/utils.py +++ b/deepspeed/moe/utils.py @@ -1,4 +1,7 @@ -'''Copyright The Microsoft DeepSpeed Team''' +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team from typing import List, Tuple, Dict import torch @@ -24,8 +27,7 @@ def is_moe_param(param: torch.Tensor) -> bool: def split_params_into_shared_and_expert_params( - params: List[torch.nn.Parameter]) -> Tuple[torch.nn.Parameter, - torch.nn.Parameter]: + params: List[torch.nn.Parameter]) -> Tuple[torch.nn.Parameter, torch.nn.Parameter]: shared_params, expert_params = [], [] for p in params: if is_moe_param(p): @@ -36,8 +38,7 @@ def split_params_into_shared_and_expert_params( def split_params_grads_into_shared_and_expert_params( - group: List[torch.nn.Parameter]) -> Tuple[torch.nn.Parameter, - torch.nn.Parameter]: + group: List[torch.nn.Parameter]) -> Tuple[torch.nn.Parameter, torch.nn.Parameter]: """Split grad of parameters into grads of non-expert params and grads of expert params. 
This is useful while computing grad-norms for clipping and overflow detection @@ -62,8 +63,7 @@ def split_params_grads_into_shared_and_expert_params( def split_params_into_different_moe_groups_for_optimizer(param_groups: Tuple[Dict], - max_group_size=178956971 - ) -> Tuple[Dict]: + max_group_size=178956971) -> Tuple[Dict]: """Split parameters into different MoE groups for optimizer Args: @@ -101,8 +101,7 @@ def split_params_into_different_moe_groups_for_optimizer(param_groups: Tuple[Dic if ori_key == 'params': group_moe[param_group['name']][key][ori_key] = [] else: - group_moe[ - param_group['name']][key][ori_key] = param_group[ori_key] + group_moe[param_group['name']][key][ori_key] = param_group[ori_key] # Assign param for param_group in param_groups: new_params = [] diff --git a/deepspeed/monitor/__init__.py b/deepspeed/monitor/__init__.py index fcb45ab2b68516814a4bfbffebf2e01cbfefd527..6c5067f71c8faf166bc78e88f9b62e8627dda7c7 100644 --- a/deepspeed/monitor/__init__.py +++ b/deepspeed/monitor/__init__.py @@ -1 +1,5 @@ +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team '''Copyright The Microsoft DeepSpeed Team''' diff --git a/deepspeed/monitor/config.py b/deepspeed/monitor/config.py index 09ba7ef1af47ac096eac02a9a414032547023e95..0cd02603bd3532dd2f4d5abe879a668eb1d59b2c 100644 --- a/deepspeed/monitor/config.py +++ b/deepspeed/monitor/config.py @@ -1,21 +1,14 @@ -'''Copyright The Microsoft DeepSpeed Team''' -""" -Copyright (c) Microsoft Corporation -Licensed under the MIT license. -""" +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team from pydantic import root_validator from deepspeed.runtime.config_utils import DeepSpeedConfigModel def get_monitor_config(param_dict): - monitor_dict = { - key: param_dict.get(key, - {}) - for key in ("tensorboard", - "wandb", - "csv_monitor") - } + monitor_dict = {key: param_dict.get(key, {}) for key in ("tensorboard", "wandb", "csv_monitor")} return DeepSpeedMonitorConfig(**monitor_dict) @@ -78,10 +71,10 @@ class DeepSpeedMonitorConfig(DeepSpeedConfigModel): csv_monitor: CSVConfig = {} """ Local CSV output of monitoring data. """ + @root_validator def check_enabled(cls, values): values["enabled"] = False - if (values.get("tensorboard").enabled or values.get("wandb").enabled - or values.get("csv_monitor").enabled): + if (values.get("tensorboard").enabled or values.get("wandb").enabled or values.get("csv_monitor").enabled): values["enabled"] = True return values diff --git a/deepspeed/monitor/csv_monitor.py b/deepspeed/monitor/csv_monitor.py index 7de4fbcede58392200a4249f6ec03058bc0ec23d..c7a19b14ad8227fc8187bbe161667285b7d0c717 100644 --- a/deepspeed/monitor/csv_monitor.py +++ b/deepspeed/monitor/csv_monitor.py @@ -1,4 +1,7 @@ -'''Copyright The Microsoft DeepSpeed Team''' +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team from .monitor import Monitor import os @@ -7,6 +10,7 @@ import deepspeed.comm as dist class csvMonitor(Monitor): + def __init__(self, csv_config): super().__init__(csv_config) self.filenames = [] diff --git a/deepspeed/monitor/monitor.py b/deepspeed/monitor/monitor.py index 504c3da391f5ce9ce6334729956592973cf554ec..5a32b8bbcadd753ea592e1da28083e1a4c9c5f25 100644 --- a/deepspeed/monitor/monitor.py +++ b/deepspeed/monitor/monitor.py @@ -1,6 +1,9 @@ -'''Copyright The Microsoft DeepSpeed Team''' +# Copyright (c) Microsoft Corporation. 
+# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team """ - Support different forms of monitoring such as wandb and tensorboard +Support different forms of monitoring such as wandb and tensorboard """ from abc import ABC, abstractmethod @@ -8,6 +11,7 @@ import deepspeed.comm as dist class Monitor(ABC): + @abstractmethod def __init__(self, monitor_config): self.monitor_config = monitor_config @@ -23,6 +27,7 @@ from .csv_monitor import csvMonitor class MonitorMaster(Monitor): + def __init__(self, monitor_config): super().__init__(monitor_config) self.tb_monitor = None diff --git a/deepspeed/monitor/tensorboard.py b/deepspeed/monitor/tensorboard.py index db3a50d75e236ef21db55eded602109e77eeaceb..985c9ed44b6f5be28785699bca4a3638ed6063bd 100644 --- a/deepspeed/monitor/tensorboard.py +++ b/deepspeed/monitor/tensorboard.py @@ -1,4 +1,7 @@ -'''Copyright The Microsoft DeepSpeed Team''' +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team from .utils import check_tb_availability from .monitor import Monitor @@ -8,6 +11,7 @@ import deepspeed.comm as dist class TensorBoardMonitor(Monitor): + def __init__(self, tensorboard_config): super().__init__(tensorboard_config) check_tb_availability() @@ -20,9 +24,7 @@ class TensorBoardMonitor(Monitor): if self.enabled and dist.get_rank() == 0: self.get_summary_writer() - def get_summary_writer(self, - base=os.path.join(os.path.expanduser("~"), - "tensorboard")): + def get_summary_writer(self, base=os.path.join(os.path.expanduser("~"), "tensorboard")): if self.enabled and dist.get_rank() == 0: from torch.utils.tensorboard import SummaryWriter if self.output_path is not None: diff --git a/deepspeed/monitor/utils.py b/deepspeed/monitor/utils.py index aa37e8d27cff5a05898697650d65886cdb30a289..a9bd915f43f361f63a123d01d9c82af1bb71cc37 100644 --- a/deepspeed/monitor/utils.py +++ b/deepspeed/monitor/utils.py @@ -1,4 +1,7 @@ -'''Copyright The Microsoft DeepSpeed Team''' +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team def check_tb_availability(): diff --git a/deepspeed/monitor/wandb.py b/deepspeed/monitor/wandb.py index 49fc0332070fde62e9125137f9d310ecd51bfe44..30209191171afc4c0ad3ff7088639f7d678cd505 100644 --- a/deepspeed/monitor/wandb.py +++ b/deepspeed/monitor/wandb.py @@ -1,4 +1,7 @@ -'''Copyright The Microsoft DeepSpeed Team''' +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team from .utils import check_wandb_availability from .monitor import Monitor @@ -7,6 +10,7 @@ import deepspeed.comm as dist class WandbMonitor(Monitor): + def __init__(self, wandb_config): super().__init__(wandb_config) check_wandb_availability() diff --git a/deepspeed/nebula/__init__.py b/deepspeed/nebula/__init__.py index fcb45ab2b68516814a4bfbffebf2e01cbfefd527..6c5067f71c8faf166bc78e88f9b62e8627dda7c7 100644 --- a/deepspeed/nebula/__init__.py +++ b/deepspeed/nebula/__init__.py @@ -1 +1,5 @@ +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team '''Copyright The Microsoft DeepSpeed Team''' diff --git a/deepspeed/nebula/config.py b/deepspeed/nebula/config.py index 93b879fd77b78411b554a65d597e6c981781435f..dc49185738c92a3173f2fd5c68f1d6ab5a32dd92 100644 --- a/deepspeed/nebula/config.py +++ b/deepspeed/nebula/config.py @@ -1,14 +1,14 @@ -'''Copyright The Microsoft DeepSpeed Team''' -""" -Copyright (c) Microsoft Corporation -Licensed under the MIT license. -""" +# Copyright (c) Microsoft Corporation. 
+# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team from deepspeed.runtime.config_utils import get_scalar_param, DeepSpeedConfigObject from deepspeed.nebula.constants import * class DeepSpeedNebulaConfig(DeepSpeedConfigObject): + def __init__(self, param_dict): super(DeepSpeedNebulaConfig, self).__init__() @@ -26,29 +26,18 @@ class DeepSpeedNebulaConfig(DeepSpeedConfigObject): self._initialize(nebula_dict) def _initialize(self, nebula_dict): - self.enabled = get_scalar_param(nebula_dict, - NEBULA_ENABLED, - NEBULA_ENABLED_DEFAULT) + self.enabled = get_scalar_param(nebula_dict, NEBULA_ENABLED, NEBULA_ENABLED_DEFAULT) - self.load_path = get_scalar_param(nebula_dict, - NEBULA_LOAD_PATH, - NEBULA_LOAD_PATH_DEFAULT) + self.load_path = get_scalar_param(nebula_dict, NEBULA_LOAD_PATH, NEBULA_LOAD_PATH_DEFAULT) - self.enable_nebula_load = get_scalar_param(nebula_dict, - NEBULA_ENABLE_NEBULA_LOAD, + self.enable_nebula_load = get_scalar_param(nebula_dict, NEBULA_ENABLE_NEBULA_LOAD, NEBULA_ENABLE_NEBULA_LOAD_DEFAULT) - self.persistent_storage_path = get_scalar_param( - nebula_dict, - NEBULA_PERSISTENT_STORAGE_PATH, - NEBULA_PERSISTENT_STORAGE_PATH_DEFAULT) + self.persistent_storage_path = get_scalar_param(nebula_dict, NEBULA_PERSISTENT_STORAGE_PATH, + NEBULA_PERSISTENT_STORAGE_PATH_DEFAULT) - self.persistent_time_interval = get_scalar_param( - nebula_dict, - NEBULA_PERSISTENT_TIME_INTERVAL, - NEBULA_PERSISTENT_TIME_INTERVAL_DEFAULT) + self.persistent_time_interval = get_scalar_param(nebula_dict, NEBULA_PERSISTENT_TIME_INTERVAL, + NEBULA_PERSISTENT_TIME_INTERVAL_DEFAULT) - self.num_of_version_in_retention = get_scalar_param( - nebula_dict, - NEBULA_NUM_OF_VERSION_IN_RETENTION, - NEBULA_NUM_OF_VERSION_IN_RETENTION_DEFAULT) + self.num_of_version_in_retention = get_scalar_param(nebula_dict, NEBULA_NUM_OF_VERSION_IN_RETENTION, + NEBULA_NUM_OF_VERSION_IN_RETENTION_DEFAULT) diff --git a/deepspeed/nebula/constants.py b/deepspeed/nebula/constants.py index 6ad876a8d1a3c996cf8ee5da5230425aed2b09bb..dcc23681bbab6345f08c45f0c6830c34c7ac0535 100644 --- a/deepspeed/nebula/constants.py +++ b/deepspeed/nebula/constants.py @@ -1,8 +1,7 @@ -'''Copyright The Microsoft DeepSpeed Team''' -""" -Copyright (c) Microsoft Corporation -Licensed under the MIT license. -""" +# Copyright (c) Microsoft Corporation. 
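The refactored DeepSpeedNebulaConfig above populates enabled, load_path, enable_nebula_load, persistent_storage_path, persistent_time_interval and num_of_version_in_retention from a "nebula" section of the DeepSpeed config. A rough sketch of such a section is shown below as a Python dict; the exact JSON key strings are defined in the constants module, and here they are assumed to mirror the attribute names set in _initialize.

# Hypothetical "nebula" section; key names are assumed to match the
# DeepSpeedNebulaConfig attributes populated in _initialize above.
nebula_section = {
    "enabled": True,
    "load_path": None,
    "enable_nebula_load": True,
    "persistent_storage_path": "/mnt/nebula_checkpoints",
    "persistent_time_interval": 100,
    "num_of_version_in_retention": 2,
}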
+# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team ######################################### # nebula @@ -63,24 +62,11 @@ NEBULA_NUM_OF_VERSION_IN_RETENTION_DEFAULT = 2 # Neubla envs NEBULA_EXPORT_ENVS = [ - 'DLTS_JOB_ID', - 'DLTS_NUM_WORKER', - 'NEBULA_PERSISTENT_STORAGE_PATH', - 'NEBULA_PERSISTENT_TIME_INTERVAL', - 'AML_RUN_ID', - 'AZUREML_RUN_TOKEN', - 'AZUREML_WORKSPACE_SCOPE', - 'AZUREML_EXPERIMENT_SCOPE', - 'AZUREML_RUN_HISTORY_SERVICE_ENDPOINT', - 'AZUREML_RUN_ID', - 'NEBULA_MEMORY_BUFFER_SIZE', - 'AZUREML_PARAMETER_ITPJOB_NAME', - 'FC_TASKROLE_NAME', - 'FC_TASK_INDEX', - 'MASTER_HOST', - 'LOCAL_HOST', - 'AZUREML_BLOB_ACCOUNT_NAME', - 'AZUREML_BLOB_ACCOUNT_KEY' + 'DLTS_JOB_ID', 'DLTS_NUM_WORKER', 'NEBULA_PERSISTENT_STORAGE_PATH', 'NEBULA_PERSISTENT_TIME_INTERVAL', + 'AML_RUN_ID', 'AZUREML_RUN_TOKEN', 'AZUREML_WORKSPACE_SCOPE', 'AZUREML_EXPERIMENT_SCOPE', + 'AZUREML_RUN_HISTORY_SERVICE_ENDPOINT', 'AZUREML_RUN_ID', 'NEBULA_MEMORY_BUFFER_SIZE', + 'AZUREML_PARAMETER_ITPJOB_NAME', 'FC_TASKROLE_NAME', 'FC_TASK_INDEX', 'MASTER_HOST', 'LOCAL_HOST', + 'AZUREML_BLOB_ACCOUNT_NAME', 'AZUREML_BLOB_ACCOUNT_KEY' ] # ITP env files diff --git a/deepspeed/ops/__init__.py b/deepspeed/ops/__init__.py index efec4e62c3c973457d5608f4ba87711b53bbca5b..b5a03c458a4630c0b4cc406b60d1cc25d63efffd 100755 --- a/deepspeed/ops/__init__.py +++ b/deepspeed/ops/__init__.py @@ -1,4 +1,7 @@ -'''Copyright The Microsoft DeepSpeed Team''' +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team from . import adam from . import adagrad diff --git a/deepspeed/ops/adagrad/__init__.py b/deepspeed/ops/adagrad/__init__.py index a5ab6de0086c34f369651352592b8810435bf253..5bf2e4721cd61ec3f548d652698f62b7a50772d5 100644 --- a/deepspeed/ops/adagrad/__init__.py +++ b/deepspeed/ops/adagrad/__init__.py @@ -1,3 +1,6 @@ -'''Copyright The Microsoft DeepSpeed Team''' +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team from .cpu_adagrad import DeepSpeedCPUAdagrad diff --git a/deepspeed/ops/adagrad/cpu_adagrad.py b/deepspeed/ops/adagrad/cpu_adagrad.py index 07cdaa48c11fff906383241304e0d2152f2e35fd..c356a52777f25a9d0fd4b4a1dccd1d238497770b 100755 --- a/deepspeed/ops/adagrad/cpu_adagrad.py +++ b/deepspeed/ops/adagrad/cpu_adagrad.py @@ -1,6 +1,7 @@ -''' -Copyright 2020 The Microsoft DeepSpeed Team -''' +# Copyright (c) Microsoft Corporation. 
+# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team import torch from deepspeed.ops.op_builder import CPUAdagradBuilder @@ -10,13 +11,7 @@ from deepspeed.utils.logging import should_log_le class DeepSpeedCPUAdagrad(torch.optim.Optimizer): optimizer_id = 0 - def __init__(self, - model_params, - lr=1e-2, - eps=1e-10, - weight_decay=0, - amsgrad=False, - fp32_optimizer_states=True): + def __init__(self, model_params, lr=1e-2, eps=1e-10, weight_decay=0, amsgrad=False, fp32_optimizer_states=True): default_args = dict(lr=lr, eps=eps, weight_decay=weight_decay, amsgrad=amsgrad) super(DeepSpeedCPUAdagrad, self).__init__(model_params, default_args) @@ -26,11 +21,7 @@ class DeepSpeedCPUAdagrad(torch.optim.Optimizer): self.fp32_optimizer_states = fp32_optimizer_states self.ds_opt_adagrad = CPUAdagradBuilder().load() - self.ds_opt_adagrad.create_adagrad(self.opt_id, - lr, - eps, - weight_decay, - should_log_le("info")) + self.ds_opt_adagrad.create_adagrad(self.opt_id, lr, eps, weight_decay, should_log_le("info")) def __del__(self): # need to destroy the C++ object explicitly to avoid a memory leak when deepspeed.initialize @@ -90,9 +81,7 @@ class DeepSpeedCPUAdagrad(torch.optim.Optimizer): #memory_format=torch.preserve_format) # gradient variances - state['exp_avg_sq'] = torch.zeros_like(p.data, - dtype=state_dtype, - device='cpu') + state['exp_avg_sq'] = torch.zeros_like(p.data, dtype=state_dtype, device='cpu') #memory_format=torch.preserve_format) state['step'] += 1 @@ -100,39 +89,21 @@ class DeepSpeedCPUAdagrad(torch.optim.Optimizer): if p.grad.is_sparse == True: sparse_param = p.sparse_mask(p.grad) sparse_exp_avg_sq = state['exp_avg_sq'].sparse_mask(p.grad) - self.ds_opt_adagrad.adagrad_update(self.opt_id, - state['step'], - group['lr'], - group['eps'], - group['weight_decay'], - sparse_param.values(), - p.grad.values(), + self.ds_opt_adagrad.adagrad_update(self.opt_id, state['step'], group['lr'], group['eps'], + group['weight_decay'], sparse_param.values(), p.grad.values(), sparse_exp_avg_sq.values()) p[sparse_param.indices()] = sparse_param.values() - state['exp_avg_sq'][ - sparse_exp_avg_sq.indices()] = sparse_exp_avg_sq.values() + state['exp_avg_sq'][sparse_exp_avg_sq.indices()] = sparse_exp_avg_sq.values() if fp16_param_groups is not None: - fp16_param_groups[group_id][param_id][ - sparse_param.indices()] = sparse_param.values() + fp16_param_groups[group_id][param_id][sparse_param.indices()] = sparse_param.values() else: if fp16_param_groups is not None: - self.ds_opt_adagrad.adagrad_update_copy( - self.opt_id, - state['step'], - group['lr'], - group['eps'], - group['weight_decay'], - p.data, - p.grad.data, - state['exp_avg_sq'], - fp16_param_groups[group_id][param_id].data) + self.ds_opt_adagrad.adagrad_update_copy(self.opt_id, state['step'], group['lr'], group['eps'], + group['weight_decay'], p.data, p.grad.data, + state['exp_avg_sq'], + fp16_param_groups[group_id][param_id].data) else: - self.ds_opt_adagrad.adagrad_update(self.opt_id, - state['step'], - group['lr'], - group['eps'], - group['weight_decay'], - p.data, - p.grad.data, + self.ds_opt_adagrad.adagrad_update(self.opt_id, state['step'], group['lr'], group['eps'], + group['weight_decay'], p.data, p.grad.data, state['exp_avg_sq']) return loss diff --git a/deepspeed/ops/adam/__init__.py b/deepspeed/ops/adam/__init__.py index 111d3175f89e45656d8a543708ff5f5c276a8f97..a29bb9447d015d0992c18624dba41a15aa838866 100755 --- a/deepspeed/ops/adam/__init__.py +++ b/deepspeed/ops/adam/__init__.py @@ -1,4 +1,7 @@ -'''Copyright The 
Microsoft DeepSpeed Team''' +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team from .cpu_adam import DeepSpeedCPUAdam from .fused_adam import FusedAdam diff --git a/deepspeed/ops/adam/cpu_adam.py b/deepspeed/ops/adam/cpu_adam.py index 04c5ac34cbe2a1682fa04d7de392a8e6b14b09e7..9fdf7311a764367eb7a359053b925443d894a89e 100755 --- a/deepspeed/ops/adam/cpu_adam.py +++ b/deepspeed/ops/adam/cpu_adam.py @@ -1,6 +1,7 @@ -''' -Copyright 2020 The Microsoft DeepSpeed Team -''' +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team import torch from cpuinfo import get_cpu_info @@ -16,8 +17,7 @@ class DeepSpeedCPUAdam(torch.optim.Optimizer): model_params, lr=1e-3, bias_correction=True, - betas=(0.9, - 0.999), + betas=(0.9, 0.999), eps=1e-8, weight_decay=0, amsgrad=False, @@ -76,14 +76,12 @@ class DeepSpeedCPUAdam(torch.optim.Optimizer): super(DeepSpeedCPUAdam, self).__init__(model_params, default_args) cpu_info = get_cpu_info() - self.cpu_vendor = cpu_info["vendor_id_raw"].lower( - ) if "vendor_id_raw" in cpu_info else "unknown" + self.cpu_vendor = cpu_info["vendor_id_raw"].lower() if "vendor_id_raw" in cpu_info else "unknown" if "amd" in self.cpu_vendor: for group_id, group in enumerate(self.param_groups): for param_id, p in enumerate(group['params']): if p.dtype == torch.half: - logger.warning( - "FP16 params for CPUAdam may not work on AMD CPUs") + logger.warning("FP16 params for CPUAdam may not work on AMD CPUs") break else: continue @@ -95,13 +93,7 @@ class DeepSpeedCPUAdam(torch.optim.Optimizer): self.fp32_optimizer_states = fp32_optimizer_states self.ds_opt_adam = CPUAdamBuilder().load() - self.ds_opt_adam.create_adam(self.opt_id, - lr, - betas[0], - betas[1], - eps, - weight_decay, - adamw_mode, + self.ds_opt_adam.create_adam(self.opt_id, lr, betas[0], betas[1], eps, weight_decay, adamw_mode, should_log_le("info")) def __del__(self): @@ -168,45 +160,22 @@ class DeepSpeedCPUAdam(torch.optim.Optimizer): state_dtype = torch.float if self.fp32_optimizer_states else p.dtype # gradient momentums - state['exp_avg'] = torch.zeros_like(p.data, - dtype=state_dtype, - device=device) + state['exp_avg'] = torch.zeros_like(p.data, dtype=state_dtype, device=device) #memory_format=torch.preserve_format) # gradient variances - state['exp_avg_sq'] = torch.zeros_like(p.data, - dtype=state_dtype, - device=device) + state['exp_avg_sq'] = torch.zeros_like(p.data, dtype=state_dtype, device=device) #memory_format=torch.preserve_format) state['step'] += 1 beta1, beta2 = group['betas'] if fp16_param_groups is not None: - self.ds_opt_adam.adam_update_copy( - self.opt_id, - state['step'], - group['lr'], - beta1, - beta2, - group['eps'], - group['weight_decay'], - group['bias_correction'], - p.data, - p.grad.data, - state['exp_avg'], - state['exp_avg_sq'], - fp16_param_groups[group_id][param_id].data) + self.ds_opt_adam.adam_update_copy(self.opt_id, state['step'], group['lr'], beta1, beta2, + group['eps'], group['weight_decay'], group['bias_correction'], + p.data, p.grad.data, state['exp_avg'], state['exp_avg_sq'], + fp16_param_groups[group_id][param_id].data) else: - self.ds_opt_adam.adam_update(self.opt_id, - state['step'], - group['lr'], - beta1, - beta2, - group['eps'], - group['weight_decay'], - group['bias_correction'], - p.data, - p.grad.data, - state['exp_avg'], - state['exp_avg_sq']) + self.ds_opt_adam.adam_update(self.opt_id, state['step'], group['lr'], beta1, beta2, group['eps'], + group['weight_decay'], 
group['bias_correction'], p.data, p.grad.data, + state['exp_avg'], state['exp_avg_sq']) return loss diff --git a/deepspeed/ops/adam/fused_adam.py b/deepspeed/ops/adam/fused_adam.py index 169fde67e52e7aece678b34177bd40d296d0fc16..ae7a6f0a87cef226da2d463c5ccea61964c8a944 100644 --- a/deepspeed/ops/adam/fused_adam.py +++ b/deepspeed/ops/adam/fused_adam.py @@ -1,9 +1,11 @@ -''' -Copyright 2020 The Microsoft DeepSpeed Team +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 +# DeepSpeed Team +""" Copyright NVIDIA/apex This file is adapted from fused adam in NVIDIA/apex, commit a109f85 -''' +""" import torch from .multi_tensor_apply import MultiTensorApply @@ -47,12 +49,12 @@ class FusedAdam(torch.optim.Optimizer): .. _On the Convergence of Adam and Beyond: https://openreview.net/forum?id=ryQu7f-RZ """ + def __init__(self, params, lr=1e-3, bias_correction=True, - betas=(0.9, - 0.999), + betas=(0.9, 0.999), eps=1e-8, adam_w_mode=True, weight_decay=0., @@ -61,11 +63,7 @@ class FusedAdam(torch.optim.Optimizer): if amsgrad: raise RuntimeError('FusedAdam does not support the AMSGrad variant.') - defaults = dict(lr=lr, - bias_correction=bias_correction, - betas=betas, - eps=eps, - weight_decay=weight_decay) + defaults = dict(lr=lr, bias_correction=bias_correction, betas=betas, eps=eps, weight_decay=weight_decay) super(FusedAdam, self).__init__(params, defaults) self.adam_w_mode = 1 if adam_w_mode else 0 self.set_grad_none = set_grad_none @@ -83,12 +81,7 @@ class FusedAdam(torch.optim.Optimizer): else: super(FusedAdam, self).zero_grad() - def step(self, - closure=None, - grads=None, - output_params=None, - scale=None, - grad_norms=None): + def step(self, closure=None, grads=None, output_params=None, scale=None, grad_norms=None): """Performs a single optimization step. 
Arguments: @@ -121,8 +114,7 @@ class FusedAdam(torch.optim.Optimizer): continue if p.grad.data.is_sparse: raise RuntimeError( - 'FusedAdam does not support sparse gradients, please consider SparseAdam instead' - ) + 'FusedAdam does not support sparse gradients, please consider SparseAdam instead') state = self.state[p] # State initialization @@ -151,35 +143,13 @@ class FusedAdam(torch.optim.Optimizer): if (len(g_16) > 0): state['step'] += 1 - multi_tensor_applier(self.multi_tensor_adam, - self._dummy_overflow_buf, - [g_16, - p_16, - m_16, - v_16], - group['lr'], - beta1, - beta2, - group['eps'], - state['step'], - self.adam_w_mode, - bias_correction, - group['weight_decay']) + multi_tensor_applier(self.multi_tensor_adam, self._dummy_overflow_buf, [g_16, p_16, m_16, v_16], + group['lr'], beta1, beta2, group['eps'], state['step'], self.adam_w_mode, + bias_correction, group['weight_decay']) if (len(g_32) > 0): state['step'] += 1 - multi_tensor_applier(self.multi_tensor_adam, - self._dummy_overflow_buf, - [g_32, - p_32, - m_32, - v_32], - group['lr'], - beta1, - beta2, - group['eps'], - state['step'], - self.adam_w_mode, - bias_correction, - group['weight_decay']) + multi_tensor_applier(self.multi_tensor_adam, self._dummy_overflow_buf, [g_32, p_32, m_32, v_32], + group['lr'], beta1, beta2, group['eps'], state['step'], self.adam_w_mode, + bias_correction, group['weight_decay']) return loss diff --git a/deepspeed/ops/adam/multi_tensor_apply.py b/deepspeed/ops/adam/multi_tensor_apply.py index e837309be629d37b0155306d8e1328cbbf8ada5e..0ba228505cef747eea4fec62f3e68707fa4daa0c 100644 --- a/deepspeed/ops/adam/multi_tensor_apply.py +++ b/deepspeed/ops/adam/multi_tensor_apply.py @@ -1,12 +1,15 @@ -''' -Copyright 2020 The Microsoft DeepSpeed Team +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 +# DeepSpeed Team +""" Copyright NVIDIA/apex This file is adapted from NVIDIA/apex, commit a109f85 -''' +""" class MultiTensorApply(object): + def __init__(self, chunk_size): self.chunk_size = chunk_size diff --git a/deepspeed/ops/aio/__init__.py b/deepspeed/ops/aio/__init__.py index d25f815739aaaa7ec03a2509fe9cd4b4eb330849..6aea264237e366b9978c8bd096a2fbc28945705e 100755 --- a/deepspeed/ops/aio/__init__.py +++ b/deepspeed/ops/aio/__init__.py @@ -1,6 +1,6 @@ -''' -Copyright 2020 The Microsoft DeepSpeed Team. -Licensed under the MIT license. -''' +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team from ..op_builder import AsyncIOBuilder diff --git a/deepspeed/ops/lamb/__init__.py b/deepspeed/ops/lamb/__init__.py index 942d76687a601f36091004dca089ded7cd7ee6a0..81bc07e827c88e3f7c134eab78c761813fdf7826 100644 --- a/deepspeed/ops/lamb/__init__.py +++ b/deepspeed/ops/lamb/__init__.py @@ -1,3 +1,6 @@ -'''Copyright The Microsoft DeepSpeed Team''' +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team from .fused_lamb import FusedLamb diff --git a/deepspeed/ops/lamb/fused_lamb.py b/deepspeed/ops/lamb/fused_lamb.py index 33a1461e9883345e6afd17df7dddc6eed56c2be3..6ccd9d4c6b066601bb2f6e5d8d8def6be09fd22b 100644 --- a/deepspeed/ops/lamb/fused_lamb.py +++ b/deepspeed/ops/lamb/fused_lamb.py @@ -1,9 +1,11 @@ -''' -Copyright 2019 The Microsoft DeepSpeed Team +# Copyright (c) Microsoft Corporation. 
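The optimizers touched above keep the standard torch.optim.Optimizer interface, so the reformatting does not change how they are called. A minimal usage sketch for DeepSpeedCPUAdam follows; it assumes a DeepSpeed installation where the cpu_adam extension can be JIT-built, and FusedAdam and FusedLamb are driven the same way but additionally need their CUDA extensions.

import torch
from deepspeed.ops.adam import DeepSpeedCPUAdam

# Small CPU model; DeepSpeedCPUAdam keeps its optimizer states on the host.
model = torch.nn.Linear(16, 4)
optimizer = DeepSpeedCPUAdam(model.parameters(), lr=1e-3, betas=(0.9, 0.999), weight_decay=0.01)

inputs = torch.randn(8, 16)
loss = model(inputs).pow(2).mean()
loss.backward()
optimizer.step()       # invokes the C++ adam_update path shown above
optimizer.zero_grad()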
+# SPDX-License-Identifier: Apache-2.0 +# DeepSpeed Team +""" Copyright NVIDIA/apex This file is adapted from NVIDIA/apex/optimizer/fused_adam and implements the LAMB optimizer -''' +""" import types import torch from deepspeed.ops.op_builder import FusedLambBuilder @@ -35,12 +37,12 @@ class FusedLamb(torch.optim.Optimizer): min_coeff(float, optional): minimum value of the lamb coefficient (default: 0.01) amsgrad (boolean, optional): NOT SUPPORTED in FusedLamb! """ + def __init__(self, params, lr=1e-3, bias_correction=True, - betas=(0.9, - 0.999), + betas=(0.9, 0.999), eps=1e-8, eps_inside_sqrt=False, weight_decay=0., @@ -64,12 +66,7 @@ class FusedLamb(torch.optim.Optimizer): self.eps_mode = 0 if eps_inside_sqrt else 1 self.lamb_coeffs = [] - def step(self, - closure=None, - grads=None, - output_params=None, - scale=1., - grad_norms=None): + def step(self, closure=None, grads=None, output_params=None, scale=1., grad_norms=None): """Performs a single optimization step. Arguments: @@ -114,7 +111,8 @@ class FusedLamb(torch.optim.Optimizer): #remove the previous coeffs del self.lamb_coeffs[:] - for group, grads_this_group, output_params_this_group, grad_norm_group in zip(self.param_groups, grads_group, output_params_group, grad_norms): + for group, grads_this_group, output_params_this_group, grad_norm_group in zip( + self.param_groups, grads_group, output_params_group, grad_norms): if grads_this_group is None: grads_this_group = [None] * len(group['params']) if output_params_this_group is None: @@ -127,7 +125,8 @@ class FusedLamb(torch.optim.Optimizer): bias_correction = 1 if group['bias_correction'] else 0 - for p, grad, output_param, grad_norm in zip(group['params'], grads_this_group, output_params_this_group, grad_norm_group): + for p, grad, output_param, grad_norm in zip(group['params'], grads_this_group, output_params_this_group, + grad_norm_group): # compute combined scale factor for this group combined_scale = scale @@ -162,24 +161,10 @@ class FusedLamb(torch.optim.Optimizer): state['step'] += 1 - out_p = torch.tensor( - [], - dtype=torch.float) if output_param is None else output_param - lamb_coeff = self.fused_lamb_cuda.lamb(p.data, - out_p, - exp_avg, - exp_avg_sq, - grad, - group['lr'], - beta1, - beta2, - max_coeff, - min_coeff, - group['eps'], - combined_scale, - state['step'], - self.eps_mode, - bias_correction, + out_p = torch.tensor([], dtype=torch.float) if output_param is None else output_param + lamb_coeff = self.fused_lamb_cuda.lamb(p.data, out_p, exp_avg, exp_avg_sq, grad, group['lr'], beta1, + beta2, max_coeff, min_coeff, group['eps'], combined_scale, + state['step'], self.eps_mode, bias_correction, group['weight_decay']) self.lamb_coeffs.append(lamb_coeff) return loss diff --git a/deepspeed/ops/module_inject.py b/deepspeed/ops/module_inject.py deleted file mode 100755 index 6b0d47cb67334997902202cbc14ae55fd32a31da..0000000000000000000000000000000000000000 --- a/deepspeed/ops/module_inject.py +++ /dev/null @@ -1,216 +0,0 @@ -import copy -import torch -import deepspeed - -from deepspeed.ops import DeepSpeedTransformerConfig - - -def _copy_child_transformer_state(new_module, orig_child, pre_layer_norm): - # copy relevant state from original child -> new module - qw = orig_child.attention.self.query.weight - qb = orig_child.attention.self.query.bias - kw = orig_child.attention.self.key.weight - kb = orig_child.attention.self.key.bias - vw = orig_child.attention.self.value.weight - vb = orig_child.attention.self.value.bias - - qkvw = torch.cat((qw, kw, vw), 0) - qkvb = 
torch.cat((qb, kb, vb), 0) - - #qw.data,kw.data,vw.data = torch.chunk(qkvw, 3, axis=0) - #qb.data,kb.data,vb.data = torch.chunk(qkvb, 3, axis=0) - - new_module.attn_qkvw.data = qkvw - new_module.attn_qkvb.data = qkvb - new_module.attn_ow.data = orig_child.attention.output.dense.weight - new_module.attn_ob.data = orig_child.attention.output.dense.bias - if pre_layer_norm: - attention_layernorm = orig_child.PostAttentionLayerNorm - else: - attention_layernorm = orig_child.attention.output.LayerNorm - new_module.attn_nw.data = attention_layernorm.weight - new_module.attn_nb.data = attention_layernorm.bias - if pre_layer_norm: - intermediate_ff = orig_child.intermediate.dense_act - else: - intermediate_ff = orig_child.intermediate.dense - new_module.inter_w.data = intermediate_ff.weight - new_module.inter_b.data = intermediate_ff.bias - new_module.output_w.data = orig_child.output.dense.weight - new_module.output_b.data = orig_child.output.dense.bias - if pre_layer_norm: - transformer_layernorm = orig_child.PreAttentionLayerNorm - else: - transformer_layernorm = orig_child.output.LayerNorm - new_module.norm_w.data = transformer_layernorm.weight - new_module.norm_b.data = transformer_layernorm.bias - - -def _replace_transformer_layer(orig_layer_impl, model, transformer_config): - """ Replace bert-style transformer layers with DeepSpeed's transformer layer - Arguments: - orig_layer_impl (torch.nn.Module): the original transformer layer implementation to look for, - e.g., transformers.modeling_bert.BertLayer. - model (torch.nn.Module): user's nn.module representing their model - transformer_config (dict): deepspeed transformer layer config containing hidden size, attention heads, etc. - Returns: - Updated nn.module with replaced transformer layers - """ - def replace_fn(child): - new_module = deepspeed.DeepSpeedTransformerLayer(transformer_config) - _copy_child_transformer_state(new_module, - child, - transformer_config.pre_layer_norm) - - return new_module - - return _replace_module(model=model, - orig_class=orig_layer_impl, - replace_fn=replace_fn) - - -def replace_module(orig_module_impl, model, replacement_module_config): - """ Replace client module - Arguments: - orig_module_impl (torch.nn.Module): original module implementation to replace, - e.g., transformers.modeling_bert.BertLayer. - model (torch.nn.Module): user's nn.module representing their model - replacement_module_config (dict): deepspeed replacement module config (e.g., DeepSpeedTransformerConfig) . - - Returns: - Updated nn.module with replaced modules - """ - assert isinstance(replacement_module_config, DeepSpeedTransformerConfig), \ - 'Only DeepSpeedTransformerConfig is currently supported as replacement config' - - return _replace_transformer_layer(orig_layer_impl=orig_module_impl, - model=model, - transformer_config=replacement_module_config) - - -def _revert_transformer_layer(orig_layer_impl, model, bert_config, transformer_config): - """ Revert DeepSpeed's transformer layer back to original bert-style transformer layer - Arguments: - orig_layer_impl (torch.nn.Module): the original transformer layer implementation that was replaced, - e.g., transformers.modeling_bert.BertLayer. - model (torch.nn.Module): user's nn.module representing their model - bert_config (dict): model config containing hidden size, attention heads, etc. 
- transformer_config (dict): deepspeed tranformer config used for replacement - - Returns: - Updated nn.module with original bert-style transformer layers - """ - def replace_fn(child): - #from turing.nvidia_modelingpreln import BertLayer - orig_module = orig_layer_impl(bert_config) - - # copy relevant state from child -> original module - qkvw = child.attn_qkvw.data - qkvb = child.attn_qkvb.data - - qw, kw, vw = torch.chunk(qkvw, 3, axis=0) - qb, kb, vb = torch.chunk(qkvb, 3, axis=0) - - orig_module.attention.self.query.weight.data = qw - orig_module.attention.self.query.bias.data = qb - orig_module.attention.self.key.weight.data = kw - orig_module.attention.self.key.bias.data = kb - orig_module.attention.self.value.weight.data = vw - orig_module.attention.self.value.bias.data = vb - - orig_module.attention.output.dense.weight.data = child.attn_ow.data - orig_module.attention.output.dense.bias.data = child.attn_ob.data - - attn_ln_w = child.attn_nw.data - attn_ln_b = child.attn_nb.data - if transformer_config.pre_layer_norm: - orig_module.PostAttentionLayerNorm.weight.data = attn_ln_w - orig_module.PostAttentionLayerNorm.bias.data = attn_ln_b - else: - orig_module.attention.output.LayerNorm.weight.data = attn_ln_w - orig_module.attention.output.LayerNorm.bias.data = attn_ln_b - - inter_ff_w = child.inter_w.data - inter_ff_b = child.inter_b.data - if transformer_config.pre_layer_norm: - orig_module.intermediate.dense_act.weight.data = inter_ff_w - orig_module.intermediate.dense_act.bias.data = inter_ff_b - else: - orig_module.intermediate.dense.weight.data = inter_ff_w - orig_module.intermediate.dense.bias.data = inter_ff_b - - orig_module.output.dense.weight.data = child.output_w.data - orig_module.output.dense.bias.data = child.output_b.data - - transformer_ln_w = child.norm_w.data - transformer_ln_b = child.norm_b.data - if transformer_config.pre_layer_norm: - orig_module.PreAttentionLayerNorm.weight.data = transformer_ln_w - orig_module.PreAttentionLayerNorm.bias.data = transformer_ln_b - else: - orig_module.output.LayerNorm.weight.data = transformer_ln_w - orig_module.output.LayerNorm.bias.data = transformer_ln_b - return orig_module - - return _replace_module(model=model, - orig_class=deepspeed.DeepSpeedTransformerLayer, - replace_fn=replace_fn) - - -def revert_module(orig_module_impl, - model, - orig_module_config, - replacement_module_config): - """ Revert DeepSpeed's module back to original client module - Arguments: - orig_module_impl (torch.nn.Module): the original module that was replaced, - e.g., transformers.modeling_bert.BertLayer. - model (torch.nn.Module): user's nn.module representing their model - orig_module_config (dict): original module configuration - replacement_module_config (dict): replacement deepspeed module configuration - - Returns: - Updated nn.module with original bert-style transformer layers - """ - assert isinstance(replacement_module_config, DeepSpeedTransformerConfig), \ - 'Only DeepSpeedTransformerConfig is currently supported as replacement config' - - return _revert_transformer_layer(orig_layer_impl=orig_module_impl, - model=model, - bert_config=orig_module_config, - transformer_config=replacement_module_config) - - -def _replace_module(model, orig_class, replace_fn): - """ Scan the model for instances of ``orig_clas:`` to replace using ``replace_fn``. 
- Arguments: - model (torch.nn.Module): the model to augment - orig_class (torch.nn.Module): the module to search for - replace_fn (method): a method to convert instances of ``orig_class`` to the - desired type and return a new instance. - - Returns: - A modified ``model``. - """ - policy = {orig_class: replace_fn} - return _replace_module_using_policies(model, policy) - - -def _replace_module_using_policies(model, policies): - """ Traverse model's children recursively and apply any transformations in ``policies``. - Arguments: - model (torch.nn.Module): model to augment - policies (dict): Mapping of source class to replacement function. - - Returns: - Modified ``model``. - """ - for name, child in model.named_children(): - if child.__class__ in policies: - orig = repr(child) - setattr(model, name, policies[child.__class__](child)) - new = getattr(model, name) - else: - _replace_module_using_policies(child, policies) - - return model diff --git a/deepspeed/ops/quantizer/__init__.py b/deepspeed/ops/quantizer/__init__.py index 922256532d43e9264dfe2fb0edc61ec7f380dae1..0a5d1da5e3ae0fa097e7313ddb1328c4f910801d 100644 --- a/deepspeed/ops/quantizer/__init__.py +++ b/deepspeed/ops/quantizer/__init__.py @@ -1,3 +1,6 @@ -'''Copyright The Microsoft DeepSpeed Team''' +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team from .quantizer import ds_quantizer diff --git a/deepspeed/ops/quantizer/quantizer.py b/deepspeed/ops/quantizer/quantizer.py index 31cfa96d60c323aedc760710cd6298f480691998..eb4bfd35700075f3b32db329c5f7026b39bef520 100755 --- a/deepspeed/ops/quantizer/quantizer.py +++ b/deepspeed/ops/quantizer/quantizer.py @@ -1,6 +1,8 @@ -''' -Copyright 2020 The Microsoft DeepSpeed Team -''' +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + import torch from deepspeed.ops.op_builder import QuantizerBuilder diff --git a/deepspeed/ops/random_ltd/__init__.py b/deepspeed/ops/random_ltd/__init__.py index 34b0dd30e294302ba73c2a2dc91e865644e2d056..1ade5811ce76d5b7bf51881db5ff0824c1fd379d 100644 --- a/deepspeed/ops/random_ltd/__init__.py +++ b/deepspeed/ops/random_ltd/__init__.py @@ -1,3 +1,6 @@ -'''Copyright The Microsoft DeepSpeed Team''' +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team from .dropping_utils import gpt_sample_tokens, bert_sample_tokens, GatherTokens, ScatterTokens diff --git a/deepspeed/ops/random_ltd/dropping_utils.py b/deepspeed/ops/random_ltd/dropping_utils.py index 102ffe13d9d187a3a1bcbfb7db2d24ebaaabbab3..bc491716b7a8688143e51a1858fd1dbd5adfe538 100644 --- a/deepspeed/ops/random_ltd/dropping_utils.py +++ b/deepspeed/ops/random_ltd/dropping_utils.py @@ -1,6 +1,8 @@ -""" -Copyright 2022 The Microsoft DeepSpeed Team -""" +# Copyright (c) Microsoft Corporation. 
+# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + import torch from deepspeed.ops.op_builder import RandomLTDBuilder @@ -23,9 +25,7 @@ def gpt_sample_tokens(reserved_length: int, prob_dist = torch.ones((layers * batch_size, seq_length), device=device) sampled_indices = torch.multinomial(prob_dist, reserved_length) - sampled_indices = sampled_indices.reshape(layers, - batch_size, - reserved_length).to(torch.int32) + sampled_indices = sampled_indices.reshape(layers, batch_size, reserved_length).to(torch.int32) global random_ltd_module if random_ltd_module is None: random_ltd_module = RandomLTDBuilder().load() @@ -59,9 +59,7 @@ def bert_sample_tokens(reserved_length: int, prob_dist = torch.ones((layers * batch_size, seq_length), device=device) sampled_indices = torch.multinomial(prob_dist, reserved_length) - sampled_indices = sampled_indices.reshape(layers, - batch_size, - reserved_length).to(torch.int32) + sampled_indices = sampled_indices.reshape(layers, batch_size, reserved_length).to(torch.int32) global random_ltd_module if random_ltd_module is None: random_ltd_module = RandomLTDBuilder().load() @@ -82,11 +80,9 @@ def bert_sample_tokens(reserved_length: int, class GatherTokens(torch.autograd.Function): + @staticmethod - def forward(ctx, - activations: torch.Tensor, - sorted_indices: torch.Tensor, - batch_first: bool): + def forward(ctx, activations: torch.Tensor, sorted_indices: torch.Tensor, batch_first: bool): global random_ltd_module if random_ltd_module is None: random_ltd_module = RandomLTDBuilder().load() @@ -104,25 +100,18 @@ class GatherTokens(torch.autograd.Function): activations, sorted_indices = ctx.saved_tensors batch_first = ctx.batch_first - return random_ltd_module.token_scatter_(a_gradients, - g_gradients, - sorted_indices, - batch_first), None, None + return random_ltd_module.token_scatter_(a_gradients, g_gradients, sorted_indices, batch_first), None, None class ScatterTokens(torch.autograd.Function): + @staticmethod - def forward(ctx, - all_activations: torch.Tensor, - layer_activations: torch.Tensor, - sorted_indices: torch.Tensor, + def forward(ctx, all_activations: torch.Tensor, layer_activations: torch.Tensor, sorted_indices: torch.Tensor, batch_first: bool): global random_ltd_module if random_ltd_module is None: random_ltd_module = RandomLTDBuilder().load() - scatter_results = random_ltd_module.token_scatter_(all_activations.clone(), - layer_activations, - sorted_indices, + scatter_results = random_ltd_module.token_scatter_(all_activations.clone(), layer_activations, sorted_indices, batch_first) ctx.save_for_backward(sorted_indices) @@ -139,7 +128,5 @@ class ScatterTokens(torch.autograd.Function): sorted_indices, = ctx.saved_tensors batch_first = ctx.batch_first - ret_val = random_ltd_module.token_gather(out_gradients, - sorted_indices, - batch_first) + ret_val = random_ltd_module.token_gather(out_gradients, sorted_indices, batch_first) return out_gradients, ret_val, None, None diff --git a/deepspeed/ops/sparse_attention/__init__.py b/deepspeed/ops/sparse_attention/__init__.py index 870d73768d1d45535d02e477a5633cc8d16e8e00..e357257869f794a06d575bfa378769f8e6d3d43c 100644 --- a/deepspeed/ops/sparse_attention/__init__.py +++ b/deepspeed/ops/sparse_attention/__init__.py @@ -1,4 +1,7 @@ -'''Copyright The Microsoft DeepSpeed Team''' +# Copyright (c) Microsoft Corporation. 
+# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team from .sparsity_config import SparsityConfig, DenseSparsityConfig, FixedSparsityConfig, VariableSparsityConfig, BigBirdSparsityConfig, BSLongformerSparsityConfig, LocalSlidingWindowSparsityConfig from .sparse_self_attention import SparseSelfAttention diff --git a/deepspeed/ops/sparse_attention/bert_sparse_self_attention.py b/deepspeed/ops/sparse_attention/bert_sparse_self_attention.py index 6c134d71f2b53240d78f2b4c26826044162d4a8f..e25621bd0977c44d4c1f1d653207fd5c0fd192be 100755 --- a/deepspeed/ops/sparse_attention/bert_sparse_self_attention.py +++ b/deepspeed/ops/sparse_attention/bert_sparse_self_attention.py @@ -1,6 +1,7 @@ -""" -Copyright 2020 The Microsoft DeepSpeed Team -""" +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team from torch import nn from deepspeed.ops.sparse_attention import SparseSelfAttention, FixedSparsityConfig @@ -13,6 +14,7 @@ class BertSparseSelfAttention(nn.Module): For usage example please see, TODO DeepSpeed Sparse Transformer Tutorial. """ + def __init__( self, config, @@ -29,10 +31,8 @@ class BertSparseSelfAttention(nn.Module): super(BertSparseSelfAttention, self).__init__() if config.hidden_size % config.num_attention_heads != 0: - raise ValueError( - "The hidden size (%d) is not a multiple of the number of attention " - "heads (%d)" % (config.hidden_size, - config.num_attention_heads)) + raise ValueError("The hidden size (%d) is not a multiple of the number of attention " + "heads (%d)" % (config.hidden_size, config.num_attention_heads)) self.num_attention_heads = config.num_attention_heads self.attention_head_size = int(config.hidden_size / config.num_attention_heads) self.all_head_size = self.num_attention_heads * self.attention_head_size @@ -44,8 +44,7 @@ class BertSparseSelfAttention(nn.Module): self.sparse_self_attention = SparseSelfAttention(sparsity_config) def transpose_for_scores(self, x): - new_x_shape = x.size()[:-1] + (self.num_attention_heads, - self.attention_head_size) + new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size) x = x.view(*new_x_shape) return x.permute(0, 2, 1, 3) diff --git a/deepspeed/ops/sparse_attention/matmul.py b/deepspeed/ops/sparse_attention/matmul.py index 17b0898fdd0abf558e9c5cf03fd390b6fa22db2f..b30028fffbaafaf6869156406e0f8d70c8e97538 100755 --- a/deepspeed/ops/sparse_attention/matmul.py +++ b/deepspeed/ops/sparse_attention/matmul.py @@ -1,4 +1,7 @@ -'''Copyright The Microsoft DeepSpeed Team''' +# Copyright (c) Microsoft Corporation. 
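BertSparseSelfAttention above wraps the lower-level SparseSelfAttention module, which can also be used directly. A rough sketch under stated assumptions follows: it needs a CUDA device with triton available and half-precision inputs, the sequence length must be a multiple of the sparsity block (as enforced in get_layout), and the (batch, heads, seq_len, head_dim) input layout is an assumption about the expected shape rather than something spelled out in this diff.

import torch
from deepspeed.ops.sparse_attention import SparseSelfAttention, FixedSparsityConfig

# Assumed input layout: (batch, num_heads, seq_len, head_dim); seq_len must be
# divisible by the sparsity block size.
sparsity_config = FixedSparsityConfig(num_heads=4, block=16)
sparse_attn = SparseSelfAttention(sparsity_config).cuda()

q = torch.randn(2, 4, 256, 64, dtype=torch.half, device="cuda")
k = torch.randn_like(q)
v = torch.randn_like(q)
out = sparse_attn(q, k, v)   # sdd matmul -> sparse softmax -> dsd matmul, per forward above
print(out.shape)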
+# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team # DeepSpeed note, code taken & adapted from commit 9aa94789f13ada713af36cfd8cca2fc9a7f6b79a # https://github.com/ptillet/torch-blocksparse/blob/master/torch_blocksparse/matmul.py @@ -12,29 +15,8 @@ from deepspeed.accelerator import get_accelerator @triton.jit -def _kernel(A, - B, - C, - stride_za, - stride_ha, - stride_ma, - stride_ka, - stride_zb, - stride_hb, - stride_kb, - stride_nb, - stride_zc, - stride_hc, - stride_mc, - stride_nc, - DS0, - DS1, - SDD_K, - SDD_off_width, - lut, - locks, - nlocks, - **meta): +def _kernel(A, B, C, stride_za, stride_ha, stride_ma, stride_ka, stride_zb, stride_hb, stride_kb, stride_nb, stride_zc, + stride_hc, stride_mc, stride_nc, DS0, DS1, SDD_K, SDD_off_width, lut, locks, nlocks, **meta): TM = meta['TM'] TN = meta['TN'] TK = meta['TK'] @@ -194,8 +176,7 @@ def _kernel(A, tl.store(pc, c, mask=checkc) # accumulate partial results using spin-locks else: - plock = locks + tl.program_id(2) * nlocks * tl.num_programs(1) + tl.program_id( - 1) * nlocks + lockid - 1 + plock = locks + tl.program_id(2) * nlocks * tl.num_programs(1) + tl.program_id(1) * nlocks + lockid - 1 pcount = plock + tl.num_programs(2) * tl.num_programs(1) * nlocks while tl.atomic_cas(plock, 0, 1) == 1: pass @@ -292,10 +273,7 @@ class _sparse_matmul(torch.autograd.Function): #segmented = _sparse_matmul.sdd_segment(layout.type(torch.int32), start_width) start_width = (128 if block > 16 else 32) // block layout = layout.type(torch.int32) - segmented = libtriton.superblock(layout.data_ptr(), - layout.shape[0], - layout.shape[1], - layout.shape[2], + segmented = libtriton.superblock(layout.data_ptr(), layout.shape[0], layout.shape[1], layout.shape[2], start_width) luts, widths, packs = [], [], [] for size, nnz in segmented: @@ -317,19 +295,7 @@ class _sparse_matmul(torch.autograd.Function): return luts, None, widths, packs @staticmethod - def _sdd_matmul(a, - b, - trans_a, - trans_b, - trans_c, - spdims, - block, - luts, - num_locks, - widths, - packs, - bench, - time): + def _sdd_matmul(a, b, trans_a, trans_b, trans_c, spdims, block, luts, num_locks, widths, packs, bench, time): if trans_c: a, b = b, a trans_a, trans_b = not trans_b, not trans_a @@ -339,9 +305,8 @@ class _sparse_matmul(torch.autograd.Function): b_dim = -1 if trans_b else -2 a_inner, b_inner = a.shape[a_dim], b.shape[b_dim] if a_inner != b_inner: - raise ValueError( - f"Size of tensor A along the {a_dim} dim ({a_inner}) must match size " - f"of tensor B along the {b_dim} dim ({b_inner})") + raise ValueError(f"Size of tensor A along the {a_dim} dim ({a_inner}) must match size " + f"of tensor B along the {b_dim} dim ({b_inner})") if a_inner % 16 != 0: raise ValueError('Reduction size for SDD must be a multiple of 16') @@ -356,12 +321,7 @@ class _sparse_matmul(torch.autograd.Function): device = a.device # create kernel total_width = sum([width * pack * pack for width, pack in zip(widths, packs)]) - c = torch.empty((batch_size, - total_width, - block, - block), - dtype=dtype, - device=a.device) + c = torch.empty((batch_size, total_width, block, block), dtype=dtype, device=a.device) for lut, width, pack in zip(luts, widths, packs): F32TK = [8, 16] F16TK = [16] @@ -387,12 +347,7 @@ class _sparse_matmul(torch.autograd.Function): max_width = 49152 total = 0 if bench else None for off_width in range(0, width, max_width): - grid = lambda meta: [ - meta['TZ'], - min(max_width, - width - off_width), - batch_size - ] + grid = lambda meta: [meta['TZ'], min(max_width, width - 
off_width), batch_size] _kernel[grid](a, b, c, @@ -504,13 +459,7 @@ class _sparse_matmul(torch.autograd.Function): # create header width = column.size(0) offsets += 6 * width - header = torch.stack((offsets, - segments, - column, - depth, - lockid, - maxid), - dim=1).view(-1).contiguous() + header = torch.stack((offsets, segments, column, depth, lockid, maxid), dim=1).view(-1).contiguous() incs = torch.stack((xincs, wincs), dim=1).view(-1).contiguous() incs = torch.cat((incs, torch.zeros(2, device=incs.device, dtype=incs.dtype))) # create lut @@ -521,19 +470,7 @@ class _sparse_matmul(torch.autograd.Function): return lut, num_locks, width, None @staticmethod - def _dds_matmul(a, - b, - trans_a, - trans_b, - trans_c, - spdims, - block, - lut, - num_locks, - width, - packs, - bench, - time): + def _dds_matmul(a, b, trans_a, trans_b, trans_c, spdims, block, lut, num_locks, width, packs, bench, time): global triton if triton is None: triton = importlib.import_module('triton') @@ -548,16 +485,7 @@ class _sparse_matmul(torch.autograd.Function): BS2 = block * spdims[1 if trans_b else 2] dtype = a.dtype # kernel - meta = { - 'TN': block, - 'TM': 128, - 'TK': 16, - 'BLOCK': block, - 'TZ': 1, - 'SDD': False, - 'DSD': False, - 'DDS': True - } + meta = {'TN': block, 'TM': 128, 'TK': 16, 'BLOCK': block, 'TZ': 1, 'SDD': False, 'DSD': False, 'DDS': True} # output CS0 = AS0 CS1 = AS1 @@ -593,19 +521,7 @@ class _sparse_matmul(torch.autograd.Function): return c @staticmethod - def _dsd_matmul(a, - b, - trans_a, - trans_b, - trans_c, - spdims, - block, - lut, - num_locks, - width, - packs, - bench, - time): + def _dsd_matmul(a, b, trans_a, trans_b, trans_c, spdims, block, lut, num_locks, width, packs, bench, time): global triton if triton is None: triton = importlib.import_module('triton') @@ -621,16 +537,7 @@ class _sparse_matmul(torch.autograd.Function): dtype = a.dtype # kernel - meta = { - 'TM': block, - 'TN': 128, - 'TK': 16, - 'BLOCK': block, - 'TZ': 1, - 'SDD': False, - 'DSD': True, - 'DDS': False - } + meta = {'TM': block, 'TN': 128, 'TK': 16, 'BLOCK': block, 'TZ': 1, 'SDD': False, 'DSD': True, 'DDS': False} # output CS0 = BS0 CS1 = BS1 @@ -665,53 +572,14 @@ class _sparse_matmul(torch.autograd.Function): **meta) return c - fn = { - 'sdd': _sdd_matmul.__get__(object), - 'dsd': _dsd_matmul.__get__(object), - 'dds': _dds_matmul.__get__(object) - } + fn = {'sdd': _sdd_matmul.__get__(object), 'dsd': _dsd_matmul.__get__(object), 'dds': _dds_matmul.__get__(object)} @staticmethod - def forward(ctx, - a, - b, - trans_a, - trans_b, - trans_c, - mode, - spdims, - block, - c_lut, - c_num_locks, - c_width, - c_packs, - c_bench, - c_time, - da_lut, - da_num_locks, - da_width, - da_packs, - da_bench, - da_time, - db_lut, - db_num_locks, - db_width, - db_packs, - db_bench, - db_time): - c = _sparse_matmul.fn[mode](a, - b, - trans_a, - trans_b, - trans_c, - spdims, - block, - c_lut, - c_num_locks, - c_width, - c_packs, - c_bench, - c_time) + def forward(ctx, a, b, trans_a, trans_b, trans_c, mode, spdims, block, c_lut, c_num_locks, c_width, c_packs, + c_bench, c_time, da_lut, da_num_locks, da_width, da_packs, da_bench, da_time, db_lut, db_num_locks, + db_width, db_packs, db_bench, db_time): + c = _sparse_matmul.fn[mode](a, b, trans_a, trans_b, trans_c, spdims, block, c_lut, c_num_locks, c_width, + c_packs, c_bench, c_time) # save for backward ctx.save_for_backward(a, b) ctx.da_num_locks = da_num_locks @@ -741,34 +609,14 @@ class _sparse_matmul(torch.autograd.Function): # gradients w.r.t. 
a if ctx.needs_input_grad[0]: mode_da = mode[1] + mode[0] + mode[2] - da = _sparse_matmul.fn[mode_da](dc, - b, - False, - not ctx.trans_b, - ctx.trans_a, - ctx.spdims, - ctx.block, - ctx.da_lut, - ctx.da_num_locks, - ctx.da_width, - ctx.da_packs, - ctx.da_bench, + da = _sparse_matmul.fn[mode_da](dc, b, False, not ctx.trans_b, ctx.trans_a, ctx.spdims, ctx.block, + ctx.da_lut, ctx.da_num_locks, ctx.da_width, ctx.da_packs, ctx.da_bench, ctx.da_time) # gradients w.r.t. b if ctx.needs_input_grad[1]: mode_db = mode[2] + mode[1] + mode[0] - db = _sparse_matmul.fn[mode_db](a, - dc, - not ctx.trans_a, - False, - ctx.trans_b, - ctx.spdims, - ctx.block, - ctx.db_lut, - ctx.db_num_locks, - ctx.db_width, - ctx.db_packs, - ctx.db_bench, + db = _sparse_matmul.fn[mode_db](a, dc, not ctx.trans_a, False, ctx.trans_b, ctx.spdims, ctx.block, + ctx.db_lut, ctx.db_num_locks, ctx.db_width, ctx.db_packs, ctx.db_bench, ctx.db_time) return da, db, None, None, None,\ None, None, None, None,\ @@ -785,6 +633,7 @@ class MatMul: For more details about sparsity config, please see `Generative Modeling with Sparse Transformers`: https://arxiv.org/abs/1904.10509 """ + def make_lut(self, dtype, device): """Generates the sparsity layout/s used in block-sparse matmul """ @@ -797,21 +646,25 @@ class MatMul: if self.mode == 'sdd': c_lut, c_num_locks, c_width, c_packs = _sparse_matmul.make_sdd_lut(layout, block, dtype, device) elif self.mode == 'dsd': - c_lut, c_num_locks, c_width, c_packs = _sparse_matmul.make_dxx_lut(layout, block, step, not self.trans_a, device) + c_lut, c_num_locks, c_width, c_packs = _sparse_matmul.make_dxx_lut(layout, block, step, not self.trans_a, + device) elif self.mode == 'dds': - c_lut, c_num_locks, c_width, c_packs = _sparse_matmul.make_dxx_lut(layout, block, step, self.trans_b, device) + c_lut, c_num_locks, c_width, c_packs = _sparse_matmul.make_dxx_lut(layout, block, step, self.trans_b, + device) # DA look-up table if self.mode == 'sdd': da_lut, da_num_locks, da_width, da_packs = _sparse_matmul.make_dxx_lut(layout, block, step, True, device) elif self.mode == 'dsd': da_lut, da_num_locks, da_width, da_packs = _sparse_matmul.make_sdd_lut(layout, block, dtype, device) elif self.mode == 'dds': - da_lut, da_num_locks, da_width, da_packs = _sparse_matmul.make_dxx_lut(layout, block, step, not self.trans_b, device) + da_lut, da_num_locks, da_width, da_packs = _sparse_matmul.make_dxx_lut(layout, block, step, + not self.trans_b, device) # DB look-up table if self.mode == 'sdd': db_lut, db_num_locks, db_width, db_packs = _sparse_matmul.make_dxx_lut(layout, block, step, False, device) elif self.mode == 'dsd': - db_lut, db_num_locks, db_width, db_packs = _sparse_matmul.make_dxx_lut(layout, block, step, self.trans_a, device) + db_lut, db_num_locks, db_width, db_packs = _sparse_matmul.make_dxx_lut(layout, block, step, self.trans_a, + device) elif self.mode == 'dds': db_lut, db_num_locks, db_width, db_packs = _sparse_matmul.make_sdd_lut(layout, block, dtype, device) self.lut_cache[key] = (c_lut, c_num_locks, c_width, c_packs,\ @@ -845,11 +698,10 @@ class MatMul: assert layout_dim in (2, 3), "Layout should be a 2 or 3 dimensional tensor of 0s and 1s" if not mode == 'sdd': # Dims to be reduced on the 'inside' of the matmul, either -1 or -2 - trans_dense, trans_sparse, sparse_inner = (trans_b, trans_a, -1) if mode == 'dsd' else (trans_a, trans_b, -2) - self.dense_inner_dim = -( - (sparse_inner % 2) + 1) if not trans_dense else sparse_inner - sparse_inner = sparse_inner if not trans_sparse else -( - (sparse_inner % 2) 
+ 1) + trans_dense, trans_sparse, sparse_inner = (trans_b, trans_a, -1) if mode == 'dsd' else (trans_a, trans_b, + -2) + self.dense_inner_dim = -((sparse_inner % 2) + 1) if not trans_dense else sparse_inner + sparse_inner = sparse_inner if not trans_sparse else -((sparse_inner % 2) + 1) # Inner dim of the dense input should be equal to the inner dim of the sparse input self.dense_inner_size = layout.shape[sparse_inner] * block @@ -860,8 +712,7 @@ class MatMul: if layout_dim == 2: layout = layout.unsqueeze(0) - layout = layout.long( - ) # Above code assumes the layout tensor is an integral type + layout = layout.long() # Above code assumes the layout tensor is an integral type self.spdims = layout.shape # timings @@ -909,31 +760,9 @@ class MatMul: b = MatMul._pad_shape(b, self.mode == 'dds') # execute - c = _sparse_matmul.apply(a, - b, - self.trans_a, - self.trans_b, - False, - self.mode, - self.spdims, - self.block, - c_lut, - c_num_locks, - c_width, - c_packs, - self.bench, - time_c, - da_lut, - da_num_locks, - da_width, - da_packs, - self.bench, - time_da, - db_lut, - db_num_locks, - db_width, - db_packs, - self.bench, + c = _sparse_matmul.apply(a, b, self.trans_a, self.trans_b, False, self.mode, self.spdims, self.block, c_lut, + c_num_locks, c_width, c_packs, self.bench, time_c, da_lut, da_num_locks, da_width, + da_packs, self.bench, time_da, db_lut, db_num_locks, db_width, db_packs, self.bench, time_db) # This removes any leading singleton dimensions we may have added to the tensor that weren't in the input @@ -948,9 +777,8 @@ class MatMul: def _validate_inputs(self, a, b): if a.device != b.device: - raise ValueError( - f"Inputs must be on the same device; got {a.device} for tensor A " - f"and {b.device} for tensor B") + raise ValueError(f"Inputs must be on the same device; got {a.device} for tensor A " + f"and {b.device} for tensor B") if not get_accelerator().on_accelerator(a): raise ValueError("Only GPU devices are supported for now") @@ -958,9 +786,7 @@ class MatMul: if torch.is_autocast_enabled(): a, b = a.half(), b.half() elif a.dtype != b.dtype: - raise ValueError( - f"Inputs must be the same dtype; got {a.dtype} for A and {b.dtype} for B" - ) + raise ValueError(f"Inputs must be the same dtype; got {a.dtype} for A and {b.dtype} for B") mode, trans_a, trans_b = self.mode, self.trans_a, self.trans_b if mode != 'sdd': @@ -968,14 +794,12 @@ class MatMul: dense, dense_name, sparse, sparse_name = (a, 'A', b, 'B') if mode == 'dds' else (b, 'B', a, 'A') dense_inner = dense.shape[self.dense_inner_dim] if dense_inner != self.dense_inner_size: - raise ValueError( - f"Expected tensor {dense_name} to have size {self.dense_inner_size} at dim " - f"{self.dense_inner_dim % dense.ndim}, got {dense_inner}.") + raise ValueError(f"Expected tensor {dense_name} to have size {self.dense_inner_size} at dim " + f"{self.dense_inner_dim % dense.ndim}, got {dense_inner}.") if sparse.shape[-len(self.sparse_shape):] != self.sparse_shape: - raise ValueError( - f"Expected tensor with trailing dimensions of shape {self.sparse_shape} for argument " - f"{sparse_name}, got {sparse.shape}") + raise ValueError(f"Expected tensor with trailing dimensions of shape {self.sparse_shape} for argument " + f"{sparse_name}, got {sparse.shape}") def add_extra_dims(x): # Add extra leading singleton dimensions if needed @@ -984,8 +808,7 @@ class MatMul: singletons = [1] * dims_needed x = x.view(*singletons, *x.shape) elif dims_needed < 0: - raise ValueError( - "Tensors with more than 4 dimensions are not currently supported") + 
raise ValueError("Tensors with more than 4 dimensions are not currently supported") return x diff --git a/deepspeed/ops/sparse_attention/softmax.py b/deepspeed/ops/sparse_attention/softmax.py index 09560e103d16b9eaa456e47d3cb5c701bcae21e3..debee5688fe3f06699a710dbaa35c09601189a4f 100755 --- a/deepspeed/ops/sparse_attention/softmax.py +++ b/deepspeed/ops/sparse_attention/softmax.py @@ -1,4 +1,7 @@ -'''Copyright The Microsoft DeepSpeed Team''' +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team # DeepSpeed note, code taken & adapted from commit 9aa94789f13ada713af36cfd8cca2fc9a7f6b79a # https://github.com/ptillet/torch-blocksparse/blob/master/torch_blocksparse/matmul.py @@ -28,29 +31,11 @@ def num_warps(n): return 16 -@triton.heuristics({ - 'num_warps': lambda *args, - **meta: num_warps(args[6] * meta['BLOCK']) -}) -@triton.heuristics({ - 'TN': lambda *args, - **meta: next_power_of_2(args[6] * meta['BLOCK']) -}) +@triton.heuristics({'num_warps': lambda *args, **meta: num_warps(args[6] * meta['BLOCK'])}) +@triton.heuristics({'TN': lambda *args, **meta: next_power_of_2(args[6] * meta['BLOCK'])}) @triton.jit -def _forward(X, - scale, - LUT, - RPE, - KP_M, - ATTN_M, - sizemax, - stride_zx, - stride_zrpe, - stride_hrpe, - stride_srpe, - stride_zkpm, - stride_zattnm, - **meta): +def _forward(X, scale, LUT, RPE, KP_M, ATTN_M, sizemax, stride_zx, stride_zrpe, stride_hrpe, stride_srpe, stride_zkpm, + stride_zattnm, **meta): TN = meta['TN'] BLOCK = meta['BLOCK'] pidhm = tl.program_id(0) @@ -102,14 +87,8 @@ def _forward(X, tl.store(px, x, mask=check) -@triton.heuristics({ - 'num_warps': lambda *args, - **meta: num_warps(args[4] * meta['BLOCK']) -}) -@triton.heuristics({ - 'TN': lambda *args, - **meta: next_power_of_2(args[4]) * meta['BLOCK'] -}) +@triton.heuristics({'num_warps': lambda *args, **meta: num_warps(args[4] * meta['BLOCK'])}) +@triton.heuristics({'TN': lambda *args, **meta: next_power_of_2(args[4]) * meta['BLOCK']}) @triton.jit def _backward(X, scale, DX, LUT, sizemax, stride_zx, stride_zdx, **meta): pidhm = tl.program_id(0) @@ -168,21 +147,8 @@ class _sparse_softmax(torch.autograd.Function): return lut, int(sizes.max()) @staticmethod - def forward(ctx, - x, - scale, - rpe, - key_padding_mask, - attn_mask, - kp_mask_mode, - attn_mask_mode, - spdims, - block, - lut, - num_blocks, - maxlut, - bench, - time): + def forward(ctx, x, scale, rpe, key_padding_mask, attn_mask, kp_mask_mode, attn_mask_mode, spdims, block, lut, + num_blocks, maxlut, bench, time): apply_scale = False if scale == 1.0 else True @@ -251,14 +217,7 @@ class _sparse_softmax(torch.autograd.Function): # run kernel M = x.shape[0] grid = lambda opt: [ctx.spdims[0] * ctx.spdims[1] * ctx.block, M] - _backward[grid](x, - ctx.scale, - dx, - lut, - ctx.maxlut, - x.stride(0), - dx.stride(0), - BLOCK=ctx.block) + _backward[grid](x, ctx.scale, dx, lut, ctx.maxlut, x.stride(0), dx.stride(0), BLOCK=ctx.block) return dx, None, None, None, None, None, None, None, None, None, None, None, None, None, None @@ -270,6 +229,7 @@ class Softmax: For more details about sparsity config, please see `Generative Modeling with Sparse Transformers`: https://arxiv.org/abs/1904.10509 """ + def sparse_softmax(*args, **kwargs): return _sparse_softmax.apply(*args, **kwargs) @@ -278,9 +238,7 @@ class Softmax: """ key = (device, ) if key not in self.lut_cache: - self.lut_cache[key] = _sparse_softmax.make_lut(self.layout, - self.block, - device) + self.lut_cache[key] = _sparse_softmax.make_lut(self.layout, self.block, 
device) return self.lut_cache[key] def __init__(self, layout, block, bench=False): @@ -332,19 +290,7 @@ class Softmax: if key_padding_mask is not None and key_padding_mask.dtype != x.dtype: raise ValueError('Key padding mask must be %s' % x.dtype) lut, maxlut = self.make_lut(x.device) - x = Softmax.sparse_softmax(x, - scale, - rpe, - key_padding_mask, - attn_mask, - key_padding_mask_mode, - attn_mask_mode, - self.spdims, - self.block, - lut, - self.num_blocks, - maxlut, - self.bench, - time_y) + x = Softmax.sparse_softmax(x, scale, rpe, key_padding_mask, attn_mask, key_padding_mask_mode, attn_mask_mode, + self.spdims, self.block, lut, self.num_blocks, maxlut, self.bench, time_y) self.time_y = time_y[0] return x diff --git a/deepspeed/ops/sparse_attention/sparse_attention_utils.py b/deepspeed/ops/sparse_attention/sparse_attention_utils.py index 90edb10fd84a9835e5afe3b0c15cf3e6491a462f..ccb0f940dff65839beac579f81c4dfb7e499e6bb 100644 --- a/deepspeed/ops/sparse_attention/sparse_attention_utils.py +++ b/deepspeed/ops/sparse_attention/sparse_attention_utils.py @@ -1,6 +1,7 @@ -""" -Copyright 2020 The Microsoft DeepSpeed Team -""" +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team import torch from torch.nn import functional as F @@ -15,6 +16,7 @@ class SparseAttentionUtils: Such utilities include extending position embeddings, replacing current self-attention layer with sparse attention, padding sequences to multiple of block size, etc. """ + @staticmethod def extend_position_embedding(model, max_position): """This function extends the position embedding weights of a model loaded from a checkpoint. @@ -28,13 +30,11 @@ class SparseAttentionUtils: """ if hasattr(model, 'bert'): - original_max_position = model.bert.embeddings.position_embeddings.weight.size( - 0) + original_max_position = model.bert.embeddings.position_embeddings.weight.size(0) assert max_position > original_max_position extend_multiples = max(1, max_position // original_max_position) model.bert.embeddings.position_embeddings.weight.data = model.bert.embeddings.position_embeddings.weight.repeat( - extend_multiples, - 1) + extend_multiples, 1) elif hasattr(model, 'roberta'): # RoBERTa has positions 0 & 1 reserved, so embedding size is max position + 2 original_max_position, embed_size = model.roberta.embeddings.position_embeddings.weight.shape @@ -43,13 +43,11 @@ class SparseAttentionUtils: assert max_position > original_max_position max_position += 2 extended_position_embedding = model.roberta.embeddings.position_embeddings.weight.new_empty( - max_position, - embed_size) + max_position, embed_size) k = 2 for i in range(extend_multiples): extended_position_embedding[k:( - k + original_max_position - )] = model.roberta.embeddings.position_embeddings.weight[2:] + k + original_max_position)] = model.roberta.embeddings.position_embeddings.weight[2:] k += original_max_position model.roberta.embeddings.position_embeddings.weight.data = extended_position_embedding else: @@ -58,9 +56,7 @@ class SparseAttentionUtils: ) model.config.max_position_embeddings = max_position - print( - f'Extended position embeddings to {original_max_position * extend_multiples}' - ) + print(f'Extended position embeddings to {original_max_position * extend_multiples}') return model @@ -102,21 +98,17 @@ class SparseAttentionUtils: if hasattr(model, 'bert'): model.config.max_position_embeddings = max_position - model.replace_self_attention_layer_with_sparse_self_attention_layer( - model.config, - 
model.bert.encoder.layer, - sparsity_config) + model.replace_self_attention_layer_with_sparse_self_attention_layer(model.config, model.bert.encoder.layer, + sparsity_config) elif hasattr(model, 'roberta'): model.config.max_position_embeddings = max_position + 2 - model.replace_self_attention_layer_with_sparse_self_attention_layer( - model.config, - model.roberta.encoder.layer, - sparsity_config) + model.replace_self_attention_layer_with_sparse_self_attention_layer(model.config, + model.roberta.encoder.layer, + sparsity_config) else: raise ValueError( 'Please extend \"update_model_self_attention_to_sparse_self_attention\" function to support \ - your model type. It currently only supports \"bert\" & \"roberta\"!' - ) + your model type. It currently only supports \"bert\" & \"roberta\"!') return model @staticmethod @@ -148,14 +140,8 @@ class SparseAttentionUtils: return layers @staticmethod - def pad_to_block_size(block_size, - input_ids, - attention_mask, - token_type_ids, - position_ids, - inputs_embeds, - pad_token_id, - model_embeddings): + def pad_to_block_size(block_size, input_ids, attention_mask, token_type_ids, position_ids, inputs_embeds, + pad_token_id, model_embeddings): """This function pads input tokens and attention mask on sequence length dimension to be multiple of block size. This is a requirement for Sparse Transformer in which the self attention layer works on sequences of length multiple of block size. It needs to be called in your model, such as BertModel, right before you calculate the embedding outputs. @@ -187,10 +173,7 @@ class SparseAttentionUtils: pad_len = (block_size - seq_len % block_size) % block_size if pad_len > 0: if inputs_embeds is not None: - pad_input_ids = inputs_embeds.new_full((batch_size, - pad_len), - pad_token_id, - dtype=torch.long) + pad_input_ids = inputs_embeds.new_full((batch_size, pad_len), pad_token_id, dtype=torch.long) pad_inputs_embeds = model_embeddings(pad_input_ids) inputs_embeds = torch.cat([inputs_embeds, pad_inputs_embeds], dim=-2) # may not be needed as input_ids are not used if inputs_embeds are given diff --git a/deepspeed/ops/sparse_attention/sparse_self_attention.py b/deepspeed/ops/sparse_attention/sparse_self_attention.py index 46dedd0779f9e4e67d4a69f392bf41c5f33caa2f..b673c4561902e943981ca3008fae53ec73c0cd73 100644 --- a/deepspeed/ops/sparse_attention/sparse_self_attention.py +++ b/deepspeed/ops/sparse_attention/sparse_self_attention.py @@ -1,6 +1,7 @@ -""" -Copyright 2020 The Microsoft DeepSpeed Team -""" +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team import torch.nn as nn import torch @@ -15,6 +16,7 @@ class SparseSelfAttention(nn.Module): For usage example please see, TODO DeepSpeed Sparse Transformer Tutorial. """ + def __init__( self, # SparsityConfig parameters needs to be set accordingly @@ -53,8 +55,7 @@ class SparseSelfAttention(nn.Module): if (L % self.sparsity_config.block != 0): raise ValueError( - f'Sequence Length, {L}, needs to be dividable by Block size {self.sparsity_config.block}!' 
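pad_to_block_size above pads the sequence dimension so the sparse kernels see a length that is a multiple of the block size; a small sketch of the padding arithmetic for ids and mask (right padding assumed, mask padded with zeros):

import torch
import torch.nn.functional as F

def pad_to_block_size(input_ids, attention_mask, block_size, pad_token_id):
    batch_size, seq_len = input_ids.shape
    pad_len = (block_size - seq_len % block_size) % block_size
    if pad_len > 0:
        input_ids = F.pad(input_ids, (0, pad_len), value=pad_token_id)
        attention_mask = F.pad(attention_mask, (0, pad_len), value=0)  # padded tokens are masked out
    return pad_len, input_ids, attention_mask

pad_len, ids, mask = pad_to_block_size(torch.ones(2, 100, dtype=torch.long),
                                        torch.ones(2, 100, dtype=torch.long), 16, 0)
print(pad_len, ids.shape)  # 12 torch.Size([2, 112])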
- ) + f'Sequence Length, {L}, needs to be dividable by Block size {self.sparsity_config.block}!') num_blocks = L // self.sparsity_config.block return self.master_layout[..., :num_blocks, :num_blocks].cpu() # layout needs to be a CPU tensor @@ -65,11 +66,7 @@ class SparseSelfAttention(nn.Module): from deepspeed.ops.sparse_attention.softmax import Softmax if L not in SparseSelfAttention.ops: sparsity_layout = self.get_layout(L) - sparse_dot_sdd_nt = MatMul(sparsity_layout, - self.sparsity_config.block, - 'sdd', - trans_a=False, - trans_b=True) + sparse_dot_sdd_nt = MatMul(sparsity_layout, self.sparsity_config.block, 'sdd', trans_a=False, trans_b=True) sparse_dot_dsd_nn = MatMul(sparsity_layout, self.sparsity_config.block, @@ -79,9 +76,7 @@ class SparseSelfAttention(nn.Module): sparse_softmax = Softmax(sparsity_layout, self.sparsity_config.block) - SparseSelfAttention.ops[L] = (sparse_dot_sdd_nt, - sparse_dot_dsd_nn, - sparse_softmax) + SparseSelfAttention.ops[L] = (sparse_dot_sdd_nt, sparse_dot_dsd_nn, sparse_softmax) return SparseSelfAttention.ops[L] def transpose_key_for_scores(self, x, L): @@ -100,13 +95,7 @@ class SparseSelfAttention(nn.Module): return x.squeeze() # forward pass - def forward(self, - query, - key, - value, - rpe=None, - key_padding_mask=None, - attn_mask=None): + def forward(self, query, key, value, rpe=None, key_padding_mask=None, attn_mask=None): """Applies forward phase of sparse self attention Arguments: @@ -134,9 +123,7 @@ class SparseSelfAttention(nn.Module): # squeeze key_padding_mask if it is given if key_padding_mask is not None: - key_padding_mask = self.transpose_mask_for_sparse(query.dtype, - key_padding_mask, - is_key_padding_mask=True) + key_padding_mask = self.transpose_mask_for_sparse(query.dtype, key_padding_mask, is_key_padding_mask=True) # squeeze attn_mask if it is given if attn_mask is not None: @@ -149,14 +136,13 @@ class SparseSelfAttention(nn.Module): # attention scores attn_output_weights = sparse_dot_sdd_nt(query, key) - attn_output_weights = sparse_softmax( - attn_output_weights, - scale=scaling, - rpe=rpe, - key_padding_mask=key_padding_mask, - attn_mask=attn_mask, - key_padding_mask_mode=self.key_padding_mask_mode, - attn_mask_mode=self.attn_mask_mode) + attn_output_weights = sparse_softmax(attn_output_weights, + scale=scaling, + rpe=rpe, + key_padding_mask=key_padding_mask, + attn_mask=attn_mask, + key_padding_mask_mode=self.key_padding_mask_mode, + attn_mask_mode=self.attn_mask_mode) # outputs attn_output = sparse_dot_dsd_nn(attn_output_weights, value) diff --git a/deepspeed/ops/sparse_attention/sparsity_config.py b/deepspeed/ops/sparse_attention/sparsity_config.py index efdad2e4d7e534c36b29dc1ddbaea87a69b97354..7b7d5f50133cf0f5a92490865a12dff286da9a42 100644 --- a/deepspeed/ops/sparse_attention/sparsity_config.py +++ b/deepspeed/ops/sparse_attention/sparsity_config.py @@ -1,6 +1,7 @@ -""" -Copyright 2020 The Microsoft DeepSpeed Team -""" +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team import torch import random @@ -10,6 +11,7 @@ class SparsityConfig: """Abstract Configuration class to store `sparsity configuration of a self attention layer`. It contains shared property of different block-sparse sparsity patterns. However, each class needs to extend it based on required property and functionality. """ + def __init__(self, num_heads, block=16, different_layout_per_head=False): """Initialize the Sparsity Pattern Config. 
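The forward pass above is sdd matmul (QK^T only on kept blocks), blocked softmax, then dsd matmul with V; a dense reference of the same computation that expands the block layout to an element-wise mask, useful for checking shapes and semantics rather than performance:

import torch

def dense_reference_attention(q, k, v, layout, block, scale):
    # q, k, v: [batch, heads, seq, head_dim]; layout: [heads, seq//block, seq//block] of 0/1
    mask = layout.repeat_interleave(block, dim=1).repeat_interleave(block, dim=2).bool()
    scores = torch.matmul(q, k.transpose(-1, -2)) * scale
    scores = scores.masked_fill(~mask.unsqueeze(0), float('-inf'))
    probs = torch.nan_to_num(scores.softmax(dim=-1))  # rows with no kept block become all zeros
    return torch.matmul(probs, v)

q = k = v = torch.randn(1, 2, 32, 16)
layout = torch.tril(torch.ones(2, 2, 2, dtype=torch.int64))
print(dense_reference_attention(q, k, v, layout, 16, 0.25).shape)  # torch.Size([1, 2, 32, 16])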
@@ -37,9 +39,7 @@ class SparsityConfig: """ if (seq_len % self.block != 0): - raise ValueError( - f'Sequence Length, {seq_len}, needs to be dividable by Block size {self.block}!' - ) + raise ValueError(f'Sequence Length, {seq_len}, needs to be dividable by Block size {self.block}!') num_blocks = seq_len // self.block # TODO Currently we allocate layout per head; needs to be updated if heads share a single layout. layout = torch.zeros((self.num_heads, num_blocks, num_blocks), dtype=torch.int64) @@ -64,6 +64,7 @@ class DenseSparsityConfig(SparsityConfig): """Configuration class to store `Dense` configuration. In reality, this is not sparse and all blocks are used. We keep it for the sake of comparison and comprehension. """ + def __init__(self, num_heads, block=16, different_layout_per_head=False): """Initialize the Dense Sparsity Pattern Config. In reality, this is not sparse and all blocks are used. We keep it for the sake of comparison and comprehension. @@ -96,6 +97,7 @@ class FixedSparsityConfig(SparsityConfig): For more details about this sparsity config, please see `Generative Modeling with Sparse Transformers`: https://arxiv.org/abs/1904.10509; this has been customized. This class extends parent class of `SparsityConfig` and customizes it for `Fixed` sparsity. """ + def __init__(self, num_heads, block=16, @@ -131,14 +133,11 @@ class FixedSparsityConfig(SparsityConfig): self.num_global_blocks = num_global_blocks if (attention != 'unidirectional' and attention != 'bidirectional'): - raise NotImplementedError( - 'only \"uni/bi-directional\" attentions are supported for now!') + raise NotImplementedError('only \"uni/bi-directional\" attentions are supported for now!') self.attention = attention if (attention != 'bidirectional' and horizontal_global_attention): - raise ValueError( - 'only \"bi-directional\" attentions can support horizontal global attention!' - ) + raise ValueError('only \"bi-directional\" attentions can support horizontal global attention!') self.horizontal_global_attention = horizontal_global_attention if (num_different_global_patterns > 1 and not different_layout_per_head): @@ -166,9 +165,7 @@ class FixedSparsityConfig(SparsityConfig): for i in range(0, num_blocks, self.num_local_blocks): end = min(i + self.num_local_blocks, num_blocks) for row in range(i, end): - for col in range( - i, - (row + 1 if self.attention == 'unidirectional' else end)): + for col in range(i, (row + 1 if self.attention == 'unidirectional' else end)): layout[h, row, col] = 1 return layout @@ -206,8 +203,7 @@ class FixedSparsityConfig(SparsityConfig): # set last global blocks; handle possible short last local window if (end < num_blocks): - start = min(end + first_global_block_idx, - num_blocks - self.num_global_blocks) + start = min(end + first_global_block_idx, num_blocks - self.num_global_blocks) end = start + self.num_global_blocks # vertical global attention @@ -250,6 +246,7 @@ class VariableSparsityConfig(SparsityConfig): For more details about `Fixed` sparsity config, please see `Generative Modeling with Sparse Transformers`: https://arxiv.org/abs/1904.10509; this has been customized. This class extends parent class of `SparsityConfig` and customizes it for `Fixed` sparsity. 
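The local component of the Fixed layout above fills dense blocks inside each window of num_local_blocks, truncated at the diagonal when attention is unidirectional; a compact standalone sketch of that loop:

import torch

def fixed_local_layout(num_heads, num_blocks, num_local_blocks, unidirectional=True):
    layout = torch.zeros((num_heads, num_blocks, num_blocks), dtype=torch.int64)
    for i in range(0, num_blocks, num_local_blocks):
        end = min(i + num_local_blocks, num_blocks)
        for row in range(i, end):
            # causal: only columns up to the diagonal; otherwise the full local window
            layout[:, row, i:(row + 1 if unidirectional else end)] = 1
    return layout

print(fixed_local_layout(1, 8, 4)[0])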
""" + def __init__(self, num_heads, block=16, @@ -296,14 +293,11 @@ class VariableSparsityConfig(SparsityConfig): self.global_block_end_indices = global_block_end_indices if (attention != 'unidirectional' and attention != 'bidirectional'): - raise NotImplementedError( - 'only \"uni/bi-directional\" attentions are supported for now!') + raise NotImplementedError('only \"uni/bi-directional\" attentions are supported for now!') self.attention = attention if (attention != 'bidirectional' and horizontal_global_attention): - raise ValueError( - 'only \"bi-directional\" attentions can support horizontal global attention!' - ) + raise ValueError('only \"bi-directional\" attentions can support horizontal global attention!') self.horizontal_global_attention = horizontal_global_attention def set_random_layout(self, h, layout): @@ -345,9 +339,7 @@ class VariableSparsityConfig(SparsityConfig): end_block_idx += block_size end_block_idx = min(end_block_idx, num_blocks) for row in range(start_block_idx, end_block_idx): - for col in range( - start_block_idx, - (row + 1 if self.attention == 'unidirectional' else end_block_idx)): + for col in range(start_block_idx, (row + 1 if self.attention == 'unidirectional' else end_block_idx)): layout[h, row, col] = 1 start_block_idx += block_size @@ -355,9 +347,7 @@ class VariableSparsityConfig(SparsityConfig): for i in range(start_block_idx, num_blocks, block_size): end_block_idx = min(i + block_size, num_blocks) for row in range(i, end_block_idx): - for col in range( - i, - (row + 1 if self.attention == 'unidirectional' else end_block_idx)): + for col in range(i, (row + 1 if self.attention == 'unidirectional' else end_block_idx)): layout[h, row, col] = 1 return layout @@ -423,6 +413,7 @@ class BigBirdSparsityConfig(SparsityConfig): For more details about this sparsity config, please see `Big Bird: Transformers for Longer Sequences`: https://arxiv.org/pdf/2007.14062.pdf This class extends parent class of `SparsityConfig` and customizes it for `BigBird` sparsity. """ + def __init__(self, num_heads, block=16, @@ -452,8 +443,7 @@ class BigBirdSparsityConfig(SparsityConfig): self.num_global_blocks = num_global_blocks if (attention != 'unidirectional' and attention != 'bidirectional'): - raise NotImplementedError( - 'only \"uni/bi-directional\" attentions are supported for now!') + raise NotImplementedError('only \"uni/bi-directional\" attentions are supported for now!') self.attention = attention def set_random_layout(self, h, layout): @@ -475,10 +465,7 @@ class BigBirdSparsityConfig(SparsityConfig): ) for row in range(0, num_blocks): - sample_range = range( - 0, - num_blocks) if self.attention == 'bidirectional' else range(0, - row + 1) + sample_range = range(0, num_blocks) if self.attention == 'bidirectional' else range(0, row + 1) rnd_cols = random.sample(sample_range, self.num_random_blocks) layout[h, row, rnd_cols] = 1 return layout @@ -564,6 +551,7 @@ class BSLongformerSparsityConfig(SparsityConfig): For more details about this sparsity config, please see `Longformer: The Long-Document Transformer`: https://arxiv.org/pdf/2004.05150.pdf This class extends parent class of `SparsityConfig` and customizes it for `Longformer` sparsity. """ + def __init__(self, num_heads, block=16, @@ -687,11 +675,8 @@ class LocalSlidingWindowSparsityConfig(SparsityConfig): """Configuration class to store `Local Sliding Window` sparsity configuration - a purely-local sliding window attention. This class extends parent class of `SparsityConfig` and customizes it for `Local` sparsity. 
""" - def __init__(self, - num_heads, - block=16, - num_sliding_window_blocks=3, - attention='unidirectional'): + + def __init__(self, num_heads, block=16, num_sliding_window_blocks=3, attention='unidirectional'): """Initialize the Local Sliding Window Sparsity Pattern Config. For usage example please see, TODO DeepSpeed Sparse Transformer Tutorial Arguments: @@ -723,8 +708,7 @@ class LocalSlidingWindowSparsityConfig(SparsityConfig): w = self.num_sliding_window_blocks // 2 for row in range(0, num_blocks): start = max(0, row - w) - end = min(row + w + 1, - num_blocks) if self.attention == "bidirectional" else row + 1 + end = min(row + w + 1, num_blocks) if self.attention == "bidirectional" else row + 1 layout[h, row, start:end] = 1 return layout diff --git a/deepspeed/ops/sparse_attention/trsrc/__init__.py b/deepspeed/ops/sparse_attention/trsrc/__init__.py index b21068e98f8e06b5188de0dc3ed8985003abb39f..18d58b5af4bf387ffc8b4e90645e3bd578ee10fe 100644 --- a/deepspeed/ops/sparse_attention/trsrc/__init__.py +++ b/deepspeed/ops/sparse_attention/trsrc/__init__.py @@ -1,4 +1,7 @@ -'''Copyright The Microsoft DeepSpeed Team''' +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team import sys import os diff --git a/deepspeed/ops/sparse_attention/trsrc/matmul.tr b/deepspeed/ops/sparse_attention/trsrc/matmul.tr index bf87e993feda95e357824ed93f49ecb719cd838a..9be6aa2aed7ccfb0d438d2e4d766d30b21d8f744 100644 --- a/deepspeed/ops/sparse_attention/trsrc/matmul.tr +++ b/deepspeed/ops/sparse_attention/trsrc/matmul.tr @@ -1,5 +1,12 @@ -// DeepSpeed note, code taken & adapted from commit 9aa94789f13ada713af36cfd8cca2fc9a7f6b79a -// https://github.com/ptillet/torch-blocksparse/blob/master/torch_blocksparse/matmul.py +// Copyright (c) Microsoft Corporation. +// SPDX-License-Identifier: Apache-2.0 + +// DeepSpeed Team + +/* +DeepSpeed note, code taken & adapted from commit 9aa94789f13ada713af36cfd8cca2fc9a7f6b79a + https:github.com/ptillet/torch-blocksparse/blob/master/torch_blocksparse/matmul.py +*/ __global__ void NAME (TYPE* A __readonly __noalias __aligned(16), TYPE* B __readonly __noalias __aligned(16), diff --git a/deepspeed/ops/sparse_attention/trsrc/softmax_bwd.tr b/deepspeed/ops/sparse_attention/trsrc/softmax_bwd.tr index 25d15a99e46800aaba51589e3a5ebc5c4d19c40c..1a90f41d94945e1d6d6f52e6beaea94fa52cdda8 100644 --- a/deepspeed/ops/sparse_attention/trsrc/softmax_bwd.tr +++ b/deepspeed/ops/sparse_attention/trsrc/softmax_bwd.tr @@ -1,5 +1,12 @@ -// DeepSpeed note, code taken & adapted from commit 9aa94789f13ada713af36cfd8cca2fc9a7f6b79a -// https://github.com/ptillet/torch-blocksparse/blob/master/torch_blocksparse/softmax.py +// Copyright (c) Microsoft Corporation. 
+// SPDX-License-Identifier: Apache-2.0 + +// DeepSpeed Team + +/* +DeepSpeed note, code taken & adapted from commit 9aa94789f13ada713af36cfd8cca2fc9a7f6b79a + https:github.com/ptillet/torch-blocksparse/blob/master/torch_blocksparse/softmax.py +*/ __global__ void softmax_bwd(TYPE * X __readonly __noalias __aligned(16), float scale, diff --git a/deepspeed/ops/sparse_attention/trsrc/softmax_fwd.tr b/deepspeed/ops/sparse_attention/trsrc/softmax_fwd.tr index 7d5cc50b282d3b460add516774223fbe69afa9a4..ebd317d9469b47f7e2ee3032d3aabf57b5620a73 100644 --- a/deepspeed/ops/sparse_attention/trsrc/softmax_fwd.tr +++ b/deepspeed/ops/sparse_attention/trsrc/softmax_fwd.tr @@ -1,5 +1,12 @@ -// DeepSpeed note, code taken & adapted from commit 9aa94789f13ada713af36cfd8cca2fc9a7f6b79a -// https://github.com/ptillet/torch-blocksparse/blob/master/torch_blocksparse/softmax.py +// Copyright (c) Microsoft Corporation. +// SPDX-License-Identifier: Apache-2.0 + +// DeepSpeed Team + +/* +DeepSpeed note, code taken & adapted from commit 9aa94789f13ada713af36cfd8cca2fc9a7f6b79a + https:github.com/ptillet/torch-blocksparse/blob/master/torch_blocksparse/softmax.py +*/ __global__ void softmax_fwd(TYPE *X __readonly __noalias __aligned(16), float scale, diff --git a/deepspeed/ops/transformer/__init__.py b/deepspeed/ops/transformer/__init__.py index 77d666c869a977ee73fbdc33185dccb6565299d4..b80fe2b4ba714611b6bcb652d9e559d87c7ed6fd 100755 --- a/deepspeed/ops/transformer/__init__.py +++ b/deepspeed/ops/transformer/__init__.py @@ -1,4 +1,7 @@ -'''Copyright The Microsoft DeepSpeed Team''' +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team from .transformer import DeepSpeedTransformerLayer, DeepSpeedTransformerConfig from .inference.config import DeepSpeedInferenceConfig diff --git a/deepspeed/ops/transformer/inference/__init__.py b/deepspeed/ops/transformer/inference/__init__.py index 6e9ca0051e789d3d48573a4d246573caad1ea2f1..c8b31a90eac2850fa7554214760c00df9815db25 100644 --- a/deepspeed/ops/transformer/inference/__init__.py +++ b/deepspeed/ops/transformer/inference/__init__.py @@ -1,4 +1,7 @@ -'''Copyright The Microsoft DeepSpeed Team''' +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team from .config import DeepSpeedInferenceConfig from ....model_implementations.transformers.ds_transformer import DeepSpeedTransformerInference diff --git a/deepspeed/ops/transformer/inference/bias_add.py b/deepspeed/ops/transformer/inference/bias_add.py index b7d2944fb5f8470198c0ab0cb7d84edf1f342023..253784f001aeb3431a0d60812e4d2068c0bd5455 100644 --- a/deepspeed/ops/transformer/inference/bias_add.py +++ b/deepspeed/ops/transformer/inference/bias_add.py @@ -1,6 +1,7 @@ -''' -Copyright 2022 The Microsoft DeepSpeed Team -''' +# Copyright (c) Microsoft Corporation. 
+# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team from typing import Optional import torch @@ -22,7 +23,4 @@ def nhwc_bias_add(activation: torch.Tensor, elif other_bias is None: return spatial_cuda_module.nhwc_bias_add_add(activation, bias, other) else: - return spatial_cuda_module.nhwc_bias_add_bias_add(activation, - bias, - other, - other_bias) + return spatial_cuda_module.nhwc_bias_add_bias_add(activation, bias, other, other_bias) diff --git a/deepspeed/ops/transformer/inference/config.py b/deepspeed/ops/transformer/inference/config.py index f0c87ba5619bf65f188a91f75f1676d7b49e2ce7..549a03a70f19f63c3381cfdab366289a93caf6a2 100644 --- a/deepspeed/ops/transformer/inference/config.py +++ b/deepspeed/ops/transformer/inference/config.py @@ -1,11 +1,14 @@ -''' -Copyright 2022 The Microsoft DeepSpeed Team -''' +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + import json from deepspeed.utils.types import ActivationFuncType class TransformerConfig(): + def __init__(self, hidden_size, intermediate_size, heads, num_hidden_layers): self.layer_id = -1 self.hidden_size = hidden_size @@ -40,6 +43,7 @@ class DeepSpeedInferenceConfig(TransformerConfig): return_tuple: if True, returns the transformer output as a tuple, otherwise returns as a tensor bigscience_bloom: This flag is added temporarily for supporting the BLOOM-176B model architecture. """ + def __init__(self, hidden_size=-1, intermediate_size=-1, @@ -65,16 +69,16 @@ class DeepSpeedInferenceConfig(TransformerConfig): training_mp_size=1, bigscience_bloom=False, max_out_tokens=1024, + min_out_tokens=1, enable_qkv_quantization=False, use_mup=False, scale_attn_by_inverse_layer_idx=False, - return_single_tuple=False): + return_single_tuple=False, + set_empty_params=False, + transposed_mode=False): super(DeepSpeedInferenceConfig, - self).__init__( - hidden_size, - (intermediate_size if intermediate_size > 0 else 4 * hidden_size), - heads, - num_hidden_layers) + self).__init__(hidden_size, (intermediate_size if intermediate_size > 0 else 4 * hidden_size), heads, + num_hidden_layers) self.fp16 = fp16 self.pre_layer_norm = pre_layer_norm self.local_rank = local_rank @@ -96,10 +100,13 @@ class DeepSpeedInferenceConfig(TransformerConfig): self.training_mp_size = training_mp_size self.bigscience_bloom = bigscience_bloom self.max_out_tokens = max_out_tokens + self.min_out_tokens = min_out_tokens self.scale_attn_by_inverse_layer_idx = scale_attn_by_inverse_layer_idx self.enable_qkv_quantization = enable_qkv_quantization self.use_mup = use_mup self.return_single_tuple = return_single_tuple + self.set_empty_params = set_empty_params + self.transposed_mode = transposed_mode @classmethod def from_dict(cls, json_object): diff --git a/deepspeed/ops/transformer/inference/diffusers_2d_transformer.py b/deepspeed/ops/transformer/inference/diffusers_2d_transformer.py index 246eab8676fe4e1a3bab1c05adefec06316e632e..fa4c6d53f871d326d0a51f2bc9ae7c71743befda 100644 --- a/deepspeed/ops/transformer/inference/diffusers_2d_transformer.py +++ b/deepspeed/ops/transformer/inference/diffusers_2d_transformer.py @@ -1,8 +1,10 @@ -''' -Copyright 2022 The Microsoft DeepSpeed Team -''' +# Copyright (c) Microsoft Corporation. 
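The nhwc_bias_add wrapper above picks one of three fused spatial kernels depending on which operands are present; an unfused PyTorch reference with the semantics assumed here (per-channel bias broadcast over NHWC activations, plus an optional extra addend and its bias), useful only as a readability aid, not as the CUDA op:

import torch
from typing import Optional

def nhwc_bias_add_reference(activation: torch.Tensor,
                            bias: torch.Tensor,
                            other: Optional[torch.Tensor] = None,
                            other_bias: Optional[torch.Tensor] = None) -> torch.Tensor:
    # activation/other: [N, H, W, C]; bias/other_bias: [C]
    if other is None:
        return activation + bias
    if other_bias is None:
        return activation + bias + other
    return activation + bias + other + other_bias

print(nhwc_bias_add_reference(torch.randn(1, 8, 8, 4), torch.zeros(4)).shape)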
+# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team class Diffusers2DTransformerConfig(): + def __init__(self, int8_quantization=False): self.int8_quantization = int8_quantization diff --git a/deepspeed/ops/transformer/inference/diffusers_attention.py b/deepspeed/ops/transformer/inference/diffusers_attention.py index 9d829ce3e0729d70e7a1a87a0012a844700f2992..3447f9516ade0bbeb47fb5f15b9b663abb751813 100644 --- a/deepspeed/ops/transformer/inference/diffusers_attention.py +++ b/deepspeed/ops/transformer/inference/diffusers_attention.py @@ -1,6 +1,8 @@ -''' -Copyright 2022 The Microsoft DeepSpeed Team -''' +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + import math import torch from torch.autograd import Function @@ -30,26 +32,12 @@ def load_triton_flash_attn(): class DeepSpeedDiffusersAttentionFunction(Function): + @staticmethod - def forward(ctx, - input, - context, - input_mask, - config, - attn_qkvw, - attn_qw, - attn_kw, - attn_vw, - attn_qkvb, - num_attention_heads_per_partition, - norm_factor, - hidden_size_per_partition, - attn_ow, - attn_ob, - do_out_bias, - score_context_func, - linear_func, - triton_flash_attn_kernel): + def forward(ctx, input, context, input_mask, config, attn_qkvw, attn_qw, attn_kw, attn_vw, attn_qkvb, + num_attention_heads_per_partition, norm_factor, hidden_size_per_partition, attn_ow, attn_ob, + do_out_bias, score_context_func, linear_func, triton_flash_attn_kernel): + def _transpose_for_context(x): x = x.permute(0, 2, 1, 3) new_x_layer_shape = x.size()[:-2] + \ @@ -58,8 +46,7 @@ class DeepSpeedDiffusersAttentionFunction(Function): def _transpose_for_scores(x): attention_head_size = x.shape[-1] // num_attention_heads_per_partition - new_x_shape = x.size()[:-1] + (num_attention_heads_per_partition, - attention_head_size) + new_x_shape = x.size()[:-1] + (num_attention_heads_per_partition, attention_head_size) x = x.reshape(*new_x_shape) x = x.permute(0, 2, 1, 3) return x.contiguous() @@ -71,19 +58,12 @@ class DeepSpeedDiffusersAttentionFunction(Function): do_flash_attn = (head_size <= 128) scale = (1 / norm_factor) * (1 / norm_factor) if do_flash_attn and context == None: - qkv_out = linear_func(input, - attn_qkvw, - attn_qkvb if attn_qkvb is not None else attn_qkvw, - attn_qkvb is not None, - do_flash_attn, - config.heads) - - context_layer = triton_flash_attn_kernel(qkv_out[0], - qkv_out[1], - qkv_out[2], - scale, + qkv_out = linear_func(input, attn_qkvw, attn_qkvb if attn_qkvb is not None else attn_qkvw, attn_qkvb + is not None, do_flash_attn, config.heads, False) + + context_layer = triton_flash_attn_kernel(qkv_out[0], qkv_out[1], qkv_out[2], scale, input.shape[-2] % 128 == 0) - context_layer = _transpose_for_context(context_layer[:,:,:,:head_size]) + context_layer = _transpose_for_context(context_layer[:, :, :, :head_size]) else: do_flash_attn = False @@ -97,21 +77,12 @@ class DeepSpeedDiffusersAttentionFunction(Function): query = query.contiguous() key = key.contiguous() value = value.contiguous() - query, key, value = inference_cuda_module.pad_transform_fp16(query, key, value, config.heads, do_flash_attn) - attention_scores = (torch.matmul(query, - key.transpose(-1, - -2)) * - scale).softmax(dim=-1) - context_layer = _transpose_for_context( - torch.matmul(attention_scores, - value)) - - output = linear_func(context_layer, - attn_ow, - attn_ob, - do_out_bias, - False, - config.heads) + query, key, value = inference_cuda_module.pad_transform_fp16(query, key, value, config.heads, + do_flash_attn) + 
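_transpose_for_scores/_transpose_for_context above only move between [batch, seq, hidden] and [batch, heads, seq, head_dim]; a short sketch of the round trip:

import torch

def transpose_for_scores(x, num_heads):
    # [batch, seq, hidden] -> [batch, heads, seq, head_dim]
    head_size = x.shape[-1] // num_heads
    x = x.reshape(*x.size()[:-1], num_heads, head_size)
    return x.permute(0, 2, 1, 3).contiguous()

def transpose_for_context(x):
    # [batch, heads, seq, head_dim] -> [batch, seq, hidden]
    x = x.permute(0, 2, 1, 3)
    return x.reshape(*x.size()[:-2], x.size(-2) * x.size(-1))

h = torch.randn(2, 10, 64)
assert transpose_for_context(transpose_for_scores(h, 8)).shape == h.shape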
attention_scores = (torch.matmul(query, key.transpose(-1, -2)) * scale).softmax(dim=-1) + context_layer = _transpose_for_context(torch.matmul(attention_scores, value)) + + output = linear_func(context_layer, attn_ow, attn_ob, do_out_bias, False, config.heads, False) return output output = selfAttention_fp(input, context, input_mask) @@ -142,8 +113,7 @@ class DeepSpeedDiffusersAttention(nn.Module): self.config = config self.config.layer_id = DeepSpeedDiffusersAttention.layer_id DeepSpeedDiffusersAttention.layer_id += 1 - device = get_accelerator().current_device_name( - ) if config.bigscience_bloom else 'cpu' + device = get_accelerator().current_device_name() if config.bigscience_bloom else 'cpu' qkv_size_per_partition = (self.config.hidden_size // self.config.mp_size) * 3 data_type = torch.int8 if config.q_int8 else torch.half if config.fp16 else torch.float @@ -176,9 +146,7 @@ class DeepSpeedDiffusersAttention(nn.Module): dtype=data_type, device=device), requires_grad=False) - self.attn_qkvb = nn.Parameter(torch.empty(qkv_size_per_partition, - dtype=data_type_fp, - device=device), + self.attn_qkvb = nn.Parameter(torch.empty(qkv_size_per_partition, dtype=data_type_fp, device=device), requires_grad=False) out_size_per_partition = self.config.hidden_size // self.config.mp_size self.attn_ow = nn.Parameter(torch.empty(out_size_per_partition, @@ -187,9 +155,7 @@ class DeepSpeedDiffusersAttention(nn.Module): device=device), requires_grad=False) - self.attn_ob = nn.Parameter(torch.empty(self.config.hidden_size, - dtype=data_type_fp, - device=device), + self.attn_ob = nn.Parameter(torch.empty(self.config.hidden_size, dtype=data_type_fp, device=device), requires_grad=False) self.do_out_bias = True @@ -200,8 +166,7 @@ class DeepSpeedDiffusersAttention(nn.Module): self.hidden_size_per_partition = self.config.hidden_size // self.config.mp_size self.hidden_size_per_attention_head = self.config.hidden_size // self.config.heads - self.norm_factor = math.sqrt( - math.sqrt(self.config.hidden_size // self.config.heads)) + self.norm_factor = math.sqrt(math.sqrt(self.config.hidden_size // self.config.heads)) if self.config.scale_attn_by_inverse_layer_idx is True: self.norm_factor *= math.sqrt(self.config.layer_id + 1) @@ -216,33 +181,15 @@ class DeepSpeedDiffusersAttention(nn.Module): def forward(self, input, context=None, input_mask=None): if self.config.layer_id == 0: - self.allocate_workspace(self.config.hidden_size, - self.config.heads, + self.allocate_workspace(self.config.hidden_size, self.config.heads, input.size()[1], - input.size()[0], - DeepSpeedDiffusersAttention.layer_id, - self.config.mp_size, - False, - 0, - self.config.max_out_tokens) - output = DeepSpeedDiffusersAttentionFunction.apply( - input, - context, - input_mask, - self.config, - self.attn_qkvw, - self.attn_qw, - self.attn_kw, - self.attn_vw, - self.attn_qkvb, - self.num_attention_heads_per_partition, - self.norm_factor, - self.hidden_size_per_partition, - self.attn_ow, - self.attn_ob, - self.do_out_bias, - self.score_context_func, - self.linear_func, - self.triton_flash_attn_kernel) + input.size()[0], DeepSpeedDiffusersAttention.layer_id, self.config.mp_size, False, + 0, self.config.max_out_tokens, self.config.min_out_tokens) + output = DeepSpeedDiffusersAttentionFunction.apply(input, context, input_mask, self.config, self.attn_qkvw, + self.attn_qw, self.attn_kw, self.attn_vw, self.attn_qkvb, + self.num_attention_heads_per_partition, self.norm_factor, + self.hidden_size_per_partition, self.attn_ow, self.attn_ob, + self.do_out_bias, 
self.score_context_func, self.linear_func, + self.triton_flash_attn_kernel) return output diff --git a/deepspeed/ops/transformer/inference/diffusers_transformer_block.py b/deepspeed/ops/transformer/inference/diffusers_transformer_block.py index e453c343e9f04119a7a508f8545249c3285f1aec..3d45714e543cfd1b673dc83213abd4cd92849673 100644 --- a/deepspeed/ops/transformer/inference/diffusers_transformer_block.py +++ b/deepspeed/ops/transformer/inference/diffusers_transformer_block.py @@ -1,6 +1,7 @@ -''' -Copyright 2022 The Microsoft DeepSpeed Team -''' +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team import torch import torch.nn as nn @@ -31,41 +32,30 @@ def load_spatial_module(): class DeepSpeedDiffusersTransformerBlock(nn.Module): - def __init__(self, - equivalent_module: nn.Module, - config: Diffusers2DTransformerConfig): + + def __init__(self, equivalent_module: nn.Module, config: Diffusers2DTransformerConfig): super(DeepSpeedDiffusersTransformerBlock, self).__init__() self.quantizer = module_inject.GroupQuantizer(q_int8=config.int8_quantization) # Ensure ops are built by the time we start running self.config = config self.ff1_w = self.quantizer.quantize( - nn.Parameter(equivalent_module.ff.net[0].proj.weight.data, - requires_grad=False)) - self.ff1_b = nn.Parameter(equivalent_module.ff.net[0].proj.bias.data, - requires_grad=False) - self.ff2_w = self.quantizer.quantize( - nn.Parameter(equivalent_module.ff.net[2].weight.data, - requires_grad=False)) - self.ff2_b = nn.Parameter(equivalent_module.ff.net[2].bias.data, - requires_grad=False) - - self.norm1_g = nn.Parameter(equivalent_module.norm1.weight.data, - requires_grad=False) - self.norm1_b = nn.Parameter(equivalent_module.norm1.bias.data, - requires_grad=False) + nn.Parameter(equivalent_module.ff.net[0].proj.weight.data, requires_grad=False)) + self.ff1_b = nn.Parameter(equivalent_module.ff.net[0].proj.bias.data, requires_grad=False) + self.ff2_w = self.quantizer.quantize(nn.Parameter(equivalent_module.ff.net[2].weight.data, + requires_grad=False)) + self.ff2_b = nn.Parameter(equivalent_module.ff.net[2].bias.data, requires_grad=False) + + self.norm1_g = nn.Parameter(equivalent_module.norm1.weight.data, requires_grad=False) + self.norm1_b = nn.Parameter(equivalent_module.norm1.bias.data, requires_grad=False) self.norm1_eps = equivalent_module.norm1.eps - self.norm2_g = nn.Parameter(equivalent_module.norm2.weight.data, - requires_grad=False) - self.norm2_b = nn.Parameter(equivalent_module.norm2.bias.data, - requires_grad=False) + self.norm2_g = nn.Parameter(equivalent_module.norm2.weight.data, requires_grad=False) + self.norm2_b = nn.Parameter(equivalent_module.norm2.bias.data, requires_grad=False) self.norm2_eps = equivalent_module.norm2.eps - self.norm3_g = nn.Parameter(equivalent_module.norm3.weight.data, - requires_grad=False) - self.norm3_b = nn.Parameter(equivalent_module.norm3.bias.data, - requires_grad=False) + self.norm3_g = nn.Parameter(equivalent_module.norm3.weight.data, requires_grad=False) + self.norm3_b = nn.Parameter(equivalent_module.norm3.bias.data, requires_grad=False) self.norm3_eps = equivalent_module.norm3.eps self.attn_1 = equivalent_module.attn1 @@ -76,16 +66,14 @@ class DeepSpeedDiffusersTransformerBlock(nn.Module): self.attn_1.do_out_bias = False self.attn_1_bias = self.attn_1.attn_ob else: - self.attn_1_bias = nn.Parameter(torch.zeros_like(self.norm2_g), - requires_grad=False) + self.attn_1_bias = nn.Parameter(torch.zeros_like(self.norm2_g), requires_grad=False) # 
Pull the bias in if we can if isinstance(self.attn_2, DeepSpeedDiffusersAttention): self.attn_2.do_out_bias = False self.attn_2_bias = self.attn_2.attn_ob else: - self.attn_2_bias = nn.Paramaeter(torch.zeros_like(self.norm3_g), - requires_grad=False) + self.attn_2_bias = nn.Paramaeter(torch.zeros_like(self.norm3_g), requires_grad=False) self.transformer_cuda_module = load_transformer_module() load_spatial_module() @@ -99,25 +87,14 @@ class DeepSpeedDiffusersTransformerBlock(nn.Module): if "encoder_hidden_states" in kwargs and kwargs["encoder_hidden_states"] != None: context = kwargs["encoder_hidden_states"] - out_norm_1 = self.transformer_cuda_module.layer_norm(hidden_states, - self.norm1_g, - self.norm1_b, - self.norm1_eps) + out_norm_1 = self.transformer_cuda_module.layer_norm(hidden_states, self.norm1_g, self.norm1_b, self.norm1_eps) out_attn_1 = self.attn_1(out_norm_1) - out_norm_2, out_attn_1 = self.transformer_cuda_module.layer_norm_residual_store_pre_ln_res(out_attn_1, - self.attn_1_bias, - hidden_states, - self.norm2_g, - self.norm2_b, - self.norm2_eps) + out_norm_2, out_attn_1 = self.transformer_cuda_module.layer_norm_residual_store_pre_ln_res( + out_attn_1, self.attn_1_bias, hidden_states, self.norm2_g, self.norm2_b, self.norm2_eps) out_attn_2 = self.attn_2(out_norm_2, context=context) - out_norm_3, out_attn_2 = self.transformer_cuda_module.layer_norm_residual_store_pre_ln_res(out_attn_2, - self.attn_2_bias, - out_attn_1, - self.norm3_g, - self.norm3_b, - self.norm3_eps) + out_norm_3, out_attn_2 = self.transformer_cuda_module.layer_norm_residual_store_pre_ln_res( + out_attn_2, self.attn_2_bias, out_attn_1, self.norm3_g, self.norm3_b, self.norm3_eps) out_ff1 = nn.functional.linear(out_norm_3, self.ff1_w) out_geglu = self.transformer_cuda_module.bias_geglu(out_ff1, self.ff1_b) diff --git a/deepspeed/ops/transformer/inference/ds_attention.py b/deepspeed/ops/transformer/inference/ds_attention.py index f4ec14bf49906916e606e73f2c8d00a39971e730..d56f8bc7de4a959ecede5371195edea04eb17202 100644 --- a/deepspeed/ops/transformer/inference/ds_attention.py +++ b/deepspeed/ops/transformer/inference/ds_attention.py @@ -1,6 +1,7 @@ -''' -Copyright 2022 The Microsoft DeepSpeed Team -''' +# Copyright (c) Microsoft Corporation. 
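The feed-forward epilogue above applies bias_geglu to the first projection; an unfused reference with the GEGLU semantics assumed here (add the bias, split the last dimension in half, gate one half by GELU of the other), not the fused CUDA op itself:

import torch
import torch.nn.functional as F

def bias_geglu_reference(x: torch.Tensor, bias: torch.Tensor) -> torch.Tensor:
    # x: [..., 2 * d_ff], bias: [2 * d_ff]
    x = x + bias
    hidden, gate = x.chunk(2, dim=-1)
    return hidden * F.gelu(gate)

print(bias_geglu_reference(torch.randn(2, 4, 512), torch.zeros(512)).shape)  # torch.Size([2, 4, 256])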
+# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team import math import torch @@ -14,6 +15,7 @@ minus_inf = -10000.0 class DeepSpeedSelfAttention(nn.Module): num_layers = 0 + _qkv_buffers = [] def __init__(self, config, mp_group=None, q_scales=None, q_groups=1, merge_count=1): super(DeepSpeedSelfAttention, self).__init__() @@ -22,29 +24,36 @@ class DeepSpeedSelfAttention(nn.Module): data_type_fp = torch.half if config.fp16 else torch.float self.config.layer_id = DeepSpeedSelfAttention.num_layers DeepSpeedSelfAttention.num_layers = DeepSpeedSelfAttention.num_layers + 1 - device = get_accelerator().current_device_name( - ) #if config.bigscience_bloom else 'cpu' - qkv_size_per_partition = (self.config.hidden_size // self.config.mp_size) * 3 - self.attn_qkvw = nn.Parameter(torch.empty(self.config.hidden_size, - qkv_size_per_partition, - dtype=data_type, - device=device), - requires_grad=False) - self.attn_qkvb = nn.Parameter(torch.empty(qkv_size_per_partition, - dtype=data_type_fp, - device=device), - requires_grad=False) - out_size_per_partition = self.config.hidden_size // self.config.mp_size - self.attn_ow = nn.Parameter(torch.empty(out_size_per_partition, - self.config.hidden_size, - dtype=data_type, - device=device), - requires_grad=False) - - self.attn_ob = nn.Parameter(torch.empty(self.config.hidden_size, - dtype=data_type_fp, - device=device), - requires_grad=False) + device = get_accelerator().current_device_name() #if config.bigscience_bloom else 'cpu' + if self.config.set_empty_params: + self.attn_qw = None + self.attn_qb = None + self.attn_kw = None + self.attn_kb = None + self.attn_vw = None + self.attn_vb = None + self.attn_qkvw = None + self.attn_qkvb = None + self.attn_ow = None + self.attn_ob = None + else: + qkv_size_per_partition = (self.config.hidden_size // self.config.mp_size) * 3 + self.attn_qkvw = nn.Parameter(torch.empty(self.config.hidden_size, + qkv_size_per_partition, + dtype=data_type, + device=device), + requires_grad=False) + self.attn_qkvb = nn.Parameter(torch.empty(qkv_size_per_partition, dtype=data_type_fp, device=device), + requires_grad=False) + out_size_per_partition = self.config.hidden_size // self.config.mp_size + self.attn_ow = nn.Parameter(torch.empty(out_size_per_partition, + self.config.hidden_size, + dtype=data_type, + device=device), + requires_grad=False) + + self.attn_ob = nn.Parameter(torch.empty(self.config.hidden_size, dtype=data_type_fp, device=device), + requires_grad=False) self.num_attention_heads_per_partition = self.config.heads // self.config.mp_size self.hidden_size_per_partition = self.config.hidden_size // self.config.mp_size @@ -69,6 +78,14 @@ class DeepSpeedSelfAttention(nn.Module): self.score_context_func = SoftmaxContextOp(config) self.linear_func = LinearOp(config) self.vector_matmul_func = VectorMatMulOp(config) + if len(DeepSpeedSelfAttention._qkv_buffers) == 0: + DeepSpeedSelfAttention._qkv_buffers = [ + torch.empty(self.hidden_size_per_partition * 3, + self.config.hidden_size, + dtype=data_type_fp, + device=device), + torch.empty(self.hidden_size_per_partition * 3, dtype=data_type_fp, device=device) + ] def compute_attention(self, qkv_out, input_mask, layer_past, alibi): if isinstance(qkv_out, list): @@ -93,6 +110,18 @@ class DeepSpeedSelfAttention(nn.Module): context_layer, key_layer, value_layer = attn_key_value return context_layer, key_layer, value_layer + def _merge_qkv(self): + qvkw = DeepSpeedSelfAttention._qkv_buffers[0] + qvkw[:self.hidden_size_per_partition, :] = self.attn_qw + 
qvkw[self.hidden_size_per_partition:2 * self.hidden_size_per_partition, :] = self.attn_kw + qvkw[2 * self.hidden_size_per_partition:, :] = self.attn_vw + if self.attn_qb is not None: + qvkb = DeepSpeedSelfAttention._qkv_buffers[1] + qvkb[:self.hidden_size_per_partition] = self.attn_qb + qvkb[self.hidden_size_per_partition:2 * self.hidden_size_per_partition] = self.attn_kb + qvkb[2 * self.hidden_size_per_partition:] = self.attn_vb + return DeepSpeedSelfAttention._qkv_buffers + def forward(self, input, input_mask, @@ -105,44 +134,44 @@ class DeepSpeedSelfAttention(nn.Module): norm_w=None, norm_b=None, alibi=None): + if self.attn_qkvw is None: + self._attn_qkvw, self._attn_qkvb = self._merge_qkv() + else: + self._attn_qkvw = self.attn_qkvw + self._attn_qkvb = self.attn_qkvb if not self.config.pre_layer_norm: qkv_out = self.linear_func(input=input, - weight=self.attn_qkvw, - bias=self.attn_qkvb, + weight=self._attn_qkvw, + bias=self._attn_qkvb, add_bias=self.attn_qkvb is not None, do_flash_attn=False, num_heads=self.num_attention_heads_per_partition, num_layers=DeepSpeedSelfAttention.num_layers) else: - qkv_out = self.qkv_func( - input=input, - weight=self.attn_qkvw, - bias=(self.attn_qkvb if self.attn_qkvb is not None else norm_b), - gamma=norm_w, - beta=norm_b, - add_bias=(self.attn_qkvb is not None), - num_layers=DeepSpeedSelfAttention.num_layers, - num_heads=self.num_attention_heads_per_partition) - - context_layer, key_layer, value_layer = self.compute_attention( - qkv_out=qkv_out, - input_mask=input_mask, - layer_past=layer_past, - alibi=alibi) - + qkv_out = self.qkv_func(input=input, + weight=self._attn_qkvw, + bias=(self._attn_qkvb if self._attn_qkvb is not None else norm_b), + gamma=norm_w, + beta=norm_b, + add_bias=(self.attn_qkvb is not None), + num_layers=DeepSpeedSelfAttention.num_layers, + num_heads=self.num_attention_heads_per_partition) + context_layer, key_layer, value_layer = self.compute_attention(qkv_out=qkv_out, + input_mask=input_mask, + layer_past=layer_past, + alibi=alibi) output = self.vector_matmul_func(input=context_layer, weight=self.attn_ow) - inp_norm = qkv_out[-1] - if self.config.mlp_after_attn and self.mp_group is not None and dist.get_world_size( - group=self.mp_group) > 1: + if self.config.mlp_after_attn and self.mp_group is not None and dist.get_world_size(group=self.mp_group) > 1: dist.all_reduce(output, group=self.mp_group) return (output, key_layer, value_layer, context_layer, inp_norm) class BloomSelfAttention(DeepSpeedSelfAttention): + def __init__(self, *args, **kwargs): super(BloomSelfAttention, self).__init__(*args, **kwargs) self.softmax_func = SoftmaxOp(self.config) @@ -156,10 +185,7 @@ class BloomSelfAttention(DeepSpeedSelfAttention): (self.hidden_size_per_partition,) return x.view(*new_x_layer_shape).contiguous() - def _split_tensor_along_last_dim(self, - tensor, - num_partitions, - contiguous_split_chunks=True): + def _split_tensor_along_last_dim(self, tensor, num_partitions, contiguous_split_chunks=True): """Split a tensor along its last dimension. 
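_merge_qkv above packs separately stored q/k/v weights into one preallocated buffer so a single GEMM produces the fused qkv activation; a minimal sketch of the same packing:

import torch

def merge_qkv(qw, kw, vw, buffer=None):
    # qw/kw/vw: [hidden_per_partition, hidden]; buffer: [3 * hidden_per_partition, hidden]
    h = qw.size(0)
    if buffer is None:
        buffer = torch.empty(3 * h, qw.size(1), dtype=qw.dtype)
    buffer[:h] = qw
    buffer[h:2 * h] = kw
    buffer[2 * h:] = vw
    return buffer

print(merge_qkv(torch.randn(4, 8), torch.randn(4, 8), torch.randn(4, 8)).shape)  # torch.Size([12, 8])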
Args: @@ -196,64 +222,43 @@ class BloomSelfAttention(DeepSpeedSelfAttention): mixed_x_layer = qkv_out alibi = alibi.to(get_accelerator().current_device_name()) head_dim = self.hidden_size_per_partition // self.num_attention_heads_per_partition - new_tensor_shape = mixed_x_layer.size()[:-1] + ( - self.num_attention_heads_per_partition, - 3 * head_dim) + new_tensor_shape = mixed_x_layer.size()[:-1] + (self.num_attention_heads_per_partition, 3 * head_dim) mixed_x_layer = mixed_x_layer.view(*new_tensor_shape) query_layer, key_layer, value_layer = self._split_tensor_along_last_dim(mixed_x_layer, 3) # [batch_size, head_dim, q_length, k_length] - output_size = (query_layer.size(0), - query_layer.size(2), - query_layer.size(1), - key_layer.size(1)) + output_size = (query_layer.size(0), query_layer.size(2), query_layer.size(1), key_layer.size(1)) # [batch_size, q_length, num_heads, head_dim] -> [q_length, batch_size * num_heads, head_dim] - query_layer = query_layer.transpose(1, - 2).reshape(output_size[0] * output_size[1], - output_size[2], - -1) + query_layer = query_layer.transpose(1, 2).reshape(output_size[0] * output_size[1], output_size[2], -1) # [batch_size, k_length, num_heads, head_dim] -> [k_length, batch_size * num_heads, head_dim] - key_layer = key_layer.transpose(1, - 2).reshape(output_size[0] * output_size[1], - output_size[3], - -1).transpose(-1, - -2) - value_layer = value_layer.transpose(1, - 2).reshape(output_size[0] * output_size[1], - output_size[3], - -1) + key_layer = key_layer.transpose(1, 2).reshape(output_size[0] * output_size[1], output_size[3], + -1).transpose(-1, -2) + value_layer = value_layer.transpose(1, 2).reshape(output_size[0] * output_size[1], output_size[3], -1) if layer_past is not None: past_key, past_value = layer_past # concatenate along seq_length dimension -> [batch_size, qk_length, num_heads, head_dim] key_layer = torch.cat((past_key.type_as(key_layer), key_layer), dim=-1) - value_layer = torch.cat((past_value.type_as(value_layer), - value_layer), - dim=-2) + value_layer = torch.cat((past_value.type_as(value_layer), value_layer), dim=-2) presents = (key_layer, value_layer) # Raw attention scores. 
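The BLOOM path above reshapes the fused qkv activation to [batch, seq, heads, 3 * head_dim] and splits the last dimension into query/key/value; a compact sketch of that reshape-and-split:

import torch

def split_fused_qkv(mixed, num_heads):
    # mixed: [batch, seq, 3 * hidden]; returns q, k, v of shape [batch, seq, heads, head_dim]
    batch, seq, three_hidden = mixed.shape
    head_dim = three_hidden // (3 * num_heads)
    mixed = mixed.view(batch, seq, num_heads, 3 * head_dim)
    q, k, v = mixed.split(head_dim, dim=-1)
    return q.contiguous(), k.contiguous(), v.contiguous()

q, k, v = split_fused_qkv(torch.randn(2, 5, 3 * 4 * 64), num_heads=4)
print(q.shape)  # torch.Size([2, 5, 4, 64])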
[batch_size * num_heads, q_length, k_length] matmul_result = torch.matmul(query_layer, key_layer) # change view to [batch_size, num_heads, q_length, k_length] - attention_scores = matmul_result.view(output_size[0], - output_size[1], - output_size[2], - -1) - - offset = dist.get_rank( - ) * self.num_attention_heads_per_partition if dist.is_initialized() else 0 - attention_probs = self.softmax_func( - attn_scores=attention_scores, - attn_mask=((1 - input_mask).half() * minus_inf), - alibi=alibi, - triangular=(self.config.triangular_masking - and (attention_scores.shape[-2] > 1)), - recompute=False, - local_attention=False, - window_size=1, - async_op=False, - layer_scale=1 / (self.norm_factor * self.norm_factor), - head_offset=offset) + attention_scores = matmul_result.view(output_size[0], output_size[1], output_size[2], -1) + + offset = dist.get_rank() * self.num_attention_heads_per_partition if dist.is_initialized() else 0 + attention_probs = self.softmax_func(attn_scores=attention_scores, + attn_mask=((1 - input_mask).half() * minus_inf), + alibi=alibi, + triangular=(self.config.triangular_masking + and (attention_scores.shape[-2] > 1)), + recompute=False, + local_attention=False, + window_size=1, + async_op=False, + layer_scale=1 / (self.norm_factor * self.norm_factor), + head_offset=offset) # change view [batch_size x num_heads, q_length, k_length] attention_probs_reshaped = attention_probs.view(*matmul_result.shape) @@ -263,10 +268,8 @@ class BloomSelfAttention(DeepSpeedSelfAttention): # change view [batch_size, num_heads, q_length, head_dim] context_layer = context_layer.view( - context_layer.size(0) // self.num_attention_heads_per_partition, - self.num_attention_heads_per_partition, - context_layer.size(1), - context_layer.shape[-1]) + context_layer.size(0) // self.num_attention_heads_per_partition, self.num_attention_heads_per_partition, + context_layer.size(1), context_layer.shape[-1]) context_layer = self._transpose_for_context(context_layer) key_layer = presents[0] diff --git a/deepspeed/ops/transformer/inference/ds_mlp.py b/deepspeed/ops/transformer/inference/ds_mlp.py index 02d492d8e32b38ed82269fba7ce264d7328984be..a4375178347a87940c1ce9f68735298cb2766c5a 100644 --- a/deepspeed/ops/transformer/inference/ds_mlp.py +++ b/deepspeed/ops/transformer/inference/ds_mlp.py @@ -1,6 +1,7 @@ -''' -Copyright 2022 The Microsoft DeepSpeed Team -''' +# Copyright (c) Microsoft Corporation. 
+# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team import math import torch @@ -11,46 +12,41 @@ from .op_binding import MLPGemmOp, VectorMatMulOp, GELUGemmOp, ResidualAddOp class DeepSpeedMLP(nn.Module): - def __init__(self, - config, - mp_group=None, - q_scales=None, - q_groups=1, - merge_count=1, - mlp_extra_grouping=False): + + def __init__(self, config, mp_group=None, q_scales=None, q_groups=1, merge_count=1, mlp_extra_grouping=False): super(DeepSpeedMLP, self).__init__() self.config = config data_type = torch.int8 if config.q_int8 else torch.half if config.fp16 else torch.float data_type_fp = torch.half if config.fp16 else torch.float device = get_accelerator().current_device_name() - self.attn_nw = nn.Parameter(torch.empty(self.config.hidden_size, - dtype=data_type_fp, - device=device), - requires_grad=False) - self.attn_nb = nn.Parameter(torch.empty(self.config.hidden_size, - dtype=data_type_fp, - device=device), - requires_grad=False) - intm_size_per_partition = self.config.intermediate_size // self.config.mp_size - self.inter_w = nn.Parameter(torch.empty(self.config.hidden_size, - intm_size_per_partition, - dtype=data_type, - device=device), - requires_grad=False) - self.inter_b = nn.Parameter(torch.empty(intm_size_per_partition, - dtype=data_type_fp, - device=device), - requires_grad=False) - self.output_w = nn.Parameter(torch.empty(intm_size_per_partition, - self.config.hidden_size, - dtype=data_type, - device=device), - requires_grad=False) - self.output_b = nn.Parameter(torch.empty(self.config.hidden_size, - dtype=data_type_fp, - device=device), - requires_grad=False) + if self.config.set_empty_params: + self.attn_nw = None + self.attn_nb = None + self.inter_w = None + self.inter_b = None + self.output_w = None + self.output_b = None + else: + self.attn_nw = nn.Parameter(torch.empty(self.config.hidden_size, dtype=data_type_fp, device=device), + requires_grad=False) + self.attn_nb = nn.Parameter(torch.empty(self.config.hidden_size, dtype=data_type_fp, device=device), + requires_grad=False) + intm_size_per_partition = self.config.intermediate_size // self.config.mp_size + self.inter_w = nn.Parameter(torch.empty(self.config.hidden_size, + intm_size_per_partition, + dtype=data_type, + device=device), + requires_grad=False) + self.inter_b = nn.Parameter(torch.empty(intm_size_per_partition, dtype=data_type_fp, device=device), + requires_grad=False) + self.output_w = nn.Parameter(torch.empty(intm_size_per_partition, + self.config.hidden_size, + dtype=data_type, + device=device), + requires_grad=False) + self.output_b = nn.Parameter(torch.empty(self.config.hidden_size, dtype=data_type_fp, device=device), + requires_grad=False) # used for quantization self.q_scales = q_scales @@ -79,16 +75,13 @@ class DeepSpeedMLP(nn.Module): bias=self.inter_b, gamma=self.attn_nw, beta=self.attn_nb) - residual = self.residual_add_func( - hidden_state=output, - residual=residual, - attention_output=input, - attention_bias=bias if bias is not None else self.output_b, - final_bias=self.output_b, - add_bias=bias is not None, - residual_add=residual_add) - + residual = self.residual_add_func(hidden_state=output, + residual=residual, + attention_output=input, + attention_bias=bias if bias is not None else self.output_b, + final_bias=self.output_b, + add_bias=bias is not None, + residual_add=residual_add) if self.mp_group is not None and dist.get_world_size(group=self.mp_group) > 1: dist.all_reduce(residual, group=self.mp_group) - return residual diff --git 
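The MLP above shards the intermediate dimension across mp_size ranks (each rank holds intermediate_size // mp_size columns of inter_w and the matching rows of output_w) and then all-reduces; a single-process sketch of why the partial outputs sum to the full result (GELU is element-wise, so it commutes with the column split):

import torch
import torch.nn.functional as F

hidden, inter, mp_size = 8, 32, 4
x = torch.randn(2, hidden)
w1, w2 = torch.randn(hidden, inter), torch.randn(inter, hidden)  # inter_w, output_w
full = F.gelu(x @ w1) @ w2

shard = inter // mp_size  # what each "rank" keeps; all-reduce sums the partials
partials = [F.gelu(x @ w1[:, r * shard:(r + 1) * shard]) @ w2[r * shard:(r + 1) * shard, :]
            for r in range(mp_size)]
print(torch.allclose(sum(partials), full, atol=1e-4))  # True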
a/deepspeed/ops/transformer/inference/moe_inference.py b/deepspeed/ops/transformer/inference/moe_inference.py index d5e45c7eb00949474a1faf54991999e4af104d16..bf14a5fc36b2e999865455c1b4154144b2b0a0d8 100644 --- a/deepspeed/ops/transformer/inference/moe_inference.py +++ b/deepspeed/ops/transformer/inference/moe_inference.py @@ -1,6 +1,8 @@ -''' -Copyright 2020 The Microsoft DeepSpeed Team -''' +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + import json import math import torch @@ -43,6 +45,7 @@ class DeepSpeedMoEInferenceConfig(DeepSpeedInferenceConfig): scale_attention: If true, both q and k are scaled by 1/sqrt(attention_heads) before attention computation. return_tuple: if True, returns the transformer output as a tuple, otherwise returns as a tensor """ + def __init__(self, hidden_size=-1, intermediate_size=-1, @@ -72,23 +75,10 @@ class DeepSpeedMoEInferenceConfig(DeepSpeedInferenceConfig): mlp_type='standard', scale_attn_by_inverse_layer_idx=False): super(DeepSpeedMoEInferenceConfig, - self).__init__( - hidden_size, - (intermediate_size if intermediate_size > 0 else 4 * hidden_size), - heads, - num_hidden_layers, - layer_norm_eps, - local_rank, - mp_size, - fp16, - q_int8, - pre_layer_norm, - stochastic_mode, - scale_attention, - triangular_masking, - local_attention, - window_size, - return_tuple) + self).__init__(hidden_size, (intermediate_size if intermediate_size > 0 else 4 * hidden_size), heads, + num_hidden_layers, layer_norm_eps, local_rank, mp_size, fp16, q_int8, pre_layer_norm, + stochastic_mode, scale_attention, triangular_masking, local_attention, window_size, + return_tuple) self.moe_experts = moe_experts self.k = k self.capacity_factor = capacity_factor @@ -116,44 +106,21 @@ class DeepSpeedMoEInferenceConfig(DeepSpeedInferenceConfig): class DeepSpeedMLPFunction(Function): + @staticmethod - def forward(ctx, - input, - inter_w, - inter_b, - config, - output_b, - output_w, - q_scales, - q_groups, - merge_count, - mp_group, + def forward(ctx, input, inter_w, inter_b, config, output_b, output_w, q_scales, q_groups, merge_count, mp_group, async_op): if config.q_int8: - intermediate = inference_cuda_module.fused_gemm_gelu_int8( - input, - inter_w, - inter_b, - config.epsilon, - q_scales[2], - (q_groups * (2**merge_count)), - config.pre_layer_norm) - output = inference_cuda_module.vector_matmul_int8(intermediate, - output_w, - q_scales[3], - q_groups, + intermediate = inference_cuda_module.fused_gemm_gelu_int8(input, inter_w, inter_b, config.epsilon, + q_scales[2], (q_groups * (2**merge_count)), + config.pre_layer_norm) + output = inference_cuda_module.vector_matmul_int8(intermediate, output_w, q_scales[3], q_groups, (merge_count)) else: mlp_gemm_func = inference_cuda_module.fused_gemm_gelu_fp16 if config.fp16 else \ inference_cuda_module.fused_gemm_gelu_fp32 - output = mlp_gemm_func(input, - inter_w, - inter_b, - output_w, - config.epsilon, - config.pre_layer_norm, - async_op) + output = mlp_gemm_func(input, inter_w, inter_b, output_w, config.epsilon, config.pre_layer_norm, async_op) if mp_group is not None and dist.get_world_size(group=mp_group) > 1: dist.all_reduce(output, group=mp_group, async_op=async_op) @@ -166,24 +133,17 @@ class DeepSpeedMLPFunction(Function): class DeepSpeedMoEMLP(nn.Module): - def __init__(self, - config, - q_scales=None, - q_groups=1, - merge_count=1, - mlp_extra_grouping=False, - mp_group=None): + + def __init__(self, config, q_scales=None, q_groups=1, merge_count=1, mlp_extra_grouping=False, 
mp_group=None): super(DeepSpeedMoEMLP, self).__init__() self.config = config self.attn_nw = nn.Parameter(torch.Tensor(self.config.hidden_size)) self.attn_nb = nn.Parameter(torch.Tensor(self.config.hidden_size)) - interm_size = self.config.intermediate_size // ( - 1 if mp_group is None else dist.get_world_size(group=mp_group)) + interm_size = self.config.intermediate_size // (1 if mp_group is None else dist.get_world_size(group=mp_group)) self.inter_w = nn.Parameter(torch.Tensor(self.config.hidden_size, interm_size)) self.inter_b = nn.Parameter(torch.Tensor(interm_size)) - self.output_w = nn.Parameter(torch.Tensor((interm_size), - self.config.hidden_size)) + self.output_w = nn.Parameter(torch.Tensor((interm_size), self.config.hidden_size)) self.output_b = nn.Parameter(torch.Tensor(self.config.hidden_size)) # used for quantization @@ -193,17 +153,8 @@ class DeepSpeedMoEMLP(nn.Module): self.mp_group = mp_group def forward(self, input, async_op=False): - return DeepSpeedMLPFunction.apply(input, - self.inter_w, - self.inter_b, - self.config, - self.output_b, - self.output_w, - self.q_scales, - self.q_groups, - self.merge_count, - self.mp_group, - async_op) + return DeepSpeedMLPFunction.apply(input, self.inter_w, self.inter_b, self.config, self.output_b, self.output_w, + self.q_scales, self.q_groups, self.merge_count, self.mp_group, async_op) class DeepSpeedMoEInference(nn.Module): @@ -251,11 +202,7 @@ class DeepSpeedMoEInference(nn.Module): self.config.specialized_mode = specialized_mode DeepSpeedMoEInference.layer_id += 1 - self.attention = DeepSpeedSelfAttention(self.config, - mp_group, - quantize_scales, - quantize_groups, - merge_count) + self.attention = DeepSpeedSelfAttention(self.config, mp_group, quantize_scales, quantize_groups, merge_count) self.attn_nw = nn.Parameter(torch.Tensor(self.config.hidden_size)) self.attn_nb = nn.Parameter(torch.Tensor(self.config.hidden_size)) @@ -263,11 +210,7 @@ class DeepSpeedMoEInference(nn.Module): self.norm_b = nn.Parameter(torch.Tensor(self.config.hidden_size)) if config.mlp_type == 'residual': - self.res_mlp = DeepSpeedMoEMLP(config, - quantize_scales, - quantize_groups, - merge_count, - mlp_extra_grouping, + self.res_mlp = DeepSpeedMoEMLP(config, quantize_scales, quantize_groups, merge_count, mlp_extra_grouping, mp_group) self.res_coef = nn.Parameter(torch.Tensor(self.config.hidden_size, 2)) self.coef_func = inference_cuda_module.softmax_fp16 if self.config.fp16 or self.config.q_int8 else \ @@ -277,21 +220,12 @@ class DeepSpeedMoEInference(nn.Module): config.mp_size = 1 self.mlp = nn.ModuleList( - DeepSpeedMoEMLP(config, - quantize_scales, - quantize_groups, - merge_count, - mlp_extra_grouping, - expert_mp_group) for i in range(self.config.moe_experts)) - - self.moe_gate = TopKGate(self.config.hidden_size, - self.config.global_experts, - self.config.k, - self.config.capacity_factor, - self.config.eval_capacity_factor, - self.config.min_capacity, - self.config.noisy_gate_policy, - self.config.drop_tokens, + DeepSpeedMoEMLP(config, quantize_scales, quantize_groups, merge_count, mlp_extra_grouping, expert_mp_group) + for i in range(self.config.moe_experts)) + + self.moe_gate = TopKGate(self.config.hidden_size, self.config.global_experts, self.config.k, + self.config.capacity_factor, self.config.eval_capacity_factor, + self.config.min_capacity, self.config.noisy_gate_policy, self.config.drop_tokens, self.config.use_rts) self.ep_group = ep_group @@ -315,19 +249,14 @@ class DeepSpeedMoEInference(nn.Module): _, combined_weights, dispatch_mask, _ = 
self.moe_gate( attention_output.view(-1, self.config.hidden_size), None, - ) - dispatched_attention = self.einsum_sec_sm_ecm( - dispatch_mask.type_as(attention_output), - attention_output.view(-1, - self.config.hidden_size)) + ) + dispatched_attention = self.einsum_sec_sm_ecm(dispatch_mask.type_as(attention_output), + attention_output.view(-1, self.config.hidden_size)) return dispatched_attention, combined_weights def expert_exec(self, dispatched_input): - dispatched_input = dispatched_input.reshape( - self.config.global_experts // self.config.moe_experts, - self.config.moe_experts, - -1, - self.config.hidden_size) + dispatched_input = dispatched_input.reshape(self.config.global_experts // self.config.moe_experts, + self.config.moe_experts, -1, self.config.hidden_size) chunks = dispatched_input.chunk(self.config.moe_experts, dim=1) expert_outputs = torch.empty(( @@ -337,29 +266,22 @@ class DeepSpeedMoEInference(nn.Module): dtype=dispatched_input.dtype, device=dispatched_input.device) for chunk, expert in zip(chunks, range(len(self.mlp))): - expert_outputs[expert] = self.mlp[expert](chunk.view( - -1, - dispatched_input.shape[-2], - dispatched_input.shape[-1])) + expert_outputs[expert] = self.mlp[expert](chunk.view(-1, dispatched_input.shape[-2], + dispatched_input.shape[-1])) return expert_outputs def _alltoall(self, dispatched_attention): if dist.get_world_size(group=self.ep_group) > 1: dispatched_input = torch.empty_like(dispatched_attention) - dist.all_to_all_single(dispatched_input, - dispatched_attention, - group=self.ep_group) + dist.all_to_all_single(dispatched_input, dispatched_attention, group=self.ep_group) return dispatched_input else: return dispatched_attention def scale_expert_output(self, attention_output, expert_output, combined_weights): combined_output = torch.matmul( - combined_weights.type_as(attention_output).reshape( - combined_weights.shape[0], - -1), - expert_output.reshape(-1, - expert_output.shape[-1])) + combined_weights.type_as(attention_output).reshape(combined_weights.shape[0], -1), + expert_output.reshape(-1, expert_output.shape[-1])) return combined_output.reshape(attention_output.shape) def forward(self, @@ -385,16 +307,9 @@ class DeepSpeedMoEInference(nn.Module): input = input.half() with torch.no_grad(): - attention_output = self.attention(input, - input_mask, - head_mask, - layer_past, - get_present, - encoder_hidden_states, - encoder_attention_mask, - output_attentions, - self.norm_w, - self.norm_b) + attention_output = self.attention(input, input_mask, head_mask, layer_past, get_present, + encoder_hidden_states, encoder_attention_mask, output_attentions, + self.norm_w, self.norm_b) if get_present: attention_output, p_key, p_value = attention_output[0:3] @@ -405,10 +320,7 @@ class DeepSpeedMoEInference(nn.Module): attention_output = attention_output[0] residual_add = attention_output + self.attention.attn_ob - attention_output = self.ds_layernorm(residual_add, - self.attn_nw, - self.attn_nb, - self.config.epsilon) + attention_output = self.ds_layernorm(residual_add, self.attn_nw, self.attn_nb, self.config.epsilon) if self.config.mlp_type == 'residual': res_mlp_out = self.res_mlp(attention_output, async_op=True) @@ -416,13 +328,10 @@ class DeepSpeedMoEInference(nn.Module): if self.expert_mp_group is not None: tensor_list = [ - torch.empty_like(attention_output) - for _ in range(dist.get_world_size(group=self.expert_mp_group)) + torch.empty_like(attention_output) for _ in range(dist.get_world_size(group=self.expert_mp_group)) ] 
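The gating path above dispatches tokens into per-expert capacity slots with an 'sec,sm->ecm' einsum and later recombines expert outputs with the gate weights; a dense sketch of dispatch and combine (capacity handling and the all-to-all are omitted, and the routing here is a toy round-robin):

import torch

tokens, experts, capacity, hidden = 6, 2, 3, 4
x = torch.randn(tokens, hidden)                          # flattened activations [s, m]
dispatch_mask = torch.zeros(tokens, experts, capacity)   # one-hot routing [s, e, c]
for t in range(tokens):
    dispatch_mask[t, t % experts, t // experts] = 1.0
combine_weights = dispatch_mask * 0.9                    # gate probabilities, same layout

dispatched = torch.einsum('sec,sm->ecm', dispatch_mask, x)    # per-expert slots [e, c, m]
expert_out = dispatched * 2.0                                 # stand-in for the expert MLPs
combined = torch.matmul(combine_weights.reshape(tokens, -1),  # [s, e*c] @ [e*c, m]
                        expert_out.reshape(-1, hidden))
print(combined.shape)  # torch.Size([6, 4])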
tensor_list[dist.get_rank(group=self.expert_mp_group)] = attention_output - dist.all_gather(tensor_list, - attention_output, - group=self.expert_mp_group) + dist.all_gather(tensor_list, attention_output, group=self.expert_mp_group) attention_output = torch.cat(tensor_list).contiguous() ############## MoE Gating + Experts ############### @@ -430,14 +339,11 @@ class DeepSpeedMoEInference(nn.Module): dispatched_input = self._alltoall(dispatched_attention) expert_outputs = self.expert_exec(dispatched_input) expert_output = self._alltoall(expert_outputs) - output = self.scale_expert_output(attention_output, - expert_output, - combined_weights) + output = self.scale_expert_output(attention_output, expert_output, combined_weights) ################################################ if self.expert_mp_group is not None: - output = output.split(output.shape[0] // - dist.get_world_size(group=self.expert_mp_group), + output = output.split(output.shape[0] // dist.get_world_size(group=self.expert_mp_group), dim=0)[dist.get_rank(group=self.expert_mp_group)] if self.config.mlp_type == 'residual': @@ -446,10 +352,7 @@ class DeepSpeedMoEInference(nn.Module): output = self.bias_residual_func(output, residual_add, torch.empty(1)) if not self.config.pre_layer_norm: - output = self.ds_layernorm(output, - self.norm_w, - self.norm_b, - self.config.epsilon) + output = self.ds_layernorm(output, self.norm_w, self.norm_b, self.config.epsilon) if input_type != output.dtype: output = output.to(input_type) diff --git a/deepspeed/ops/transformer/inference/op_binding/__init__.py b/deepspeed/ops/transformer/inference/op_binding/__init__.py index 890290692e66350b6a8843f6444994a4a696116e..20b7bf12a917865a753d9db41851562edc8cb337 100644 --- a/deepspeed/ops/transformer/inference/op_binding/__init__.py +++ b/deepspeed/ops/transformer/inference/op_binding/__init__.py @@ -1,4 +1,7 @@ -'''Copyright The Microsoft DeepSpeed Team''' +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team from .linear import LinearOp from .vector_matmul import VectorMatMulOp diff --git a/deepspeed/ops/transformer/inference/op_binding/base.py b/deepspeed/ops/transformer/inference/op_binding/base.py index ad29e0f773cbbc33138de257711eebe8d7370fdc..1bdfdeeb5fb12f166ad0db8521410e3647fff2eb 100644 --- a/deepspeed/ops/transformer/inference/op_binding/base.py +++ b/deepspeed/ops/transformer/inference/op_binding/base.py @@ -1,4 +1,7 @@ -'''Copyright The Microsoft DeepSpeed Team''' +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team import torch from ..config import DeepSpeedInferenceConfig diff --git a/deepspeed/ops/transformer/inference/op_binding/gelu_gemm.py b/deepspeed/ops/transformer/inference/op_binding/gelu_gemm.py index 9ab4ef92687029b94b246ad33aa583362ee66c8d..06fd6f560cb051b06e002455d487357dea867e3f 100644 --- a/deepspeed/ops/transformer/inference/op_binding/gelu_gemm.py +++ b/deepspeed/ops/transformer/inference/op_binding/gelu_gemm.py @@ -1,4 +1,7 @@ -'''Copyright The Microsoft DeepSpeed Team''' +# Copyright (c) Microsoft Corporation. 
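The reflowed `_alltoall` and `expert_exec` hunks above implement the usual MoE exchange: gated tokens are dispatched, swapped across the expert-parallel group, run through the local experts, and swapped back before being recombined. A minimal sketch of that exchange step, written against plain `torch.distributed` rather than the module's own `dist` alias so the collective involved is explicit (the function name is illustrative, not part of the diff):

```python
import torch
import torch.distributed as dist

def exchange_dispatched_tokens(dispatched: torch.Tensor, ep_group) -> torch.Tensor:
    # Mirrors _alltoall above: skip the collective when the expert-parallel
    # group has a single rank, otherwise swap token shards across ranks.
    if dist.get_world_size(group=ep_group) > 1:
        received = torch.empty_like(dispatched)
        dist.all_to_all_single(received, dispatched, group=ep_group)
        return received
    return dispatched
```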
+# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team import torch from ..config import DeepSpeedInferenceConfig @@ -6,6 +9,7 @@ from .base import BaseOp class GELUGemmOp(BaseOp): + def __init__(self, config: DeepSpeedInferenceConfig): super(GELUGemmOp, self).__init__(config) if self.config.fp16: @@ -19,14 +23,9 @@ class GELUGemmOp(BaseOp): bias: torch.Tensor, weight_out: torch.Tensor, async_op: bool = False): - output = self.fused_gemm_gelu(input, - weight, - weight.scale, - bias, - weight_out, - weight_out.scale, - self.config.epsilon, - self.config.pre_layer_norm, - self.config.q_int8, - async_op) + output = self.fused_gemm_gelu(input, weight, weight.scale if hasattr(weight, 'scale') else torch.empty(1), + bias, weight_out, + weight_out.scale if hasattr(weight_out, 'scale') else torch.empty(1), + self.config.epsilon, self.config.pre_layer_norm, self.config.q_int8, async_op, + self.config.transposed_mode) return output diff --git a/deepspeed/ops/transformer/inference/op_binding/linear.py b/deepspeed/ops/transformer/inference/op_binding/linear.py index 6d83ffce22bdc266a079d40c53eab119ce4249d8..9178c5f1fc5b5c9eb8e471b76056ad594e1ab541 100644 --- a/deepspeed/ops/transformer/inference/op_binding/linear.py +++ b/deepspeed/ops/transformer/inference/op_binding/linear.py @@ -1,4 +1,7 @@ -'''Copyright The Microsoft DeepSpeed Team''' +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team import torch from ..config import DeepSpeedInferenceConfig @@ -6,6 +9,7 @@ from .base import BaseOp class LinearOp(BaseOp): + def __init__(self, config: DeepSpeedInferenceConfig): super(LinearOp, self).__init__(config) if self.config.fp16: @@ -22,10 +26,6 @@ class LinearOp(BaseOp): num_heads: int, external_cache: bool = None, num_layers: int = None): - qkv_out = self.linear_func(input, - weight, - bias, - add_bias, - do_flash_attn, - num_heads) + qkv_out = self.linear_func(input, weight, bias, add_bias, do_flash_attn, num_heads, + self.config.transposed_mode) return qkv_out diff --git a/deepspeed/ops/transformer/inference/op_binding/mlp_gemm.py b/deepspeed/ops/transformer/inference/op_binding/mlp_gemm.py index 4df8ef52c3fb94b3af5ad8fc829e8b68e34f6dcb..e7ca40219c34757f21703a2fde3cffd0d84e0e0e 100644 --- a/deepspeed/ops/transformer/inference/op_binding/mlp_gemm.py +++ b/deepspeed/ops/transformer/inference/op_binding/mlp_gemm.py @@ -1,4 +1,7 @@ -'''Copyright The Microsoft DeepSpeed Team''' +# Copyright (c) Microsoft Corporation. 
+# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team import torch from ..config import DeepSpeedInferenceConfig @@ -6,6 +9,7 @@ from .base import BaseOp class MLPGemmOp(BaseOp): + def __init__(self, config: DeepSpeedInferenceConfig): super(MLPGemmOp, self).__init__(config) if self.config.fp16: @@ -13,29 +17,13 @@ class MLPGemmOp(BaseOp): else: self.mlp_gemm_func = self.inference_cuda_module.mlp_gemm_fp32 - def forward(self, - input: torch.Tensor, - residual: torch.Tensor, - input_bias: torch.Tensor, - weight_interm: torch.Tensor, - weight_out: torch.Tensor, - bias: torch.Tensor, - gamma: torch.Tensor, + def forward(self, input: torch.Tensor, residual: torch.Tensor, input_bias: torch.Tensor, + weight_interm: torch.Tensor, weight_out: torch.Tensor, bias: torch.Tensor, gamma: torch.Tensor, beta: torch.Tensor): output, residual_add = self.mlp_gemm_func( - input, - residual, - input_bias, - weight_interm, - weight_out, - bias, - gamma, - beta, - self.config.epsilon, - self.config.pre_layer_norm, - self.config.mlp_after_attn, - weight_interm.scale, - weight_out.scale, - self.config.q_int8, - self.config.mlp_act_func_type) + input, residual, input_bias, weight_interm, weight_out, bias, gamma, beta, self.config.epsilon, + self.config.pre_layer_norm, self.config.mlp_after_attn, + weight_interm.scale if hasattr(weight_interm, 'scale') else torch.empty(1), + weight_out.scale if hasattr(weight_out, 'scale') else torch.empty(1), self.config.q_int8, + self.config.mlp_act_func_type, self.config.transposed_mode) return output, residual_add diff --git a/deepspeed/ops/transformer/inference/op_binding/qkv_gemm.py b/deepspeed/ops/transformer/inference/op_binding/qkv_gemm.py index d50dbfd3e7edcb1ab3ba5ce021135143fd5df90d..6b338b9041d9a308291204373ec09eb281d5d30c 100644 --- a/deepspeed/ops/transformer/inference/op_binding/qkv_gemm.py +++ b/deepspeed/ops/transformer/inference/op_binding/qkv_gemm.py @@ -1,4 +1,7 @@ -'''Copyright The Microsoft DeepSpeed Team''' +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team import torch from ..config import DeepSpeedInferenceConfig @@ -7,6 +10,7 @@ from deepspeed import comm as dist class QKVGemmOp(BaseOp): + def __init__(self, config: DeepSpeedInferenceConfig): super(QKVGemmOp, self).__init__(config) if self.config.fp16: @@ -24,21 +28,11 @@ class QKVGemmOp(BaseOp): num_layers: int, num_heads: int = None, max_out_tokens: int = None): - q_scale = weight.scale + q_scale = weight.scale if hasattr(weight, 'scale') else torch.empty(1) external_cache = self.config.bigscience_bloom rank = dist.get_rank() if dist.is_initialized() else 0 q_int8 = self.config.q_int8 - output = self.qkv_gemm_func(input, - weight, - q_scale, - bias, - gamma, - beta, - self.config.epsilon, - add_bias, - num_layers, - external_cache, - self.config.mp_size, - rank, - q_int8) + output = self.qkv_gemm_func(input, weight, q_scale, bias, gamma, beta, self.config.epsilon, add_bias, + num_layers, external_cache, self.config.mp_size, rank, q_int8, + self.config.transposed_mode) return output diff --git a/deepspeed/ops/transformer/inference/op_binding/residual_add.py b/deepspeed/ops/transformer/inference/op_binding/residual_add.py index 0fb1741a0095cad76dbcf5a4571b30a70d02a5c3..e79f5dee5c54aaff0728440a4f2c6b4339494cea 100644 --- a/deepspeed/ops/transformer/inference/op_binding/residual_add.py +++ b/deepspeed/ops/transformer/inference/op_binding/residual_add.py @@ -1,4 +1,7 @@ -'''Copyright The Microsoft DeepSpeed Team''' +# Copyright (c) Microsoft Corporation. 
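A pattern repeated across the GELU, MLP and QKV GEMM bindings in this change: `weight.scale` is no longer read unconditionally, so plain fp16/fp32 tensors that carry no quantization scale fall back to a placeholder instead of raising. A minimal sketch of the guard (the helper name is illustrative, not part of the diff):

```python
import torch

def get_q_scale(weight: torch.Tensor) -> torch.Tensor:
    # Quantized weights carry a `scale` attribute; unquantized weights do not,
    # so hand the kernel a placeholder tensor instead of raising AttributeError.
    return weight.scale if hasattr(weight, 'scale') else torch.empty(1)
```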
+# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team import torch from ..config import DeepSpeedInferenceConfig @@ -6,6 +9,7 @@ from .base import BaseOp class ResidualAddOp(BaseOp): + def __init__(self, config: DeepSpeedInferenceConfig): super(ResidualAddOp, self).__init__(config) if self.config.fp16 or self.config.q_int8: @@ -13,26 +17,13 @@ class ResidualAddOp(BaseOp): else: self.residual_add_func = self.inference_cuda_module.residual_add_bias_fp32 - def forward(self, - hidden_state: torch.Tensor, - residual: torch.Tensor, - attention_output: torch.Tensor, - attention_bias: torch.Tensor, - final_bias: torch.Tensor, - add_bias: bool, - residual_add: torch.Tensor): + def forward(self, hidden_state: torch.Tensor, residual: torch.Tensor, attention_output: torch.Tensor, + attention_bias: torch.Tensor, final_bias: torch.Tensor, add_bias: bool, residual_add: torch.Tensor): if not self.config.pre_layer_norm and residual_add is not None: # only use residual add if its set and we are not pre layer norm residual = residual_add - self.residual_add_func(hidden_state, - residual, - attention_output, - attention_bias, - final_bias, - self.config.mp_size, - self.config.mlp_after_attn, - add_bias, - self.config.pre_layer_norm) + self.residual_add_func(hidden_state, residual, attention_output, attention_bias, final_bias, + self.config.mp_size, self.config.mlp_after_attn, add_bias, self.config.pre_layer_norm) return residual diff --git a/deepspeed/ops/transformer/inference/op_binding/softmax.py b/deepspeed/ops/transformer/inference/op_binding/softmax.py index 4d58ba4a4825246e4b37afb166db08072cc0af09..529df9ed6181e3bdce34d89ee7bd886584ebe912 100644 --- a/deepspeed/ops/transformer/inference/op_binding/softmax.py +++ b/deepspeed/ops/transformer/inference/op_binding/softmax.py @@ -1,4 +1,7 @@ -'''Copyright The Microsoft DeepSpeed Team''' +# Copyright (c) Microsoft Corporation. 
+# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team import torch from ..config import DeepSpeedInferenceConfig @@ -6,6 +9,7 @@ from .base import BaseOp class SoftmaxOp(BaseOp): + def __init__(self, config: DeepSpeedInferenceConfig): super(SoftmaxOp, self).__init__(config) if self.config.fp16: @@ -16,26 +20,9 @@ class SoftmaxOp(BaseOp): def _not_implemented(self, *args, **kwargs): raise NotImplementedError - def forward(self, - attn_scores: torch.Tensor, - attn_mask: torch.Tensor, - alibi: torch.Tensor, - triangular: bool, - recompute: bool, - local_attention: bool, - window_size: int, - async_op: bool, - layer_scale: float, + def forward(self, attn_scores: torch.Tensor, attn_mask: torch.Tensor, alibi: torch.Tensor, triangular: bool, + recompute: bool, local_attention: bool, window_size: int, async_op: bool, layer_scale: float, head_offset: int): - output = self.softmax_func(attn_scores, - attn_mask, - alibi, - triangular, - recompute, - local_attention, - window_size, - async_op, - layer_scale, - head_offset, - self.config.mp_size) + output = self.softmax_func(attn_scores, attn_mask, alibi, triangular, recompute, local_attention, window_size, + async_op, layer_scale, head_offset, self.config.mp_size) return output diff --git a/deepspeed/ops/transformer/inference/op_binding/softmax_context.py b/deepspeed/ops/transformer/inference/op_binding/softmax_context.py index 818af5f34a23de1637ccb572ce01008b5e5d82d4..1a132982aba6b33cead2e9513697d5c2f737ee3c 100644 --- a/deepspeed/ops/transformer/inference/op_binding/softmax_context.py +++ b/deepspeed/ops/transformer/inference/op_binding/softmax_context.py @@ -1,4 +1,7 @@ -'''Copyright The Microsoft DeepSpeed Team''' +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team import torch from deepspeed import comm as dist @@ -7,6 +10,7 @@ from .base import BaseOp class SoftmaxContextOp(BaseOp): + def __init__(self, config: DeepSpeedInferenceConfig): super(SoftmaxContextOp, self).__init__(config) if self.config.fp16: @@ -14,15 +18,8 @@ class SoftmaxContextOp(BaseOp): else: self.softmax_context_func = self.inference_cuda_module.softmax_context_fp32 - def forward(self, - query_key_value: torch.Tensor, - attn_mask: torch.Tensor, - heads: int, - norm_factor: float, - no_masking: bool, - layer_id: int, - num_layers: int, - alibi: torch.Tensor): + def forward(self, query_key_value: torch.Tensor, attn_mask: torch.Tensor, heads: int, norm_factor: float, + no_masking: bool, layer_id: int, num_layers: int, alibi: torch.Tensor): if alibi is not None: batch_heads = query_key_value.shape[0] * heads @@ -31,18 +28,8 @@ class SoftmaxContextOp(BaseOp): else: alibi = torch.empty(1) - output = self.softmax_context_func(query_key_value, - attn_mask, - self.config.rotary_dim, - self.config.rotate_half, - self.config.rotate_every_two, - heads, - norm_factor, - self.config.triangular_masking, - self.config.local_attention, - self.config.window_size, - no_masking, - layer_id, - num_layers, - alibi) + output = self.softmax_context_func(query_key_value, attn_mask, self.config.rotary_dim, self.config.rotate_half, + self.config.rotate_every_two, heads, norm_factor, + self.config.triangular_masking, self.config.local_attention, + self.config.window_size, no_masking, layer_id, num_layers, alibi) return output diff --git a/deepspeed/ops/transformer/inference/op_binding/vector_matmul.py b/deepspeed/ops/transformer/inference/op_binding/vector_matmul.py index c22b61cbd7e577e9ff4fb2aa028ea1fd32386efc..f916020baa9e04294d0f5051f8dbab59c39f1304 
100644 --- a/deepspeed/ops/transformer/inference/op_binding/vector_matmul.py +++ b/deepspeed/ops/transformer/inference/op_binding/vector_matmul.py @@ -1,4 +1,7 @@ -'''Copyright The Microsoft DeepSpeed Team''' +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team import torch from ..config import DeepSpeedInferenceConfig @@ -6,6 +9,7 @@ from .base import BaseOp class VectorMatMulOp(BaseOp): + def __init__(self, config: DeepSpeedInferenceConfig): super(VectorMatMulOp, self).__init__(config) if self.config.fp16: @@ -14,7 +18,7 @@ class VectorMatMulOp(BaseOp): self.vector_matmul_func = self.inference_cuda_module.vector_matmul_fp32 def forward(self, input: torch.Tensor, weight: torch.Tensor, async_op: bool = False): - q_scale = weight.scale + q_scale = weight.scale if hasattr(weight, 'scale') else torch.empty(1) q_int8 = self.config.q_int8 - output = self.vector_matmul_func(input, weight, async_op, q_scale, q_int8) + output = self.vector_matmul_func(input, weight, async_op, q_scale, q_int8, self.config.transposed_mode) return output diff --git a/deepspeed/ops/transformer/inference/transformer_inference.py b/deepspeed/ops/transformer/inference/transformer_inference.py deleted file mode 100644 index aed03148e919533fec5281d0a9337e9a1c7af9f7..0000000000000000000000000000000000000000 --- a/deepspeed/ops/transformer/inference/transformer_inference.py +++ /dev/null @@ -1,667 +0,0 @@ -''' -Copyright 2020 The Microsoft DeepSpeed Team -''' -import json -import math -import importlib -import torch -from torch import nn -from torch.autograd import Function -import time -from ... import op_builder -import torch.nn as nn -import torch.distributed as dist -# Cuda modules will be imported if needed -inference_cuda_module = None - - -class TransformerConfig(): - def __init__(self, hidden_size, intermediate_size, heads, num_hidden_layers): - self.layer_id = -1 - self.hidden_size = hidden_size - self.intermediate_size = intermediate_size - self.heads = heads - self.num_hidden_layers = num_hidden_layers - - -class DeepSpeedInferenceConfig(TransformerConfig): - """Initialize the DeepSpeed Transformer Config. - Arguments: - hidden_size: The hidden size of the transformer layer - intermediate_size: The intermediate size of the feed-forward part of transformer layer - heads: The number of heads in the self-attention of the transformer layer - num_hidden_layers: The number of transformer layers - layer_norm_eps: The epsilon value for the layer norm - local_rank: Optional: The rank of GPU running the transformer kernel, it is not required - to use if the model already set the current device, otherwise need to set it - so that the transformer kernel can work on the right device - mp_size (optional): This argument is mainly used to create the parameters on the kernel side - using model-parallel architecture. If the client model already takes care of this, there is no - need to pass this argument. - fp16: Enable half-precision computation - pre_layer_norm: Select between Pre-LN or Post-LN transformer architecture - stochastic_mode: Enable for high performance, please note that this flag has some level of - non-determinism and can produce different results on different runs. However, we have seen - that by enabling it, the pretraining tasks such as BERT are not affected and can obtain - a high accuracy level. 
On the other hand, for the downstream tasks, such as fine-tuning, we recommend - to turn it off in order to be able to reproduce the same result through the regular kernel execution. - - scale_attention: If true, both q and k are scaled by 1/sqrt(attention_heads) before attention computation. - return_tuple: if True, returns the transformer output as a tuple, otherwise returns as a tensor - """ - def __init__(self, - hidden_size=-1, - intermediate_size=-1, - heads=-1, - num_hidden_layers=-1, - layer_norm_eps=1e-12, - local_rank=-1, - mp_size=1, - fp16=False, - q_int8=False, - pre_layer_norm=True, - stochastic_mode=False, - scale_attention=True, - triangular_masking=True, - local_attention=False, - window_size=256, - rotary_dim=-1, - rotate_half=False, - rotate_every_two=True, - return_tuple=True, - mlp_after_attn=True, - training_mp_size=1): - super(DeepSpeedInferenceConfig, - self).__init__( - hidden_size, - (intermediate_size if intermediate_size > 0 else 4 * hidden_size), - heads, - num_hidden_layers) - self.fp16 = fp16 - self.pre_layer_norm = pre_layer_norm - self.local_rank = local_rank - self.stochastic_mode = stochastic_mode - self.epsilon = layer_norm_eps - self.mp_size = mp_size - self.q_int8 = q_int8 - self.scale_attention = scale_attention - self.triangular_masking = triangular_masking - self.local_attention = local_attention - self.window_size = window_size - self.rotary_dim = rotary_dim - self.rotate_half = rotate_half - self.rotate_every_two = rotate_every_two - self.return_tuple = return_tuple - self.mlp_after_attn = mlp_after_attn - self.specialized_mode = False - self.training_mp_size = training_mp_size - - @classmethod - def from_dict(cls, json_object): - config = DeepSpeedInferenceConfig() - for key, value in json_object.items(): - config.__dict__[key] = value - return config - - @classmethod - def from_json_file(cls, json_file): - with open(json_file, "r", encoding='utf-8') as reader: - text = reader.read() - return cls.from_dict(json.loads(text)) - - -class DeepSpeedSelfAttentionFunction(Function): - @staticmethod - def forward(ctx, - input, - input_mask, - head_mask, - layer_past, - get_present, - encoder_hidden_states, - encoder_attention_mask, - output_attentions, - norm_w, - norm_b, - config, - attn_qkvw, - attn_qkvb, - num_attention_heads_per_partition, - norm_factor, - hidden_size_per_partition, - attn_ow, - attn_ob, - mp_group, - q_scales, - q_groups, - merge_count, - qkv_merging): - def _transpose_for_scores(x, key=False, reshape=False): - attention_head_size = x.shape[-1] // num_attention_heads_per_partition - new_x_shape = x.size()[:-1] + (num_attention_heads_per_partition, - attention_head_size) - x_1 = x.view(*new_x_shape) - if key: - x_1 = x_1.permute(0, 2, 3, 1) - else: - x_1 = x_1.permute(0, 2, 1, 3) - if reshape: - return x_1.reshape(x.shape) - return x_1.contiguous() - - def _transpose_for_context(x): - x = x.permute(0, 2, 1, 3).contiguous() - new_x_layer_shape = x.size()[:-2] + \ - (hidden_size_per_partition,) - return x.view(*new_x_layer_shape).contiguous() - - def compute_attention(qkv_out, input_mask): - score_context_func = inference_cuda_module.softmax_context_fp32 if (not config.fp16) else \ - inference_cuda_module.softmax_context_fp16 - - if merge_count > 0 and config.q_int8: - split_dim = (qkv_out.dim() - 1) - qkv_split = torch.split(qkv_out, - (qkv_out.shape[-1] // (2**merge_count)), - dim=split_dim) - qkv_split = [ - torch.split(s, - (s.shape[-1] // 3), - dim=split_dim) for s in qkv_split - ] - (mixed_query, - key_layer, - value_layer) = [ - 
torch.cat([s[i] for s in qkv_split], - axis=-1) for i in range(len(qkv_split[0])) - ] - else: - (mixed_query, - key_layer, - value_layer) = torch.split(qkv_out, - (qkv_out.shape[-1] // 3), - dim=(qkv_out.dim() - 1)) - no_masking = input_mask is None - if no_masking: - input_mask = torch.empty(1) - head_size = (mixed_query.shape[-1] // num_attention_heads_per_partition) - - unfused_mode = not config.specialized_mode or \ - mixed_query.shape[1] >= 32 or head_size > 128 - - if config.rotary_dim > 0: - mixed_query, key_layer = inference_cuda_module.apply_rotary_pos_emb( - mixed_query, - key_layer, - config.rotary_dim, - 0 if layer_past is None else layer_past[0].shape[-2], - num_attention_heads_per_partition, - config.rotate_half, - config.rotate_every_two) - if layer_past is not None: - past_key, past_value = layer_past - if unfused_mode: - key_layer = torch.cat((past_key.type_as(key_layer), - key_layer), - dim=-2) - value_layer = torch.cat((past_value.type_as(value_layer), - value_layer), - dim=-2) - presents = (key_layer, value_layer) - if unfused_mode: - mixed_query = _transpose_for_scores(mixed_query, False, True) - key_layer = _transpose_for_scores( - key_layer, - True, - True) / (norm_factor if config.scale_attention else 1.0) - value_layer = _transpose_for_scores(value_layer, False, True) - #print(f'[{torch.distributed.get_rank()}] {config.layer_id}: {mixed_query.norm()}') - if layer_past is None: - attn_key_value = score_context_func( - mixed_query, - key_layer, - torch.empty(1), - input_mask, - value_layer, - torch.empty(1), - num_attention_heads_per_partition, - (1 / norm_factor if config.scale_attention else 1.0), - (not unfused_mode), - config.triangular_masking, - config.local_attention, - config.window_size, - no_masking) - else: - attn_key_value = score_context_func( - mixed_query, - (key_layer if unfused_mode else past_key.type_as(key_layer)), - key_layer, - input_mask, - (value_layer if unfused_mode else past_value.type_as(value_layer)), - value_layer, - num_attention_heads_per_partition, - (1 / norm_factor if config.scale_attention else 1.0), - (not unfused_mode), - config.triangular_masking, - config.local_attention, - config.window_size, - no_masking) - if unfused_mode: - context_layer, _, _ = attn_key_value - else: - context_layer, key_layer, value_layer = attn_key_value - - # Transpose Context - context_layer = _transpose_for_context(context_layer) - - return context_layer, presents[0], presents[1] # atten_output, key_layer, value_layer - - def selfAttention_fp(): - vector_matmul_func = inference_cuda_module.vector_matmul_fp16 if config.fp16 else \ - inference_cuda_module.vector_matmul_fp32 - if not config.pre_layer_norm: - linear_func = inference_cuda_module.linear_layer_fp16 if config.fp16 else \ - inference_cuda_module.linear_layer_fp32 - - qkv_out = linear_func(input, attn_qkvw, attn_qkvb) - else: - qkv_func = inference_cuda_module.qkv_gemm_fp16 if config.fp16 else \ - inference_cuda_module.qkv_gemm_fp32 - qkv_out = qkv_func(input, - attn_qkvw, - (attn_qkvb if attn_qkvb is not None else norm_b), - norm_w, - norm_b, - config.epsilon, - (attn_qkvb is not None)) - - context_layer, key_layer, value_layer = compute_attention(qkv_out[0] if isinstance(qkv_out, list) else qkv_out, input_mask) - output = vector_matmul_func(context_layer, attn_ow, False) - #print(f'[{torch.distributed.get_rank()}] {config.layer_id}: oooooo -> {output.norm()}') - - return output, key_layer, value_layer, context_layer, qkv_out[-1] # attn_out, present_key, present_value, context_output, inp_norm 
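For context on the Python fallback being deleted here: `compute_attention` splits the fused QKV projection into equal thirds along the last dimension and then exposes the head dimension before computing attention scores. A simplified sketch of that split, not an exact reproduction of the removed kernel path (shapes and helper name are illustrative):

```python
import torch

def split_fused_qkv(qkv_out: torch.Tensor, num_heads: int):
    # qkv_out: [batch, seq, 3 * hidden] -> three [batch, heads, seq, head_dim] tensors.
    query, key, value = torch.split(qkv_out, qkv_out.shape[-1] // 3, dim=-1)

    def to_heads(x: torch.Tensor) -> torch.Tensor:
        b, s, h = x.shape
        return x.view(b, s, num_heads, h // num_heads).permute(0, 2, 1, 3).contiguous()

    return to_heads(query), to_heads(key), to_heads(value)
```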
- - def selfAttention_int8(): - if not config.pre_layer_norm: - qkv_out = inference_cuda_module.linear_layer_int8( - input, - attn_qkvw, - attn_qkvb, - q_scales[0], - (q_groups * (3 if qkv_merging else 1) * (2**merge_count))) - - else: - qkv_out = inference_cuda_module.qkv_gemm_int8( - input, - attn_qkvw, - attn_qkvb, - norm_w, - norm_b, - config.epsilon, - q_scales[0], - (q_groups * (3 if qkv_merging else 1) * (2**merge_count)), - (attn_qkvb is not None)) - context_layer, key_layer, value_layer = compute_attention(qkv_out) - output = inference_cuda_module.vector_matmul_int8(context_layer, - attn_ow, - q_scales[1], - q_groups, - (merge_count)) - return output, key_layer, value_layer, context_layer - - if config.q_int8: - output, key_layer, value_layer, context_layer = selfAttention_int8() - else: - output, key_layer, value_layer, context_layer, inp_norm = selfAttention_fp() - if config.mlp_after_attn and mp_group is not None and dist.get_world_size( - group=mp_group) > 1: - dist.all_reduce(output, group=mp_group) - - return (output, key_layer, value_layer, context_layer, inp_norm) - - @staticmethod - def backward(ctx, grad_output, grad_output1, grad_output2, grad_output3): - raise RuntimeError('You are running with DeepSpeed Inference mode. \ - Please switch to Training mode for running backward!') - - -class DeepSpeedSelfAttention(nn.Module): - num_layers = 0 - - def __init__(self, - config, - mp_group=None, - q_scales=None, - q_groups=1, - merge_count=1, - qkv_merging=False): - super(DeepSpeedSelfAttention, self).__init__() - self.config = config - self.config.layer_id = DeepSpeedSelfAttention.num_layers - DeepSpeedSelfAttention.num_layers = DeepSpeedSelfAttention.num_layers + 1 - self.attn_qkvw = nn.Parameter( - torch.Tensor(self.config.hidden_size, - (self.config.hidden_size // self.config.mp_size) * 3)) - self.attn_qkvb = nn.Parameter( - torch.Tensor((self.config.hidden_size // self.config.mp_size) * 3)) - - self.attn_ow = nn.Parameter( - torch.Tensor(self.config.hidden_size // self.config.mp_size, - self.config.hidden_size)) - - self.attn_ob = nn.Parameter(torch.Tensor(self.config.hidden_size)) - - self.num_attention_heads_per_partition = self.config.heads // self.config.mp_size - self.hidden_size_per_partition = self.config.hidden_size // self.config.mp_size - self.hidden_size_per_attention_head = self.config.hidden_size // self.config.heads - - self.mp_group = mp_group - - # used for quantization - self.q_scales = q_scales - self.q_groups = q_groups - self.merge_count = int(math.log2(merge_count)) - - self.norm_factor = math.sqrt( - math.sqrt(self.config.hidden_size // self.config.heads)) - self.qkv_merging = qkv_merging - - def forward(self, - input, - input_mask, - head_mask=None, - layer_past=None, - get_present=False, - encoder_hidden_states=None, - encoder_attention_mask=None, - output_attentions=False, - norm_w=None, - norm_b=None): - output = DeepSpeedSelfAttentionFunction.apply( - input, - input_mask, - head_mask, - layer_past, - get_present, - encoder_hidden_states, - encoder_attention_mask, - output_attentions, - norm_w, - norm_b, - self.config, - self.attn_qkvw, - self.attn_qkvb, - self.num_attention_heads_per_partition, - self.norm_factor, - self.hidden_size_per_partition, - self.attn_ow, - self.attn_ob, - self.mp_group, - self.q_scales, - self.q_groups, - self.merge_count, - self.qkv_merging) - - return output - - -class DeepSpeedMLPFunction(Function): - @staticmethod - def forward(ctx, - input, - residual, - residual_norm, - bias, - inter_w, - inter_b, - attn_nw, - 
attn_nb, - config, - mp_group, - output_b, - output_w, - q_scales, - q_groups, - merge_count, - mlp_gemm_func, - fused_gemm_gelu, - vector_matmul_func, - bias_residual_func): - - if config.q_int8: - (intermediate, - residual_add) = inference_cuda_module.mlp_gemm_int8( - input, - residual, - bias, - inter_w, - inter_b, - attn_nw, - attn_nb, - config.epsilon, - q_scales[2], - (q_groups * (2**merge_count)), - config.pre_layer_norm) - output = inference_cuda_module.vector_matmul_int8(intermediate, - output_w, - q_scales[3], - q_groups, - (merge_count)) - else: - if attn_nw is None: - output = fused_gemm_gelu(residual_norm, - inter_w, - inter_b, - output_w, - config.epsilon, - config.pre_layer_norm, - False) - else: - intermediate = mlp_gemm_func(input, - residual, - bias, - inter_w, - inter_b, - attn_nw, - attn_nb, - config.epsilon, - config.pre_layer_norm, - config.mlp_after_attn) - output = vector_matmul_func(intermediate, output_w, False) - inference_cuda_module.residual_add(output, - residual, - input, - output_b, - bias, - config.mp_size, - config.mlp_after_attn) - if mp_group is not None and dist.get_world_size(group=mp_group) > 1: - dist.all_reduce(output, group=mp_group) - return output - - @staticmethod - def backward(ctx, grad_output): - raise RuntimeError('You are running with DeepSpeed Inference mode. \ - Please switch to Training mode for running backward!') - - -class DeepSpeedMLP(nn.Module): - def __init__(self, - config, - mp_group=None, - q_scales=None, - q_groups=1, - merge_count=1, - mlp_extra_grouping=False): - super(DeepSpeedMLP, self).__init__() - - self.config = config - self.attn_nw = nn.Parameter(torch.Tensor(self.config.hidden_size)) - self.attn_nb = nn.Parameter(torch.Tensor(self.config.hidden_size)) - self.inter_w = nn.Parameter( - torch.Tensor(self.config.hidden_size, - self.config.intermediate_size // self.config.mp_size)) - self.inter_b = nn.Parameter( - torch.Tensor(self.config.intermediate_size // self.config.mp_size)) - self.output_w = nn.Parameter( - torch.Tensor((self.config.intermediate_size // self.config.mp_size), - self.config.hidden_size)) - self.output_b = nn.Parameter(torch.Tensor(self.config.hidden_size)) - - # used for quantization - self.q_scales = q_scales - self.q_groups = q_groups * 2 if mlp_extra_grouping else q_groups - self.merge_count = int(math.log2(merge_count)) - - self.mp_group = mp_group - self.mlp_gemm_func = inference_cuda_module.mlp_gemm_fp16 if config.fp16 else \ - inference_cuda_module.mlp_gemm_fp32 - self.vector_matmul_func = inference_cuda_module.vector_matmul_fp16 if config.fp16 else \ - inference_cuda_module.vector_matmul_fp32 - self.fused_gemm_gelu = inference_cuda_module.fused_gemm_gelu_fp16 if config.fp16 else \ - inference_cuda_module.fused_gemm_gelu_fp32 - - self.bias_residual_func = inference_cuda_module.bias_residual_fp16 if config.fp16 or config.q_int8 else \ - inference_cuda_module.bias_residual_fp32 - - def forward(self, input, residual, residual_norm, bias): - return DeepSpeedMLPFunction.apply(input, - residual, - residual_norm, - bias, - self.inter_w, - self.inter_b, - self.attn_nw, - self.attn_nb, - self.config, - self.mp_group, - self.output_b, - self.output_w, - self.q_scales, - self.q_groups, - self.merge_count, - self.mlp_gemm_func, - self.fused_gemm_gelu, - self.vector_matmul_func, - self.bias_residual_func) - - -class DeepSpeedTransformerInference(nn.Module): - """Initialize the DeepSpeed Transformer Layer. - Arguments: - layer_id: The layer index starting from 0, e.g. 
if model has 24 transformer layers, - layer_id will be 0,1,2...23 when each layer object is instantiated - config: An object of DeepSpeedInferenceConfig - mp_group: Model parallelism group initialized on the modeling side. - quantize_scales: This argument groups all the layers' scales used for quantization - quantize_groups: Number of groups used for quantizing the model - merge_count: Shows the number of model-parallel checkpoints merged before running inference. - We use this argument to control the quantization scale for the model parameters if a bigger - quantize-grouping than 1 is used. - mlp_extra_grouping: This flag is used to show a 2x higher number of groups used for the MLP part - of a Transformer layer. We use this feature for quantization to reduce the convergence impact - for specific downstream tasks. - """ - layer_id = 0 - - def __init__(self, - config, - mp_group=None, - quantize_scales=None, - quantize_groups=1, - merge_count=1, - mlp_extra_grouping=False, - qkv_merging=False): - super(DeepSpeedTransformerInference, self).__init__() - - self.config = config - self.config.layer_id = DeepSpeedTransformerInference.layer_id - DeepSpeedTransformerInference.layer_id += 1 - - global inference_cuda_module - if inference_cuda_module is None: - builder = op_builder.InferenceBuilder() - inference_cuda_module = builder.load() - - print("DeepSpeed Transformer Inference config is ", self.config.__dict__) - - self.attention = DeepSpeedSelfAttention(self.config, - mp_group, - quantize_scales, - quantize_groups, - merge_count, - qkv_merging) - self.mlp = DeepSpeedMLP(self.config, - mp_group, - quantize_scales, - quantize_groups, - merge_count, - mlp_extra_grouping) - - self.norm_w = nn.Parameter(torch.Tensor(self.config.hidden_size)) - self.norm_b = nn.Parameter(torch.Tensor(self.config.hidden_size)) - self.layer_past = None - - def forward(self, - input, - input_mask=None, - attention_mask=None, - head_mask=None, - layer_past=None, - get_key_value=False, - get_present=False, - encoder_output=None, - enc_dec_attn_mask=None, - encoder_hidden_states=None, - encoder_attention_mask=None, - use_cache=False, - output_attentions=False): - get_present = (get_present or get_key_value or use_cache) - input_mask = input_mask if attention_mask is None else attention_mask - layer_past = layer_past if layer_past is not None else self.layer_past - - attn_mask = None - if isinstance(input, tuple): - attn_mask = input[1] - input = input[0] - input_type = input.dtype - - if (self.config.fp16 or self.config.q_int8) \ - and input.dtype == torch.float: - input = input.half() - - with torch.no_grad(): - attention_output, key, value, context_outputtn_ctx, inp_norm = \ - self.attention(input, - input_mask, - head_mask, - layer_past, - get_present, - encoder_hidden_states, - encoder_attention_mask, - output_attentions, - self.norm_w, - self.norm_b) - presents = (key, value) - self.layer_past = presents - - output = self.mlp(attention_output, input, inp_norm, self.attention.attn_ob) - - if not self.config.pre_layer_norm: - ds_layernorm = inference_cuda_module.layer_norm_fp16 if self.config.fp16 or self.config.q_int8 else \ - inference_cuda_module.layer_norm_fp32 - output = ds_layernorm(output, - self.norm_w, - self.norm_b, - self.config.epsilon) - - output = output.to(input_type) - #print(f'[{torch.distributed.get_rank()}] {self.config.layer_id}: {output.norm()}') - #exit() - if get_present: - output = (output, presents) - - if self.config.return_tuple: - return output if type(output) is tuple else (output, 
attn_mask) - else: - return output diff --git a/deepspeed/ops/transformer/inference/triton_ops.py b/deepspeed/ops/transformer/inference/triton_ops.py index 45c6db66d1078f9f5442807a6d4f757ad12adec6..0c9c53ab1de19f71af4af2ff46e60307ec206cc9 100644 --- a/deepspeed/ops/transformer/inference/triton_ops.py +++ b/deepspeed/ops/transformer/inference/triton_ops.py @@ -1,4 +1,7 @@ -'''Copyright The Microsoft DeepSpeed Team''' +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team """ Inspired by original Triton implementation: https://github.com/openai/triton/blob/b244db06da24a87453a40ad35b085ee37dac3705/python/tutorials/06-fused-attention.py @@ -102,6 +105,7 @@ def _fwd_kernel( class triton_flash_attn(torch.nn.Module): + def __init__(self, ): super(triton_flash_attn, self).__init__() @@ -111,10 +115,7 @@ class triton_flash_attn(torch.nn.Module): Lq, Lk, Lv = q.shape[-1], k.shape[-1], v.shape[-1] o = torch.empty_like(q) grid = (triton.cdiv(q.shape[2], BLOCK), q.shape[0] * q.shape[1]) - tmp = torch.empty((q.shape[0] * q.shape[1], - q.shape[2]), - device=q.device, - dtype=torch.float32) + tmp = torch.empty((q.shape[0] * q.shape[1], q.shape[2]), device=q.device, dtype=torch.float32) num_warps = 4 if Lk <= 64 else 8 _fwd_kernel[grid]( diff --git a/deepspeed/ops/transformer/transformer.py b/deepspeed/ops/transformer/transformer.py index ae627304f02bbdd1d90f2859a20f2823154dc5db..bfd4d60dcb1ceafff2d87b62a78a8cf480ab9448 100755 --- a/deepspeed/ops/transformer/transformer.py +++ b/deepspeed/ops/transformer/transformer.py @@ -1,6 +1,8 @@ -''' -Copyright 2020 The Microsoft DeepSpeed Team -''' +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + import json import math import torch @@ -15,15 +17,9 @@ stochastic_transformer_cuda_module = None class TransformerConfig(): - def __init__(self, - batch_size, - hidden_size, - intermediate_size, - heads, - attn_dropout_ratio, - hidden_dropout_ratio, - num_hidden_layers, - initializer_range): + + def __init__(self, batch_size, hidden_size, intermediate_size, heads, attn_dropout_ratio, hidden_dropout_ratio, + num_hidden_layers, initializer_range): self.layer_id = -1 self.batch_size = batch_size self.hidden_size = hidden_size @@ -89,6 +85,7 @@ class DeepSpeedTransformerConfig(TransformerConfig): training: Enable for training rather than inference. 
""" + def __init__(self, batch_size=-1, hidden_size=-1, @@ -111,15 +108,9 @@ class DeepSpeedTransformerConfig(TransformerConfig): return_tuple=False, training=True): super(DeepSpeedTransformerConfig, - self).__init__( - batch_size, - hidden_size, - (intermediate_size if intermediate_size > 0 else 4 * hidden_size), - heads, - attn_dropout_ratio, - hidden_dropout_ratio, - num_hidden_layers, - initializer_range) + self).__init__(batch_size, hidden_size, + (intermediate_size if intermediate_size > 0 else 4 * hidden_size), heads, + attn_dropout_ratio, hidden_dropout_ratio, num_hidden_layers, initializer_range) self.fp16 = fp16 self.pre_layer_norm = pre_layer_norm self.local_rank = local_rank @@ -150,97 +141,42 @@ class DeepSpeedTransformerConfig(TransformerConfig): class DeepSpeedTransformerFunction(Function): + @staticmethod - def forward(ctx, - input, - input_mask, - self, - grads, - layer_id, - attn_qkvw, - attn_qkvb, - attn_ow, - attn_ob, - attn_nw, - attn_nb, - inter_w, - inter_b, - output_w, - output_b, - norm_w, - norm_b, - config): + def forward(ctx, input, input_mask, self, grads, layer_id, attn_qkvw, attn_qkvb, attn_ow, attn_ob, attn_nw, + attn_nb, inter_w, inter_b, output_w, output_b, norm_w, norm_b, config): cuda_module = stochastic_transformer_cuda_module if config.stochastic_mode else transformer_cuda_module forward_func = cuda_module.forward_fp16 if config.fp16 else cuda_module.forward_fp32 inp_size = input.size() if inp_size[1] % 16 != 0: - input = torch.cat((input, - torch.randn((inp_size[0], - (16 - (inp_size[1] % 16)), - inp_size[2]), - device=input.device, - dtype=input.dtype)), - 1) + input = torch.cat( + (input, + torch.randn( + (inp_size[0], (16 - (inp_size[1] % 16)), inp_size[2]), device=input.device, dtype=input.dtype)), + 1) input_mask = torch.cat((input_mask, torch.ones((inp_size[0], input_mask.shape[1], input_mask.shape[2], \ (16 - (inp_size[1] % 16))), device=input_mask.device, dtype=input_mask.dtype) * -10000), 3) - (output, - inp_norm, - qkv_tf, - soft_inp, - ctx_bufB, - attn_o_inp, - add_res, - ff1_inp, - gelu_inp, - ff2_inp, - attn_prob_dropout_mask, - attn_output_dropout_mask, - layer_output_dropout_mask, - attn_layer_norm_var, - attn_layer_norm_mean, - layer_norm_var, - layer_norm_mean) = forward_func(config.layer_id, - input, - input_mask, - attn_qkvw, - attn_qkvb, - attn_ow, - attn_ob, - attn_nw, - attn_nb, - inter_w, - inter_b, - output_w, - output_b, - norm_w, - norm_b, - config.training and config.is_grad_enabled, - config.pre_layer_norm, - config.attn_dropout_checkpoint, - config.normalize_invertible, - config.gelu_checkpoint) + (output, inp_norm, qkv_tf, soft_inp, ctx_bufB, attn_o_inp, add_res, ff1_inp, gelu_inp, ff2_inp, + attn_prob_dropout_mask, attn_output_dropout_mask, layer_output_dropout_mask, attn_layer_norm_var, + attn_layer_norm_mean, layer_norm_var, layer_norm_mean) = forward_func( + config.layer_id, input, input_mask, attn_qkvw, attn_qkvb, attn_ow, attn_ob, attn_nw, attn_nb, inter_w, + inter_b, output_w, output_b, norm_w, norm_b, config.training and config.is_grad_enabled, + config.pre_layer_norm, config.attn_dropout_checkpoint, config.normalize_invertible, + config.gelu_checkpoint) # For testing only. 
if grads is not None: for i in [2]: - attn_qkvw.register_hook( - lambda x, - i=i, - self=self: grads.append([ - x[i * attn_ow.size(0):(i + 1) * attn_ow.size(0)], - ("Q_W" if i == 0 else "K_W" if i == 1 else "V_W") - ])) + attn_qkvw.register_hook(lambda x, i=i, self=self: grads.append([ + x[i * attn_ow.size(0):(i + 1) * attn_ow.size(0)], ("Q_W" if i == 0 else "K_W" if i == 1 else "V_W") + ])) for i in [2]: - attn_qkvb.register_hook( - lambda x, - i=i, - self=self: grads.append([ - x[i * attn_ow.size(0):(i + 1) * attn_ow.size(0)], - ("Q_B" if i == 0 else "K_B" if i == 1 else "V_B") - ])) + attn_qkvb.register_hook(lambda x, i=i, self=self: grads.append([ + x[i * attn_ow.size(0):(i + 1) * attn_ow.size(0)], ("Q_B" if i == 0 else "K_B" if i == 1 else "V_B") + ])) attn_ow.register_hook(lambda x, self=self: grads.append([x, "O_W"])) attn_ob.register_hook(lambda x, self=self: grads.append([x, "O_B"])) @@ -255,35 +191,11 @@ class DeepSpeedTransformerFunction(Function): if config.is_grad_enabled and config.training: if (config.pre_layer_norm and config.normalize_invertible): - ctx.save_for_backward(input_mask, - attn_qkvw, - attn_qkvb, - attn_ow, - attn_ob, - attn_nw, - attn_nb, - inter_w, - inter_b, - output_w, - output_b, - norm_w, - norm_b) + ctx.save_for_backward(input_mask, attn_qkvw, attn_qkvb, attn_ow, attn_ob, attn_nw, attn_nb, inter_w, + inter_b, output_w, output_b, norm_w, norm_b) else: - ctx.save_for_backward(output, - input, - input_mask, - attn_qkvw, - attn_qkvb, - attn_ow, - attn_ob, - attn_nw, - attn_nb, - inter_w, - inter_b, - output_w, - output_b, - norm_w, - norm_b) + ctx.save_for_backward(output, input, input_mask, attn_qkvw, attn_qkvb, attn_ow, attn_ob, attn_nw, + attn_nb, inter_w, inter_b, output_w, output_b, norm_w, norm_b) ctx.config = config if (config.pre_layer_norm or not config.normalize_invertible): @@ -331,88 +243,28 @@ class DeepSpeedTransformerFunction(Function): assert ctx.config.training if (ctx.config.pre_layer_norm and ctx.config.normalize_invertible): - (input_mask, - attn_qkvw, - attn_qkvb, - attn_ow, - attn_ob, - attn_nw, - attn_nb, - inter_w, - inter_b, - output_w, - output_b, - norm_w, - norm_b) = ctx.saved_tensors + (input_mask, attn_qkvw, attn_qkvb, attn_ow, attn_ob, attn_nw, attn_nb, inter_w, inter_b, output_w, + output_b, norm_w, norm_b) = ctx.saved_tensors else: - (output, - input, - input_mask, - attn_qkvw, - attn_qkvb, - attn_ow, - attn_ob, - attn_nw, - attn_nb, - inter_w, - inter_b, - output_w, - output_b, - norm_w, - norm_b) = ctx.saved_tensors + (output, input, input_mask, attn_qkvw, attn_qkvb, attn_ow, attn_ob, attn_nw, attn_nb, inter_w, inter_b, + output_w, output_b, norm_w, norm_b) = ctx.saved_tensors cuda_module = stochastic_transformer_cuda_module if ctx.config.stochastic_mode else transformer_cuda_module backward_func = cuda_module.backward_fp16 if ctx.config.fp16 else cuda_module.backward_fp32 - (grad_input, - grad_attn_qkvw, - grad_attn_qkvb, - grad_attn_ow, - grad_attn_ob, - grad_attn_nw, - grad_attn_nb, - grad_inter_w, - grad_inter_b, - grad_output_w, - grad_output_b, - grad_norm_w, - grad_norm_b) = backward_func( - ctx.config.layer_id, - grad_output, - (ctx.inp_norm if (ctx.config.pre_layer_norm - and ctx.config.normalize_invertible) else output), - (ctx.inp_norm if (ctx.config.pre_layer_norm - or not ctx.config.normalize_invertible) else input), - ctx.qkv_tf, - ctx.soft_inp, - (ctx.soft_inp if ctx.config.attn_dropout_checkpoint else ctx.ctx_bufB), - ctx.attn_o_inp, - (ctx.ff1_inp if ctx.config.normalize_invertible else ctx.add_res), - 
ctx.ff1_inp, - (ctx.ff2_inp if ctx.config.gelu_checkpoint else ctx.gelu_inp), - ctx.ff2_inp, - ctx.attn_prob_dropout_mask, - ctx.attn_output_dropout_mask, - ctx.layer_output_dropout_mask, - ctx.attn_layer_norm_var, - ctx.attn_layer_norm_mean, - ctx.layer_norm_var, - ctx.layer_norm_mean, - (ctx.inp_norm if (ctx.config.pre_layer_norm - and ctx.config.normalize_invertible) else input), - input_mask, - attn_qkvw, - attn_qkvb, - attn_ow, - attn_ob, - attn_nw, - attn_nb, - inter_w, - inter_b, - output_w, - output_b, - norm_w, - norm_b) + (grad_input, grad_attn_qkvw, grad_attn_qkvb, grad_attn_ow, grad_attn_ob, grad_attn_nw, grad_attn_nb, + grad_inter_w, grad_inter_b, grad_output_w, grad_output_b, grad_norm_w, grad_norm_b) = backward_func( + ctx.config.layer_id, grad_output, + (ctx.inp_norm if (ctx.config.pre_layer_norm and ctx.config.normalize_invertible) else output), + (ctx.inp_norm if (ctx.config.pre_layer_norm or not ctx.config.normalize_invertible) else input), + ctx.qkv_tf, ctx.soft_inp, (ctx.soft_inp if ctx.config.attn_dropout_checkpoint else ctx.ctx_bufB), + ctx.attn_o_inp, (ctx.ff1_inp if ctx.config.normalize_invertible else ctx.add_res), ctx.ff1_inp, + (ctx.ff2_inp if ctx.config.gelu_checkpoint else ctx.gelu_inp), ctx.ff2_inp, ctx.attn_prob_dropout_mask, + ctx.attn_output_dropout_mask, ctx.layer_output_dropout_mask, ctx.attn_layer_norm_var, + ctx.attn_layer_norm_mean, ctx.layer_norm_var, ctx.layer_norm_mean, + (ctx.inp_norm if + (ctx.config.pre_layer_norm and ctx.config.normalize_invertible) else input), input_mask, attn_qkvw, + attn_qkvb, attn_ow, attn_ob, attn_nw, attn_nb, inter_w, inter_b, output_w, output_b, norm_w, norm_b) # This appears to be an effective way to release context memory ctx.qkv_tf = None @@ -436,24 +288,9 @@ class DeepSpeedTransformerFunction(Function): if grad_output_shape[1] % 16 != 0: grad_input = torch.narrow(grad_input, 1, 0, grad_output_shape[1]) - return (grad_input, - None, - None, - None, - None, - grad_attn_qkvw, - grad_attn_qkvb, - grad_attn_ow, - grad_attn_ob, - grad_attn_nw, - grad_attn_nb, - grad_inter_w, - grad_inter_b, - grad_output_w, - grad_output_b, - grad_norm_w, - grad_norm_b, - None) + return (grad_input, None, None, None, None, grad_attn_qkvw, grad_attn_qkvb, grad_attn_ow, grad_attn_ob, + grad_attn_nw, grad_attn_nb, grad_inter_w, grad_inter_b, grad_output_w, grad_output_b, grad_norm_w, + grad_norm_b, None) class DeepSpeedTransformerLayer(nn.Module): @@ -484,23 +321,15 @@ class DeepSpeedTransformerLayer(nn.Module): get_accelerator().set_device(self.config.local_rank) if initial_weights is None and initial_biases is None: - self.attn_qkvw = nn.Parameter( - torch.Tensor(self.config.hidden_size * 3, - self.config.hidden_size)) + self.attn_qkvw = nn.Parameter(torch.Tensor(self.config.hidden_size * 3, self.config.hidden_size)) self.attn_qkvb = nn.Parameter(torch.Tensor(self.config.hidden_size * 3)) - self.attn_ow = nn.Parameter( - torch.Tensor(self.config.hidden_size, - self.config.hidden_size)) + self.attn_ow = nn.Parameter(torch.Tensor(self.config.hidden_size, self.config.hidden_size)) self.attn_ob = nn.Parameter(torch.Tensor(self.config.hidden_size)) self.attn_nw = nn.Parameter(torch.Tensor(self.config.hidden_size)) self.attn_nb = nn.Parameter(torch.Tensor(self.config.hidden_size)) - self.inter_w = nn.Parameter( - torch.Tensor(self.config.intermediate_size, - self.config.hidden_size)) + self.inter_w = nn.Parameter(torch.Tensor(self.config.intermediate_size, self.config.hidden_size)) self.inter_b = 
nn.Parameter(torch.Tensor(self.config.intermediate_size)) - self.output_w = nn.Parameter( - torch.Tensor(self.config.hidden_size, - self.config.intermediate_size)) + self.output_w = nn.Parameter(torch.Tensor(self.config.hidden_size, self.config.intermediate_size)) self.output_b = nn.Parameter(torch.Tensor(self.config.hidden_size)) self.norm_w = nn.Parameter(torch.Tensor(self.config.hidden_size)) self.norm_b = nn.Parameter(torch.Tensor(self.config.hidden_size)) @@ -539,21 +368,11 @@ class DeepSpeedTransformerLayer(nn.Module): cuda_module = stochastic_transformer_cuda_module if self.config.stochastic_mode else transformer_cuda_module create_layer_func = cuda_module.create_transformer_layer_fp16 if self.config.fp16 else cuda_module.create_transformer_layer_fp32 - create_layer_func(self.config.layer_id, - self.config.batch_size, - self.config.hidden_size, - self.config.heads, - self.config.intermediate_size, - self.config.attn_dropout_ratio, - self.config.hidden_dropout_ratio, - self.config.layer_norm_eps, - self.config.seed, - self.config.pre_layer_norm, - self.config.test_gemm, - self.config.attn_dropout_checkpoint, - self.config.normalize_invertible, - self.config.gelu_checkpoint, - self.config.stochastic_mode) + create_layer_func(self.config.layer_id, self.config.batch_size, self.config.hidden_size, self.config.heads, + self.config.intermediate_size, self.config.attn_dropout_ratio, + self.config.hidden_dropout_ratio, self.config.layer_norm_eps, self.config.seed, + self.config.pre_layer_norm, self.config.test_gemm, self.config.attn_dropout_checkpoint, + self.config.normalize_invertible, self.config.gelu_checkpoint, self.config.stochastic_mode) def init_transformer_weights(self, adjust_init_range=False): num_layers = self.config.num_hidden_layers @@ -587,21 +406,7 @@ class DeepSpeedTransformerLayer(nn.Module): grads=None): self.config.is_grad_enabled = torch.is_grad_enabled() self.config.training = self.training - return DeepSpeedTransformerFunction.apply(hidden_states, - attention_mask, - self, - grads, - self.config.layer_id, - self.attn_qkvw, - self.attn_qkvb, - self.attn_ow, - self.attn_ob, - self.attn_nw, - self.attn_nb, - self.inter_w, - self.inter_b, - self.output_w, - self.output_b, - self.norm_w, - self.norm_b, - self.config) + return DeepSpeedTransformerFunction.apply(hidden_states, attention_mask, self, grads, self.config.layer_id, + self.attn_qkvw, self.attn_qkvb, self.attn_ow, self.attn_ob, + self.attn_nw, self.attn_nb, self.inter_w, self.inter_b, + self.output_w, self.output_b, self.norm_w, self.norm_b, self.config) diff --git a/deepspeed/pipe/__init__.py b/deepspeed/pipe/__init__.py index bbabf4feb7e03df2e5fe587ba5870ccd2cc57101..f25471713b4568b43216fe16786a46904575ae5e 100644 --- a/deepspeed/pipe/__init__.py +++ b/deepspeed/pipe/__init__.py @@ -1,3 +1,6 @@ -'''Copyright The Microsoft DeepSpeed Team''' +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team from ..runtime.pipe import PipelineModule, LayerSpec, TiedLayerSpec diff --git a/deepspeed/profiling/__init__.py b/deepspeed/profiling/__init__.py index fcb45ab2b68516814a4bfbffebf2e01cbfefd527..6c5067f71c8faf166bc78e88f9b62e8627dda7c7 100644 --- a/deepspeed/profiling/__init__.py +++ b/deepspeed/profiling/__init__.py @@ -1 +1,5 @@ +# Copyright (c) Microsoft Corporation. 
+# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team '''Copyright The Microsoft DeepSpeed Team''' diff --git a/deepspeed/profiling/config.py b/deepspeed/profiling/config.py index c22cd453fcba7fc8e67b07d5b6ce70f6f3c1dd36..7533fc299f0e11448526c60c8d4f5894726cf963 100644 --- a/deepspeed/profiling/config.py +++ b/deepspeed/profiling/config.py @@ -1,14 +1,14 @@ -'''Copyright The Microsoft DeepSpeed Team''' -""" -Copyright (c) Microsoft Corporation -Licensed under the MIT license. -""" +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team from deepspeed.runtime.config_utils import get_scalar_param, DeepSpeedConfigObject from deepspeed.profiling.constants import * class DeepSpeedFlopsProfilerConfig(DeepSpeedConfigObject): + def __init__(self, param_dict): super(DeepSpeedFlopsProfilerConfig, self).__init__() @@ -25,26 +25,18 @@ class DeepSpeedFlopsProfilerConfig(DeepSpeedConfigObject): self._initialize(flops_profiler_dict) def _initialize(self, flops_profiler_dict): - self.enabled = get_scalar_param(flops_profiler_dict, - FLOPS_PROFILER_ENABLED, - FLOPS_PROFILER_ENABLED_DEFAULT) + self.enabled = get_scalar_param(flops_profiler_dict, FLOPS_PROFILER_ENABLED, FLOPS_PROFILER_ENABLED_DEFAULT) - self.profile_step = get_scalar_param(flops_profiler_dict, - FLOPS_PROFILER_PROFILE_STEP, + self.profile_step = get_scalar_param(flops_profiler_dict, FLOPS_PROFILER_PROFILE_STEP, FLOPS_PROFILER_PROFILE_STEP_DEFAULT) - self.module_depth = get_scalar_param(flops_profiler_dict, - FLOPS_PROFILER_MODULE_DEPTH, + self.module_depth = get_scalar_param(flops_profiler_dict, FLOPS_PROFILER_MODULE_DEPTH, FLOPS_PROFILER_MODULE_DEPTH_DEFAULT) - self.top_modules = get_scalar_param(flops_profiler_dict, - FLOPS_PROFILER_TOP_MODULES, + self.top_modules = get_scalar_param(flops_profiler_dict, FLOPS_PROFILER_TOP_MODULES, FLOPS_PROFILER_TOP_MODULES_DEFAULT) - self.detailed = get_scalar_param(flops_profiler_dict, - FLOPS_PROFILER_DETAILED, - FLOPS_PROFILER_DETAILED_DEFAULT) + self.detailed = get_scalar_param(flops_profiler_dict, FLOPS_PROFILER_DETAILED, FLOPS_PROFILER_DETAILED_DEFAULT) - self.output_file = get_scalar_param(flops_profiler_dict, - FLOPS_PROFILER_OUTPUT_FILE, + self.output_file = get_scalar_param(flops_profiler_dict, FLOPS_PROFILER_OUTPUT_FILE, FLOPS_PROFILER_OUTPUT_FILE_DEFAULT) diff --git a/deepspeed/profiling/constants.py b/deepspeed/profiling/constants.py index 162f1d3e7f2face649f088065ffeb42aa1452633..e16baea27deded355e170f2b927ebd96aeb8b74c 100644 --- a/deepspeed/profiling/constants.py +++ b/deepspeed/profiling/constants.py @@ -1,8 +1,7 @@ -'''Copyright The Microsoft DeepSpeed Team''' -""" -Copyright (c) Microsoft Corporation -Licensed under the MIT license. -""" +# Copyright (c) Microsoft Corporation. 
+# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team ######################################### # flops profiler diff --git a/deepspeed/profiling/flops_profiler/README.md b/deepspeed/profiling/flops_profiler/README.md index 6d749c5df81736b6c34d01dd0407a2625599babb..af23d56ee76ab677932c37a9045eefdbe816ef85 100644 --- a/deepspeed/profiling/flops_profiler/README.md +++ b/deepspeed/profiling/flops_profiler/README.md @@ -309,8 +309,9 @@ The following example shows how to profile AlexNet using the DeepSpeed flops pro import torchvision.models as models import torch from deepspeed.profiling.flops_profiler import get_model_profile +from deepspeed.accelerator import get_accelerator -with torch.cuda.device(0): +with get_accelerator().device(0): model = models.alexnet() batch_size = 256 flops, macs, params = get_model_profile(model=model, # model @@ -334,6 +335,7 @@ from functools import partial import torch from transformers import BertForSequenceClassification, BertTokenizer from deepspeed.profiling.flops_profiler import get_model_profile +from deepspeed.accelerator import get_accelerator def bert_input_constructor(batch_size, seq_len, tokenizer): @@ -350,7 +352,7 @@ def bert_input_constructor(batch_size, seq_len, tokenizer): return inputs -with torch.cuda.device(0): +with get_accelerator().device(0): tokenizer = BertTokenizer.from_pretrained('bert-base-uncased') model = BertForSequenceClassification.from_pretrained('bert-base-uncased') batch_size = 4 diff --git a/deepspeed/profiling/flops_profiler/__init__.py b/deepspeed/profiling/flops_profiler/__init__.py index 7454821bc1ccfde04924ef3c04642e3b952409e9..980d8a5e92fc4c3491f822479d05c5e2776233df 100644 --- a/deepspeed/profiling/flops_profiler/__init__.py +++ b/deepspeed/profiling/flops_profiler/__init__.py @@ -1,3 +1,6 @@ -'''Copyright The Microsoft DeepSpeed Team''' +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team from .profiler import * diff --git a/deepspeed/profiling/flops_profiler/profiler.py b/deepspeed/profiling/flops_profiler/profiler.py index b6684f6978c370204b04cb09f1946c3b3b794f27..f39f25ce87b1a3b1ce7e920bc0dba879e8cae4a4 100644 --- a/deepspeed/profiling/flops_profiler/profiler.py +++ b/deepspeed/profiling/flops_profiler/profiler.py @@ -1,4 +1,7 @@ -'''Copyright The Microsoft DeepSpeed Team''' +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team import time import torch @@ -53,6 +56,7 @@ class FlopsProfiler(object): Args: object (torch.nn.Module): The PyTorch model to profile. 
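The README hunks above switch the profiling examples from `torch.cuda.device(0)` to the accelerator abstraction. A condensed version of the AlexNet example as it reads after this change; keyword arguments other than `model` and `input_shape` are elided in the hunk, so only those two are shown here:

```python
import torchvision.models as models
from deepspeed.profiling.flops_profiler import get_model_profile
from deepspeed.accelerator import get_accelerator

with get_accelerator().device(0):
    model = models.alexnet()
    # Profile a forward pass on a batch of 256 ImageNet-sized inputs.
    flops, macs, params = get_model_profile(model=model, input_shape=(256, 3, 224, 224))
```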
""" + def __init__(self, model, ds_engine=None): self.model = model self.ds_engine = ds_engine @@ -78,8 +82,7 @@ class FlopsProfiler(object): # if computing the flops of a module directly if type(module) in MODULE_HOOK_MAPPING: if not hasattr(module, "__flops_handle__"): - module.__flops_handle__ = module.register_forward_hook( - MODULE_HOOK_MAPPING[type(module)]) + module.__flops_handle__ = module.register_forward_hook(MODULE_HOOK_MAPPING[type(module)]) return # if computing the flops of the functionals in a module @@ -105,16 +108,14 @@ class FlopsProfiler(object): module.__start_time__ = time.time() if not hasattr(module, "__start_time_hook_handle"): - module.__start_time_hook_handle__ = module.register_forward_pre_hook( - start_time_hook) + module.__start_time_hook_handle__ = module.register_forward_pre_hook(start_time_hook) def end_time_hook(module, input, output): get_accelerator().synchronize() module.__duration__ += time.time() - module.__start_time__ if not hasattr(module, "__end_time_hook_handle__"): - module.__end_time_hook_handle__ = module.register_forward_hook( - end_time_hook) + module.__end_time_hook_handle__ = module.register_forward_hook(end_time_hook) self.model.apply(partial(register_module_hooks, ignore_list=ignore_list)) self.started = True @@ -154,6 +155,7 @@ class FlopsProfiler(object): Adds or resets the extra attributes. """ + def add_or_reset_attrs(module): module.__flops__ = 0 module.__macs__ = 0 @@ -232,15 +234,9 @@ class FlopsProfiler(object): Returns: The number of parameters in the model. """ - return params_to_string( - self.model.__params__) if as_string else self.model.__params__ - - def print_model_profile(self, - profile_step=1, - module_depth=-1, - top_modules=1, - detailed=True, - output_file=None): + return params_to_string(self.model.__params__) if as_string else self.model.__params__ + + def print_model_profile(self, profile_step=1, module_depth=-1, top_modules=1, detailed=True, output_file=None): """Prints the model graph with the measured profile attached to each module. 
Args: @@ -273,28 +269,21 @@ class FlopsProfiler(object): self.macs = total_macs self.params = total_params - print( - "\n-------------------------- DeepSpeed Flops Profiler --------------------------" - ) + print("\n-------------------------- DeepSpeed Flops Profiler --------------------------") print(f'Profile Summary at step {profile_step}:') print( "Notations:\ndata parallel size (dp_size), model parallel size(mp_size),\nnumber of parameters (params), number of multiply-accumulate operations(MACs),\nnumber of floating-point operations (flops), floating-point operations per second (FLOPS),\nfwd latency (forward propagation latency), bwd latency (backward propagation latency),\nstep (weights update latency), iter latency (sum of fwd, bwd and step latency)\n" ) if self.ds_engine: print('{:<60} {:<8}'.format('world size: ', self.ds_engine.world_size)) - print('{:<60} {:<8}'.format('data parallel size: ', - self.ds_engine.dp_world_size)) - print('{:<60} {:<8}'.format('model parallel size: ', - self.ds_engine.mp_world_size)) - print('{:<60} {:<8}'.format( - 'batch size per GPU: ', - self.ds_engine.train_micro_batch_size_per_gpu())) + print('{:<60} {:<8}'.format('data parallel size: ', self.ds_engine.dp_world_size)) + print('{:<60} {:<8}'.format('model parallel size: ', self.ds_engine.mp_world_size)) + print('{:<60} {:<8}'.format('batch size per GPU: ', self.ds_engine.train_micro_batch_size_per_gpu())) print('{:<60} {:<8}'.format('params per gpu: ', params_to_string(total_params))) print('{:<60} {:<8}'.format( 'params of model = params per GPU * mp_size: ', - params_to_string(total_params * - ((self.ds_engine.mp_world_size) if self.ds_engine else 1)))) + params_to_string(total_params * ((self.ds_engine.mp_world_size) if self.ds_engine else 1)))) print('{:<60} {:<8}'.format('fwd MACs per GPU: ', macs_to_string(total_macs))) @@ -302,43 +291,33 @@ class FlopsProfiler(object): print('{:<60} {:<8}'.format( 'fwd flops of model = fwd flops per GPU * mp_size: ', - num_to_string(total_flops * - ((self.ds_engine.mp_world_size) if self.ds_engine else 1)))) + num_to_string(total_flops * ((self.ds_engine.mp_world_size) if self.ds_engine else 1)))) fwd_latency = self.get_total_duration() if self.ds_engine and self.ds_engine.wall_clock_breakdown(): fwd_latency = self.ds_engine.timers('forward').elapsed(False) / 1000.0 print('{:<60} {:<8}'.format('fwd latency: ', duration_to_string(fwd_latency))) - print('{:<60} {:<8}'.format( - 'fwd FLOPS per GPU = fwd flops per GPU / fwd latency: ', - flops_to_string(total_flops / fwd_latency))) + print('{:<60} {:<8}'.format('fwd FLOPS per GPU = fwd flops per GPU / fwd latency: ', + flops_to_string(total_flops / fwd_latency))) if self.ds_engine and self.ds_engine.wall_clock_breakdown(): bwd_latency = self.ds_engine.timers('backward').elapsed(False) / 1000.0 step_latency = self.ds_engine.timers('step').elapsed(False) / 1000.0 - print('{:<60} {:<8}'.format('bwd latency: ', - duration_to_string(bwd_latency))) - print('{:<60} {:<8}'.format( - 'bwd FLOPS per GPU = 2 * fwd flops per GPU / bwd latency: ', - flops_to_string(2 * total_flops / bwd_latency))) - print('{:<60} {:<8}'.format( - 'fwd+bwd FLOPS per GPU = 3 * fwd flops per GPU / (fwd+bwd latency): ', - flops_to_string(3 * total_flops / (fwd_latency + bwd_latency)))) - - print('{:<60} {:<8}'.format('step latency: ', - duration_to_string(step_latency))) + print('{:<60} {:<8}'.format('bwd latency: ', duration_to_string(bwd_latency))) + print('{:<60} {:<8}'.format('bwd FLOPS per GPU = 2 * fwd flops per GPU / bwd latency: ', + 
flops_to_string(2 * total_flops / bwd_latency))) + print('{:<60} {:<8}'.format('fwd+bwd FLOPS per GPU = 3 * fwd flops per GPU / (fwd+bwd latency): ', + flops_to_string(3 * total_flops / (fwd_latency + bwd_latency)))) + + print('{:<60} {:<8}'.format('step latency: ', duration_to_string(step_latency))) iter_latency = fwd_latency + bwd_latency + step_latency - print('{:<60} {:<8}'.format('iter latency: ', - duration_to_string(iter_latency))) - print('{:<60} {:<8}'.format( - 'FLOPS per GPU = 3 * fwd flops per GPU / iter latency: ', - flops_to_string(3 * total_flops / iter_latency))) + print('{:<60} {:<8}'.format('iter latency: ', duration_to_string(iter_latency))) + print('{:<60} {:<8}'.format('FLOPS per GPU = 3 * fwd flops per GPU / iter latency: ', + flops_to_string(3 * total_flops / iter_latency))) - samples_per_iter = self.ds_engine.train_micro_batch_size_per_gpu( - ) * self.ds_engine.world_size - print('{:<60} {:<8.2f}'.format('samples/second: ', - samples_per_iter / iter_latency)) + samples_per_iter = self.ds_engine.train_micro_batch_size_per_gpu() * self.ds_engine.world_size + print('{:<60} {:<8.2f}'.format('samples/second: ', samples_per_iter / iter_latency)) def flops_repr(module): params = module.__params__ @@ -353,9 +332,7 @@ class FlopsProfiler(object): duration = get_module_duration(module) items.append(duration_to_string(duration)) - items.append( - "{:.2%} latency".format(0.0 if total_duration == 0 else duration / - total_duration)) + items.append("{:.2%} latency".format(0.0 if total_duration == 0 else duration / total_duration)) items.append(flops_to_string(0.0 if duration == 0 else flops / duration)) items.append(module.original_extra_repr()) return ", ".join(items) @@ -374,16 +351,11 @@ class FlopsProfiler(object): self.model.apply(add_extra_repr) - print( - "\n----------------------------- Aggregated Profile per GPU -----------------------------" - ) - self.print_model_aggregated_profile(module_depth=module_depth, - top_modules=top_modules) + print("\n----------------------------- Aggregated Profile per GPU -----------------------------") + self.print_model_aggregated_profile(module_depth=module_depth, top_modules=top_modules) if detailed: - print( - "\n------------------------------ Detailed Profile per GPU ------------------------------" - ) + print("\n------------------------------ Detailed Profile per GPU ------------------------------") print( "Each module profile is listed after its name in the following order: \nparams, percentage of total params, MACs, percentage of total MACs, fwd latency, percentage of total fwd latency, fwd FLOPS" ) @@ -394,9 +366,7 @@ class FlopsProfiler(object): self.model.apply(del_extra_repr) - print( - "------------------------------------------------------------------------------" - ) + print("------------------------------------------------------------------------------") if output_file: sys.stdout = original_stdout @@ -411,9 +381,7 @@ class FlopsProfiler(object): """ info = {} if not hasattr(self.model, "__flops__"): - print( - "no __flops__ attribute in the model, call this function after start_profile and before end_profile" - ) + print("no __flops__ attribute in the model, call this function after start_profile and before end_profile") return def walk_module(module, curr_depth, info): @@ -439,33 +407,22 @@ class FlopsProfiler(object): if module_depth == -1: depth = len(info) - 1 - print( - f'Top {top_modules} modules in terms of params, MACs or fwd latency at different model depths:' - ) + print(f'Top {top_modules} modules in terms of 
params, MACs or fwd latency at different model depths:') for d in range(depth): num_items = min(top_modules, len(info[d])) sort_macs = { k: macs_to_string(v[0]) - for k, - v in sorted(info[d].items(), - key=lambda item: item[1][0], - reverse=True)[:num_items] + for k, v in sorted(info[d].items(), key=lambda item: item[1][0], reverse=True)[:num_items] } sort_params = { k: params_to_string(v[1]) - for k, - v in sorted(info[d].items(), - key=lambda item: item[1][1], - reverse=True)[:num_items] + for k, v in sorted(info[d].items(), key=lambda item: item[1][1], reverse=True)[:num_items] } sort_time = { k: duration_to_string(v[2]) - for k, - v in sorted(info[d].items(), - key=lambda item: item[1][2], - reverse=True)[:num_items] + for k, v in sorted(info[d].items(), key=lambda item: item[1][2], reverse=True)[:num_items] } print(f"depth {d}:") @@ -499,9 +456,7 @@ def _elu_flops_compute(input: Tensor, alpha: float = 1.0, inplace: bool = False) return input.numel(), 0 -def _leaky_relu_flops_compute(input: Tensor, - negative_slope: float = 0.01, - inplace: bool = False): +def _leaky_relu_flops_compute(input: Tensor, negative_slope: float = 0.01, inplace: bool = False): return input.numel(), 0 @@ -529,13 +484,7 @@ def _pool_flops_compute(input, return input.numel(), 0 -def _conv_flops_compute(input, - weight, - bias=None, - stride=1, - padding=0, - dilation=1, - groups=1): +def _conv_flops_compute(input, weight, bias=None, stride=1, padding=0, dilation=1, groups=1): assert weight.shape[1] * groups == input.shape[1] batch_size = input.shape[0] @@ -552,8 +501,8 @@ def _conv_flops_compute(input, output_dims = [] for idx, input_dim in enumerate(input_dims): - output_dim = (input_dim + 2 * paddings[idx] - - (dilations[idx] * (kernel_dims[idx] - 1) + 1)) // strides[idx] + 1 + output_dim = (input_dim + 2 * paddings[idx] - (dilations[idx] * + (kernel_dims[idx] - 1) + 1)) // strides[idx] + 1 output_dims.append(output_dim) filters_per_channel = out_channels // groups @@ -594,8 +543,8 @@ def _conv_trans_flops_compute( output_dims = [] for idx, input_dim in enumerate(input_dims): - output_dim = (input_dim + 2 * paddings[idx] - - (dilations[idx] * (kernel_dims[idx] - 1) + 1)) // strides[idx] + 1 + output_dim = (input_dim + 2 * paddings[idx] - (dilations[idx] * + (kernel_dims[idx] - 1) + 1)) // strides[idx] + 1 output_dims.append(output_dim) paddings = padding if type(padding) is tuple else (padding, padding) @@ -1210,8 +1159,7 @@ def get_model_profile( assert len(input_shape) >= 1, "input_shape must have at least one element" try: input = torch.ones(()).new_empty( - (*input_shape, - ), + (*input_shape, ), dtype=next(model.parameters()).dtype, device=next(model.parameters()).device, ) diff --git a/deepspeed/runtime/__init__.py b/deepspeed/runtime/__init__.py index aa1f39f9e194699befc81c01eb057de2150549b7..347ff7993d825c50fd38de6791f365d9bb1eeb4b 100644 --- a/deepspeed/runtime/__init__.py +++ b/deepspeed/runtime/__init__.py @@ -1,4 +1,7 @@ -'''Copyright The Microsoft DeepSpeed Team''' +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team class DeepSpeedOptimizer(object): diff --git a/deepspeed/runtime/activation_checkpointing/__init__.py b/deepspeed/runtime/activation_checkpointing/__init__.py index fcb45ab2b68516814a4bfbffebf2e01cbfefd527..6c5067f71c8faf166bc78e88f9b62e8627dda7c7 100644 --- a/deepspeed/runtime/activation_checkpointing/__init__.py +++ b/deepspeed/runtime/activation_checkpointing/__init__.py @@ -1 +1,5 @@ +# Copyright (c) Microsoft Corporation. 
+# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team '''Copyright The Microsoft DeepSpeed Team''' diff --git a/deepspeed/runtime/activation_checkpointing/checkpointing.py b/deepspeed/runtime/activation_checkpointing/checkpointing.py index 74372540106f0c3ab7bfdf2e9d647aa303d10574..c305ade64621809e9d9598dae498007137fbedb8 100644 --- a/deepspeed/runtime/activation_checkpointing/checkpointing.py +++ b/deepspeed/runtime/activation_checkpointing/checkpointing.py @@ -1,8 +1,8 @@ -'''Copyright The Microsoft DeepSpeed Team''' -''' -Copyright (c) Microsoft Corporation -Licensed under the MIT license. +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 +# DeepSpeed Team +""" Use to partition the activations stored for backward propagation Therefore reduces the memory consumption Also implements CPU checkpointing and contiguous memory checkpointing @@ -10,7 +10,7 @@ Reduces memory consumption and memory fragmentation Code for rng checkpointing taken from NVIDIA Megatron-LM mpu/random.py b886b7bb972afe72bac0f5de4f42a4a7bae8ebef -''' +""" # Parts of the code here are adapted from PyTorch # repo: https://github.com/pytorch/pytorch @@ -82,9 +82,7 @@ def detach_variable(inputs, device=None): out.append(x) return tuple(out) else: - raise RuntimeError( - "Only tuple of tensors is supported. Got Unsupported input type: ", - type(inputs).__name__) + raise RuntimeError("Only tuple of tensors is supported. Got Unsupported input type: ", type(inputs).__name__) def _set_cuda_rng_state(new_state, device=-1): @@ -92,7 +90,7 @@ def _set_cuda_rng_state(new_state, device=-1): Arguments: new_state (torch.ByteTensor): The desired state - This function is adapted from PyTorch repo (torch.cuda.set_rng_state) + This function is adapted from PyTorch repo (torch.cuda.set_rng_state) #ignore-cuda with a single change: the input state is not cloned. Cloning caused major performance issues for +4 GPU cases. """ @@ -128,6 +126,7 @@ class CudaRNGStatesTracker: rng state, we can perform operations and return to our starting cuda state. """ + def __init__(self): # Map from a string name to the cuda rng state. self.states_ = {} @@ -227,13 +226,9 @@ def model_parallel_cuda_manual_seed(seed): logger.info( '> initializing model parallel cuda seeds on global rank {}, ' 'model parallel rank {}, and data parallel rank {} with ' - 'model parallel seed: {} and data parallel seed: {}'.format( - dist.get_rank(), - tp_rank, - mpu.get_data_parallel_rank(), - model_parallel_seed, - data_parallel_seed), - ) + 'model parallel seed: {} and data parallel seed: {}'.format(dist.get_rank(), tp_rank, + mpu.get_data_parallel_rank(), + model_parallel_seed, data_parallel_seed), ) _CUDA_RNG_STATE_TRACKER.reset() # Set the default state. 
get_accelerator().manual_seed(data_parallel_seed) @@ -282,9 +277,7 @@ def gather_partitioned_activations(tensors, device=None): if device is not None: flat_tensor = torch.zeros([tensor_size], dtype=item.dtype, device=device) else: - flat_tensor = torch.zeros([tensor_size], - dtype=item.dtype, - device=item.device) + flat_tensor = torch.zeros([tensor_size], dtype=item.dtype, device=item.device) partitions = [] for i in range(mp_size): part_i = flat_tensor.narrow(0, partition_size * i, partition_size) @@ -384,28 +377,21 @@ def partition_activations(args, cpu_checkpoint, contiguous_checkpoint): i = arg_index - num_non_fp_tensors partition_size = get_partition_size(item) - partition = item.detach().contiguous().view(-1).narrow( - 0, - get_partition_start(item), - partition_size).clone() + partition = item.detach().contiguous().view(-1).narrow(0, get_partition_start(item), partition_size).clone() buffer_device = torch.device('cpu') if cpu_checkpoint else partition.device if contiguous_checkpoint: if i >= len(contiguous_data_buffers): tensor_list = [ - torch.tensor(()).new_empty([partition_size], - dtype=partition.dtype, - device=buffer_device) + torch.tensor(()).new_empty([partition_size], dtype=partition.dtype, device=buffer_device) for _ in range(num_layers) ] contiguous_data_buffers.append(tensor_list) data_offsets.append(0) elif contiguous_data_buffers[i] is None: tensor_list = [ - torch.tensor(()).new_empty([partition_size], - dtype=partition.dtype, - device=buffer_device) + torch.tensor(()).new_empty([partition_size], dtype=partition.dtype, device=buffer_device) for _ in range(num_layers) ] contiguous_data_buffers[i] = tensor_list @@ -419,14 +405,10 @@ def partition_activations(args, cpu_checkpoint, contiguous_checkpoint): # previously launched GPU kernels, there is a small # window of time here for CPUs to populate pages asynchronously. 
contiguous_data_buffers[i][data_offsets[i]].data[range( - 0, - contiguous_data_buffers[i][data_offsets[i]].data.shape[0], - int(mmap.PAGESIZE / - contiguous_data_buffers[i][data_offsets[i]].data.element_size()) - )] = 0 - - contiguous_partition = contiguous_data_buffers[i][ - data_offsets[i]].data.copy_(partition.data) + 0, contiguous_data_buffers[i][data_offsets[i]].data.shape[0], + int(mmap.PAGESIZE / contiguous_data_buffers[i][data_offsets[i]].data.element_size()))] = 0 + + contiguous_partition = contiguous_data_buffers[i][data_offsets[i]].data.copy_(partition.data) data_offsets[i] = data_offsets[i] + 1 inputs.append(contiguous_partition) else: @@ -459,21 +441,14 @@ def get_partitioned_activations_for_backward(args, inputs, contiguous_checkpoint if i >= len(contiguous_size_buffers): tmp = torch.tensor(()) contiguous_size_buffers.append( - tmp.new_empty([numel * num_layers], - dtype=size.dtype, - device=size.device)) + tmp.new_empty([numel * num_layers], dtype=size.dtype, device=size.device)) size_offsets.append(0) elif contiguous_size_buffers[i] is None: tmp = torch.tensor(()) - contiguous_size_buffers[i] = tmp.new_empty([numel * num_layers], - dtype=size.dtype, - device=size.device) + contiguous_size_buffers[i] = tmp.new_empty([numel * num_layers], dtype=size.dtype, device=size.device) size_offsets[i] = 0 - contiguous_size = contiguous_size_buffers[i].narrow( - 0, - size_offsets[i], - numel).data.copy_(size.data) + contiguous_size = contiguous_size_buffers[i].narrow(0, size_offsets[i], numel).data.copy_(size.data) contiguous_size = contiguous_size.view_as(size) size_offsets[i] = size_offsets[i] + numel new_args.append(contiguous_size) @@ -499,13 +474,14 @@ def get_cpu_activations_for_backward(args, inputs): class CheckpointFunction(torch.autograd.Function): """This function is adapted from torch.utils.checkpoint with two main changes: - 1) torch.cuda.set_rng_state is replaced with `_set_cuda_rng_state` + 1) torch.cuda.set_rng_state is replaced with `_set_cuda_rng_state` #ignore-cuda 2) the states in the model parallel tracker are also properly tracked/set/reset. 
3) Performance activation partitioning, contiguous memory optimization 4) CPU Checkpointing 5) Profile forward and backward functions """ + @staticmethod def forward(ctx, run_function, all_outputs, *args): global mpu, timers, SYNCHRONIZE, PROFILE_TIME @@ -551,12 +527,9 @@ class CheckpointFunction(torch.autograd.Function): see_memory_usage("First Forward Beginning", force=False) if dist.get_rank() == 0: logger.info(f"Activation Checkpointing Information") + logger.info(f"----Partition Activations {PARTITION_ACTIVATIONS}, CPU CHECKPOINTING {CPU_CHECKPOINT}") logger.info( - f"----Partition Activations {PARTITION_ACTIVATIONS}, CPU CHECKPOINTING {CPU_CHECKPOINT}" - ) - logger.info( - f"----contiguous Memory Checkpointing {CONTIGUOUS_CHECKPOINTING} with {num_layers} total layers" - ) + f"----contiguous Memory Checkpointing {CONTIGUOUS_CHECKPOINTING} with {num_layers} total layers") logger.info(f"----Synchronization {SYNCHRONIZE}") logger.info(f"----Profiling time in checkpointing {PROFILE_TIME}") @@ -564,18 +537,12 @@ class CheckpointFunction(torch.autograd.Function): transport_stream = get_accelerator().Stream(device=cuda_device) if PARTITION_ACTIVATIONS: - inputs = partition_activations(args, - CPU_CHECKPOINT, - CONTIGUOUS_CHECKPOINTING) + inputs = partition_activations(args, CPU_CHECKPOINT, CONTIGUOUS_CHECKPOINTING) elif CPU_CHECKPOINT: - inputs = copy_to_device(args, - device=torch.device('cpu'), - criterion_func=is_activation_to_checkpoint) + inputs = copy_to_device(args, device=torch.device('cpu'), criterion_func=is_activation_to_checkpoint) # just in case something funky is happening such as reuse of inputs - inputs_cuda = copy_to_device(args, - device=cuda_device, - criterion_func=is_activation_to_checkpoint) + inputs_cuda = copy_to_device(args, device=cuda_device, criterion_func=is_activation_to_checkpoint) # Copy the rng states. 
ctx.fwd_cpu_rng_state = torch.get_rng_state() @@ -591,10 +558,7 @@ class CheckpointFunction(torch.autograd.Function): del inputs_cuda if PARTITION_ACTIVATIONS: - new_args = get_partitioned_activations_for_backward( - args, - inputs, - CONTIGUOUS_CHECKPOINTING) + new_args = get_partitioned_activations_for_backward(args, inputs, CONTIGUOUS_CHECKPOINTING) assert len(new_args) % 2 == 0, f'save_for_backward called with odd number of args, {len(new_args)}' save_args_for_backward(*new_args) elif CPU_CHECKPOINT: @@ -613,9 +577,7 @@ class CheckpointFunction(torch.autograd.Function): if torch.is_tensor(outputs): non_grad_outputs = [outputs] if not outputs.is_floating_point() else [] else: - non_grad_outputs = [ - o for o in outputs if torch.is_tensor(o) and not o.is_floating_point() - ] + non_grad_outputs = [o for o in outputs if torch.is_tensor(o) and not o.is_floating_point()] ctx.mark_non_differentiable(*non_grad_outputs) if torch.is_tensor(outputs): @@ -661,14 +623,11 @@ class CheckpointFunction(torch.autograd.Function): if PARTITION_ACTIVATIONS: # with get_accelerator().stream(transport_stream): - inputs = gather_partitioned_activations( - ctx.deepspeed_saved_tensors, - device=cuda_device if CPU_CHECKPOINT else None) + inputs = gather_partitioned_activations(ctx.deepspeed_saved_tensors, + device=cuda_device if CPU_CHECKPOINT else None) detached_inputs = detach_variable(inputs) elif CPU_CHECKPOINT: - inputs = move_to_device(ctx.deepspeed_saved_tensors, - cuda_device, - is_activation_to_checkpoint) + inputs = move_to_device(ctx.deepspeed_saved_tensors, cuda_device, is_activation_to_checkpoint) detached_inputs = detach_variable(inputs) else: inputs = ctx.deepspeed_saved_tensors @@ -762,8 +721,7 @@ def partition_activations_in_checkpoint(partition_activation): global PARTITION_ACTIVATIONS PARTITION_ACTIVATIONS = partition_activation if dist.get_rank() == 0: - logger.info( - f"**************Partition Activations {PARTITION_ACTIVATIONS}************") + logger.info(f"**************Partition Activations {PARTITION_ACTIVATIONS}************") def set_num_layers(nlayers): diff --git a/deepspeed/runtime/activation_checkpointing/config.py b/deepspeed/runtime/activation_checkpointing/config.py index 0e79579435b82134676dc47a8c3f625ab3acfd81..13df4b9812985426aa1a895284e321d2877f4631 100755 --- a/deepspeed/runtime/activation_checkpointing/config.py +++ b/deepspeed/runtime/activation_checkpointing/config.py @@ -1,8 +1,7 @@ -'''Copyright The Microsoft DeepSpeed Team''' -""" -Copyright (c) Microsoft Corporation -Licensed under the MIT license. -""" +# Copyright (c) Microsoft Corporation. 
+# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team from deepspeed.runtime.config_utils import get_scalar_param, DeepSpeedConfigObject @@ -48,16 +47,15 @@ ACT_CHKPT = 'activation_checkpointing' ACT_CHKPT_DEFAULT = { ACT_CHKPT_PARTITION_ACTIVATIONS: ACT_CHKPT_PARTITION_ACTIVATIONS_DEFAULT, ACT_CHKPT_NUMBER_CHECKPOINTS: ACT_CHKPT_NUMBER_CHECKPOINTS_DEFAULT, - ACT_CHKPT_CONTIGUOUS_MEMORY_OPTIMIZATION: - ACT_CHKPT_CONTIGUOUS_MEMORY_OPTIMIZATION_DEFAULT, - ACT_CHKPT_SYNCHRONIZE_CHECKPOINT_BOUNDARY: - ACT_CHKPT_SYNCHRONIZE_CHECKPOINT_BOUNDARY_DEFAULT, + ACT_CHKPT_CONTIGUOUS_MEMORY_OPTIMIZATION: ACT_CHKPT_CONTIGUOUS_MEMORY_OPTIMIZATION_DEFAULT, + ACT_CHKPT_SYNCHRONIZE_CHECKPOINT_BOUNDARY: ACT_CHKPT_SYNCHRONIZE_CHECKPOINT_BOUNDARY_DEFAULT, ACT_CHKPT_PROFILE: ACT_CHKPT_PROFILE_DEFAULT, ACT_CHKPT_CPU_CHECKPOINTING: ACT_CHKPT_CPU_CHECKPOINTING_DEFAULT } class DeepSpeedActivationCheckpointingConfig(DeepSpeedConfigObject): + def __init__(self, param_dict): super(DeepSpeedActivationCheckpointingConfig, self).__init__() @@ -76,29 +74,21 @@ class DeepSpeedActivationCheckpointingConfig(DeepSpeedConfigObject): self._initialize(act_chkpt_config_dict) def _initialize(self, act_chkpt_config_dict): - self.partition_activations = get_scalar_param( - act_chkpt_config_dict, - ACT_CHKPT_PARTITION_ACTIVATIONS, - ACT_CHKPT_PARTITION_ACTIVATIONS_DEFAULT) - - self.contiguous_memory_optimization = get_scalar_param( - act_chkpt_config_dict, - ACT_CHKPT_CONTIGUOUS_MEMORY_OPTIMIZATION, - ACT_CHKPT_CONTIGUOUS_MEMORY_OPTIMIZATION_DEFAULT) - - self.cpu_checkpointing = get_scalar_param(act_chkpt_config_dict, - ACT_CHKPT_CPU_CHECKPOINTING, + self.partition_activations = get_scalar_param(act_chkpt_config_dict, ACT_CHKPT_PARTITION_ACTIVATIONS, + ACT_CHKPT_PARTITION_ACTIVATIONS_DEFAULT) + + self.contiguous_memory_optimization = get_scalar_param(act_chkpt_config_dict, + ACT_CHKPT_CONTIGUOUS_MEMORY_OPTIMIZATION, + ACT_CHKPT_CONTIGUOUS_MEMORY_OPTIMIZATION_DEFAULT) + + self.cpu_checkpointing = get_scalar_param(act_chkpt_config_dict, ACT_CHKPT_CPU_CHECKPOINTING, ACT_CHKPT_CPU_CHECKPOINTING_DEFAULT) - self.number_checkpoints = get_scalar_param(act_chkpt_config_dict, - ACT_CHKPT_NUMBER_CHECKPOINTS, + self.number_checkpoints = get_scalar_param(act_chkpt_config_dict, ACT_CHKPT_NUMBER_CHECKPOINTS, ACT_CHKPT_NUMBER_CHECKPOINTS_DEFAULT) - self.profile = get_scalar_param(act_chkpt_config_dict, - ACT_CHKPT_PROFILE, - ACT_CHKPT_PROFILE_DEFAULT) + self.profile = get_scalar_param(act_chkpt_config_dict, ACT_CHKPT_PROFILE, ACT_CHKPT_PROFILE_DEFAULT) - self.synchronize_checkpoint_boundary = get_scalar_param( - act_chkpt_config_dict, - ACT_CHKPT_SYNCHRONIZE_CHECKPOINT_BOUNDARY, - ACT_CHKPT_SYNCHRONIZE_CHECKPOINT_BOUNDARY_DEFAULT) + self.synchronize_checkpoint_boundary = get_scalar_param(act_chkpt_config_dict, + ACT_CHKPT_SYNCHRONIZE_CHECKPOINT_BOUNDARY, + ACT_CHKPT_SYNCHRONIZE_CHECKPOINT_BOUNDARY_DEFAULT) diff --git a/deepspeed/runtime/bf16_optimizer.py b/deepspeed/runtime/bf16_optimizer.py index f667d279ca542bd32dcea0930bdb1c8b1a2b7325..85e17b7e06bcddc7025c5a7ec02656c11ec7a24d 100644 --- a/deepspeed/runtime/bf16_optimizer.py +++ b/deepspeed/runtime/bf16_optimizer.py @@ -1,6 +1,7 @@ -""" -Copyright 2022 The Microsoft DeepSpeed Team -""" +# Copyright (c) Microsoft Corporation. 
+# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team from collections import OrderedDict import torch @@ -13,29 +14,21 @@ from deepspeed.runtime import ZeROOptimizer from packaging import version as pkg_version from deepspeed.git_version_info import version -from deepspeed.runtime.utils import (get_global_norm_of_tensors, - clip_tensors_by_global_norm, - DummyOptim, - align_dense_tensors, - all_gather_dp_groups, - bwc_tensor_model_parallel_rank, - is_model_parallel_parameter, - see_memory_usage) +from deepspeed.runtime.utils import (get_global_norm_of_tensors, clip_tensors_by_global_norm, DummyOptim, + align_dense_tensors, all_gather_dp_groups, bwc_tensor_model_parallel_rank, + is_model_parallel_parameter, see_memory_usage) from deepspeed.utils import link_hp_params, fragment_address from deepspeed.checkpoint import enable_universal_checkpoint -from deepspeed.checkpoint.constants import (DS_VERSION, - PARTITION_COUNT, - BASE_OPTIMIZER_STATE, - SINGLE_PARTITION_OF_FP32_GROUPS, - CLIP_GRAD, - GROUP_PADDINGS, +from deepspeed.checkpoint.constants import (DS_VERSION, PARTITION_COUNT, BASE_OPTIMIZER_STATE, + SINGLE_PARTITION_OF_FP32_GROUPS, CLIP_GRAD, GROUP_PADDINGS, PARAM_SLICE_MAPPINGS) setattr(sys.modules[__name__], 'fragment_address', fragment_address) class BF16_Optimizer(ZeROOptimizer): + def __init__(self, init_optimizer, param_names, @@ -58,9 +51,7 @@ class BF16_Optimizer(ZeROOptimizer): self.allgather_bucket_size = int(allgather_bucket_size) self.dp_process_group = dp_process_group self.dp_rank = dist.get_rank(group=self.dp_process_group) - self.real_dp_process_group = [ - dp_process_group for i in range(len(self.optimizer.param_groups)) - ] + self.real_dp_process_group = [dp_process_group for i in range(len(self.optimizer.param_groups))] # Load pre-built or JIT compile (un)flatten ops util_ops = UtilsBuilder().load() @@ -95,9 +86,7 @@ class BF16_Optimizer(ZeROOptimizer): def _setup_for_real_optimizer(self): dp_world_size = dist.get_world_size(group=self.dp_process_group) - self.partition_count = [ - dp_world_size for i in range(len(self.optimizer.param_groups)) - ] + self.partition_count = [dp_world_size for i in range(len(self.optimizer.param_groups))] for i, param_group in enumerate(self.optimizer.param_groups): see_memory_usage(f'before initializing group {i}', force=True) @@ -105,69 +94,55 @@ class BF16_Optimizer(ZeROOptimizer): partition_id = dist.get_rank(group=self.real_dp_process_group[i]) # grab the original list - self.bf16_groups.append(param_group['params']) + trainable_parameters = [param for param in param_group['params'] if param.requires_grad] + self.bf16_groups.append(trainable_parameters) # create flat bf16 params self.bf16_groups_flat.append( - self._flatten_dense_tensors_aligned( - self.bf16_groups[i], - self.nccl_start_alignment_factor * dp_world_size)) + self._flatten_dense_tensors_aligned(self.bf16_groups[i], + self.nccl_start_alignment_factor * dp_world_size)) # Make bf16 params point to flat tensor storage - self._update_storage_to_flattened_tensor( - tensor_list=self.bf16_groups[i], - flat_tensor=self.bf16_groups_flat[i]) + self._update_storage_to_flattened_tensor(tensor_list=self.bf16_groups[i], + flat_tensor=self.bf16_groups_flat[i]) # divide flat weights into equal sized partitions partition_size = self.bf16_groups_flat[i].numel() // dp_world_size bf16_dp_partitions = [ - self.bf16_groups_flat[i].narrow(0, - dp_index * partition_size, - partition_size) + self.bf16_groups_flat[i].narrow(0, dp_index * partition_size, partition_size) for dp_index in 
range(dp_world_size) ] self.bf16_partitioned_groups.append(bf16_dp_partitions) # create fp32 params partition - self.fp32_groups_flat_partition.append( - bf16_dp_partitions[partition_id].clone().float().detach()) + self.fp32_groups_flat_partition.append(bf16_dp_partitions[partition_id].clone().float().detach()) self.fp32_groups_flat_partition[i].requires_grad = True num_elem_list = [t.numel() for t in self.bf16_groups[i]] # create fp32 gradients - self.fp32_groups_gradients_flat.append( - torch.zeros_like(self.bf16_groups_flat[i], - dtype=torch.float32)) + self.fp32_groups_gradients_flat.append(torch.zeros_like(self.bf16_groups_flat[i], dtype=torch.float32)) # track individual fp32 gradients for entire model - fp32_gradients = self._split_flat_tensor( - flat_tensor=self.fp32_groups_gradients_flat[i], - num_elem_list=num_elem_list) + fp32_gradients = self._split_flat_tensor(flat_tensor=self.fp32_groups_gradients_flat[i], + num_elem_list=num_elem_list) self.fp32_groups_gradients.append(fp32_gradients) self.fp32_groups_gradient_dict[i] = fp32_gradients # flat tensor corresponding to actual fp32 gradients (i.e., minus alignment padding) length_without_padding = sum(num_elem_list) self.fp32_groups_actual_gradients_flat.append( - torch.narrow(self.fp32_groups_gradients_flat[i], - 0, - 0, - length_without_padding)) + torch.narrow(self.fp32_groups_gradients_flat[i], 0, 0, length_without_padding)) # flat tensor corresponding to gradient partition self.fp32_groups_gradient_flat_partition.append( - torch.narrow(self.fp32_groups_gradients_flat[i], - 0, - partition_id * partition_size, - partition_size)) + torch.narrow(self.fp32_groups_gradients_flat[i], 0, partition_id * partition_size, partition_size)) # track fp32 gradient updates self.fp32_groups_has_gradients.append([False] * len(self.bf16_groups[i])) # Record padding required for alignment - if partition_id == dist.get_world_size( - group=self.real_dp_process_group[i]) - 1: + if partition_id == dist.get_world_size(group=self.real_dp_process_group[i]) - 1: padding = self.bf16_groups_flat[i].numel() - length_without_padding else: padding = 0 @@ -199,8 +174,7 @@ class BF16_Optimizer(ZeROOptimizer): for lp in self.bf16_groups[i]: if lp._hp_mapping is not None: lp_name = self.param_names[lp] - param_mapping_per_group[ - lp_name] = lp._hp_mapping.get_hp_fragment_address() + param_mapping_per_group[lp_name] = lp._hp_mapping.get_hp_fragment_address() param_mapping.append(param_mapping_per_group) return param_mapping @@ -212,17 +186,16 @@ class BF16_Optimizer(ZeROOptimizer): partition_id = dist.get_rank(group=self.real_dp_process_group[i]) partition_size = self.bf16_groups_flat[i].numel() // dp_world_size flat_hp_partition = self.fp32_groups_flat_partition[i] - link_hp_params( - lp_param_list=self.bf16_groups[i], - flat_hp_partition=flat_hp_partition, - gradient_dict=self.fp32_groups_gradient_dict, - offload_gradient_dict=None, - use_offload=False, - param_group_index=i, - partition_start=partition_id * partition_size, - partition_size=partition_size, - partition_optimizer_state=self.optimizer.state[flat_hp_partition], - dp_group=self.real_dp_process_group[i]) + link_hp_params(lp_param_list=self.bf16_groups[i], + flat_hp_partition=flat_hp_partition, + gradient_dict=self.fp32_groups_gradient_dict, + offload_gradient_dict=None, + use_offload=False, + param_group_index=i, + partition_start=partition_id * partition_size, + partition_size=partition_size, + partition_optimizer_state=self.optimizer.state[flat_hp_partition], + 
dp_group=self.real_dp_process_group[i]) def initialize_optimizer_states(self): """Take an optimizer step with zero-valued gradients to allocate internal @@ -231,7 +204,8 @@ class BF16_Optimizer(ZeROOptimizer): This helps prevent memory fragmentation by allocating optimizer state at the beginning of training instead of after activations have been allocated. """ - for param_partition, grad_partition in zip(self.fp32_groups_flat_partition, self.fp32_groups_gradient_flat_partition): + for param_partition, grad_partition in zip(self.fp32_groups_flat_partition, + self.fp32_groups_gradient_flat_partition): param_partition.grad = grad_partition self.optimizer.step() @@ -262,19 +236,17 @@ class BF16_Optimizer(ZeROOptimizer): if closure is not None: raise NotImplementedError(f'{self.__class__} does not support closure.') - all_groups_norm = get_global_norm_of_tensors( - input_tensors=self.get_grads_for_norm(), - mpu=self.mpu, - norm_type=self.norm_type) + all_groups_norm = get_global_norm_of_tensors(input_tensors=self.get_grads_for_norm(), + mpu=self.mpu, + norm_type=self.norm_type) self._global_grad_norm = all_groups_norm assert all_groups_norm > 0. if self.clip_grad > 0.: - clip_tensors_by_global_norm( - input_tensors=self.get_grads_for_norm(for_clipping=True), - max_norm=self.clip_grad, - global_norm=all_groups_norm, - mpu=self.mpu) + clip_tensors_by_global_norm(input_tensors=self.get_grads_for_norm(for_clipping=True), + max_norm=self.clip_grad, + global_norm=all_groups_norm, + mpu=self.mpu) self.optimizer.step() @@ -343,7 +315,8 @@ class BF16_Optimizer(ZeROOptimizer): @torch.no_grad() def update_lp_params(self): - for i, (bf16_partitions, fp32_partition) in enumerate(zip(self.bf16_partitioned_groups, self.fp32_groups_flat_partition)): + for i, (bf16_partitions, + fp32_partition) in enumerate(zip(self.bf16_partitioned_groups, self.fp32_groups_flat_partition)): partition_id = dist.get_rank(group=self.real_dp_process_group[i]) bf16_partitions[partition_id].data.copy_(fp32_partition.data) # print_rank_0(f'update_lp_params {i=} {partition_id=}', force=True) @@ -395,18 +368,11 @@ class BF16_Optimizer(ZeROOptimizer): load_optimizer_states=True, load_from_fp32_weights=False): if checkpoint_folder: - self._load_universal_checkpoint(checkpoint_folder, - load_optimizer_states, - load_from_fp32_weights) + self._load_universal_checkpoint(checkpoint_folder, load_optimizer_states, load_from_fp32_weights) else: - self._load_legacy_checkpoint(state_dict_list, - load_optimizer_states, - load_from_fp32_weights) + self._load_legacy_checkpoint(state_dict_list, load_optimizer_states, load_from_fp32_weights) - def _load_legacy_checkpoint(self, - state_dict_list, - load_optimizer_states=True, - load_from_fp32_weights=False): + def _load_legacy_checkpoint(self, state_dict_list, load_optimizer_states=True, load_from_fp32_weights=False): dp_rank = dist.get_rank(group=self.dp_process_group) current_rank_sd = state_dict_list[dp_rank] @@ -421,17 +387,15 @@ class BF16_Optimizer(ZeROOptimizer): self.optimizer.load_state_dict(current_rank_sd[BASE_OPTIMIZER_STATE]) if load_from_fp32_weights: - for current, saved in zip(self.fp32_groups_flat_partition, current_rank_sd[SINGLE_PARTITION_OF_FP32_GROUPS]): + for current, saved in zip(self.fp32_groups_flat_partition, + current_rank_sd[SINGLE_PARTITION_OF_FP32_GROUPS]): src_tensor = _get_padded_tensor(saved, current.numel()) current.data.copy_(src_tensor.data) if load_optimizer_states: self._link_all_hp_params() - def _load_universal_checkpoint(self, - checkpoint_folder, - 
load_optimizer_states, - load_from_fp32_weights): + def _load_universal_checkpoint(self, checkpoint_folder, load_optimizer_states, load_from_fp32_weights): self._load_hp_checkpoint_state(checkpoint_folder) @property @@ -448,11 +412,8 @@ class BF16_Optimizer(ZeROOptimizer): for lp in self.bf16_groups[i]: if lp._hp_mapping is not None: #print(f"Loading {self.param_names[lp]} {tp_rank=} {tp_world_size=}") - lp.load_hp_checkpoint_state( - os.path.join(checkpoint_dir, - self.param_names[lp]), - tp_rank, - tp_world_size) + lp.load_hp_checkpoint_state(os.path.join(checkpoint_dir, self.param_names[lp]), tp_rank, + tp_world_size) def _get_padded_tensor(src_tensor, size): diff --git a/deepspeed/runtime/checkpoint_engine/__init__.py b/deepspeed/runtime/checkpoint_engine/__init__.py index fcb45ab2b68516814a4bfbffebf2e01cbfefd527..6c5067f71c8faf166bc78e88f9b62e8627dda7c7 100644 --- a/deepspeed/runtime/checkpoint_engine/__init__.py +++ b/deepspeed/runtime/checkpoint_engine/__init__.py @@ -1 +1,5 @@ +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team '''Copyright The Microsoft DeepSpeed Team''' diff --git a/deepspeed/runtime/checkpoint_engine/checkpoint_engine.py b/deepspeed/runtime/checkpoint_engine/checkpoint_engine.py index cb6decb8826eb22574971100d2ca015f504a90dc..3f8978df031678d10414d0d6bfd9028e3904aad6 100644 --- a/deepspeed/runtime/checkpoint_engine/checkpoint_engine.py +++ b/deepspeed/runtime/checkpoint_engine/checkpoint_engine.py @@ -1,4 +1,9 @@ -'''Copyright The Microsoft DeepSpeed Team''' +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +import os class CheckpointEngine(object): @@ -11,6 +16,9 @@ class CheckpointEngine(object): # create checkpoint on give tag for save/load. pass + def makedirs(self, path, exist_ok=False): + os.makedirs(path, exist_ok=exist_ok) + def save(self, state_dict, path: str): pass diff --git a/deepspeed/runtime/checkpoint_engine/nebula_checkpoint_engine.py b/deepspeed/runtime/checkpoint_engine/nebula_checkpoint_engine.py index 42bb067d087dfa863d9a904b6e62a0dfe32257dc..4842d73d8a6611ba3942017e16d97189261d6f80 100644 --- a/deepspeed/runtime/checkpoint_engine/nebula_checkpoint_engine.py +++ b/deepspeed/runtime/checkpoint_engine/nebula_checkpoint_engine.py @@ -1,4 +1,7 @@ -'''Copyright The Microsoft DeepSpeed Team''' +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team import os import torch @@ -15,6 +18,7 @@ def _get_tag_from_path(path): class NebulaCheckpointEngine(CheckpointEngine): + def __init__(self, config_params=None): super().__init__(config_params) self.checkpoint = None @@ -27,8 +31,7 @@ class NebulaCheckpointEngine(CheckpointEngine): nebula_config_params = { NEBULA_PERSISTENT_STORAGE_PATH: config_params.persistent_storage_path, NEBULA_PERSISTENT_TIME_INTERVAL: config_params.persistent_time_interval, - NEBULA_NUM_OF_VERSION_IN_RETENTION: - config_params.num_of_version_in_retention, + NEBULA_NUM_OF_VERSION_IN_RETENTION: config_params.num_of_version_in_retention, } torch_nebula.init(**nebula_config_params) @@ -54,16 +57,13 @@ class NebulaCheckpointEngine(CheckpointEngine): first_load_flag = self.tag_flag is None or self.tag_flag == tag if not self.enable_nebula_load and first_load_flag: self.tag_flag = tag - logger.info( - f"[Nebula] Disable nebula load. Loading checkpoint from {path} ...") + logger.info(f"[Nebula] Disable nebula load. 
Loading checkpoint from {path} ...") partition = torch.load(path, map_location=map_location) logger.info(f"[Nebula] Disable nebula load. Loaded checkpoint from {path} .") return partition partititon_name = os.path.basename(path) - logger.info( - f"[Nebula] Loading {path} under tag {tag} from nebula path {self.nebula_load_path}..." - ) + logger.info(f"[Nebula] Loading {path} under tag {tag} from nebula path {self.nebula_load_path}...") checkpoint = None if tag in (None, 'latest', 'latest_universal'): @@ -72,44 +72,36 @@ class NebulaCheckpointEngine(CheckpointEngine): # will try to load the valid latest checkpoint from nebula(tier3 > tier1). So, in summary # when met failure loading for given tag, the loading priority would be like: # nebula tier3 latest > nebula tier1 latest. - checkpoint = torch_nebula.get_latest_checkpoint( - persist_path=self.nebula_load_path) + checkpoint = torch_nebula.get_latest_checkpoint(persist_path=self.nebula_load_path) else: - checkpoint = torch_nebula.get_checkpoint(tag=tag, - persist_path=self.nebula_load_path) + checkpoint = torch_nebula.get_checkpoint(tag=tag, persist_path=self.nebula_load_path) if checkpoint is None or (checkpoint is not None and checkpoint.tag == ''): logger.info( f"Unable to find valid checkpoint tag:{tag} from Nebula, try to get latest checkpoint again from nebula {self.nebula_load_path} path!" ) # nebula tier3 latest - checkpoint = torch_nebula.get_latest_checkpoint( - persist_path=self.nebula_load_path) + checkpoint = torch_nebula.get_latest_checkpoint(persist_path=self.nebula_load_path) if checkpoint is None or (checkpoint is not None and checkpoint.tag == ''): logger.info( f"Unable to find latest checkpoint from Nebula tier3, try to get latest checkpoint again from nebula tier1 path!" ) # nebula tier1 latest checkpoint = torch_nebula.get_latest_checkpoint() - logger.warning( - f"Unable to find valid checkpoint from Nebula under tag:{tag}.") + logger.warning(f"Unable to find valid checkpoint from Nebula under tag:{tag}.") return None tag = checkpoint.tag self.tag_flag = -1 partition = checkpoint.load(partititon_name, map_location=map_location) - logger.info( - f"[Nebula] Loaded {path} under tag {tag} from {self.nebula_load_path}.") + logger.info(f"[Nebula] Loaded {path} under tag {tag} from {self.nebula_load_path}.") return partition def commit(self, tag): # nebula commit will be call when all files under give tag are ready to be persisted in the async way. - logger.info( - f"[Nebula] all files for {tag} are saved in tier1. It is ready to start persisting" - ) + logger.info(f"[Nebula] all files for {tag} are saved in tier1. It is ready to start persisting") commit_rls = self.checkpoint.commit() if not commit_rls: - logger.error( - f"[Nebula] failed to commit the checkpoint, please check the log.") + logger.error(f"[Nebula] failed to commit the checkpoint, please check the log.") return False return commit_rls diff --git a/deepspeed/runtime/checkpoint_engine/torch_checkpoint_engine.py b/deepspeed/runtime/checkpoint_engine/torch_checkpoint_engine.py index 5ee131800defea9bba99f7bdd7c499d838ee54d6..5cd44864bb2ea2fc7be3e903fa61044ed4a8e5ae 100644 --- a/deepspeed/runtime/checkpoint_engine/torch_checkpoint_engine.py +++ b/deepspeed/runtime/checkpoint_engine/torch_checkpoint_engine.py @@ -1,4 +1,7 @@ -'''Copyright The Microsoft DeepSpeed Team''' +# Copyright (c) Microsoft Corporation. 
+# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team import torch from deepspeed.utils import logger, log_dist @@ -7,11 +10,12 @@ from deepspeed.runtime.checkpoint_engine.checkpoint_engine import \ class TorchCheckpointEngine(CheckpointEngine): + def __init__(self, config_params=None): super().__init__(config_params) def create(self, tag): - log_dist(f"[Torch] Checkpoint {tag} is begin to save!", ranks=[0]) + log_dist(f"[Torch] Checkpoint {tag} is about to be saved!", ranks=[0]) def save(self, state_dict, path: str): logger.info(f"[Torch] Saving {path}...") diff --git a/deepspeed/runtime/comm/__init__.py b/deepspeed/runtime/comm/__init__.py index fcb45ab2b68516814a4bfbffebf2e01cbfefd527..6c5067f71c8faf166bc78e88f9b62e8627dda7c7 100644 --- a/deepspeed/runtime/comm/__init__.py +++ b/deepspeed/runtime/comm/__init__.py @@ -1 +1,5 @@ +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team '''Copyright The Microsoft DeepSpeed Team''' diff --git a/deepspeed/runtime/comm/coalesced_collectives.py b/deepspeed/runtime/comm/coalesced_collectives.py index 07302b1b7ae217e294dd5bb71221d0ba99bfac2f..36f0cb80781f568c47d492ffac1c4a9cd9d53d53 100644 --- a/deepspeed/runtime/comm/coalesced_collectives.py +++ b/deepspeed/runtime/comm/coalesced_collectives.py @@ -1,6 +1,11 @@ -'''Copyright The Microsoft DeepSpeed Team''' -"""batched collective operations for overhead amortization and better -bandwidth utilization""" +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team +""" +batched collective operations for overhead amortization and better +bandwidth utilization +""" import math from typing import List @@ -15,15 +20,8 @@ import torch.nn.functional from deepspeed.utils import instrument_w_nvtx -def _torch_reduce_scatter_fn(input_tensor: Tensor, - output_tensor: Tensor, - group=None, - async_op=False, - prof=False): - return instrument_w_nvtx(dist.reduce_scatter_fn)(output_tensor, - input_tensor, - group=group, - async_op=async_op) +def _torch_reduce_scatter_fn(input_tensor: Tensor, output_tensor: Tensor, group=None, async_op=False, prof=False): + return instrument_w_nvtx(dist.reduce_scatter_fn)(output_tensor, input_tensor, group=group, async_op=async_op) @instrument_w_nvtx @@ -45,13 +43,10 @@ def reduce_scatter_coalesced( flattened_tensor = tensor.view(-1) chunk_sz = math.ceil(tensor.numel() / world_sz) partition_lst_for_each_tensor[tensor_idx] = [ - flattened_tensor[rank * chunk_sz:rank * chunk_sz + chunk_sz] - for rank in range(0, - world_sz) + flattened_tensor[rank * chunk_sz:rank * chunk_sz + chunk_sz] for rank in range(0, world_sz) ] - padded_partition_sz_for_each_tensor = tuple( - math.ceil(t.numel() / world_sz) for t in tensors) + padded_partition_sz_for_each_tensor = tuple(math.ceil(t.numel() / world_sz) for t in tensors) if len(tensors) == 1 and tensors[0].numel() % world_sz == 0: # if there's only one tensor being reduced and we don't need to pad @@ -68,21 +63,15 @@ def reduce_scatter_coalesced( tensor_partitions_lst_with_padding.append(tensor_chunk) # add padding if necessary - padding_sz = padded_partition_sz_for_each_tensor[ - tensor_idx] - tensor_chunk.numel() + padding_sz = padded_partition_sz_for_each_tensor[tensor_idx] - tensor_chunk.numel() if padding_sz > 0: tensor_partitions_lst_with_padding.append( - torch.empty(padding_sz, - dtype=tensor_chunk.dtype, - device=tensor_chunk.device)) + torch.empty(padding_sz, dtype=tensor_chunk.dtype, device=tensor_chunk.device)) - tensor_partition_flat_buffer = 
instrument_w_nvtx( - torch.cat)(tensor_partitions_lst_with_padding) + tensor_partition_flat_buffer = instrument_w_nvtx(torch.cat)(tensor_partitions_lst_with_padding) tensor_partition_flat_buffer.div_(world_sz) # pre-divide - tensor_partition_buffer_for_each_rank: List[Tensor] = torch.chunk( - tensor_partition_flat_buffer, - world_sz) + tensor_partition_buffer_for_each_rank: List[Tensor] = torch.chunk(tensor_partition_flat_buffer, world_sz) # batched reduce-scatter call _torch_reduce_scatter_fn(tensor_partition_flat_buffer, @@ -95,9 +84,7 @@ def reduce_scatter_coalesced( offset = 0 for tensor_idx in range(len(tensors)): output_lst[tensor_idx] = tensor_partition_buffer_for_each_rank[this_rank].narrow( - 0, - offset, - partition_lst_for_each_tensor[tensor_idx][this_rank].numel()) + 0, offset, partition_lst_for_each_tensor[tensor_idx][this_rank].numel()) offset += padded_partition_sz_for_each_tensor[tensor_idx] diff --git a/deepspeed/runtime/comm/mpi.py b/deepspeed/runtime/comm/mpi.py index 9e112bccc71d91abfed71ba30bca16fc8976971f..c2598f1e59867ce720e45a19969cec5bc500ffa7 100644 --- a/deepspeed/runtime/comm/mpi.py +++ b/deepspeed/runtime/comm/mpi.py @@ -1,6 +1,7 @@ -''' -Copyright 2020 The Microsoft DeepSpeed Team -''' +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team import torch import cupy @@ -12,6 +13,7 @@ from deepspeed.runtime.compression.cupy import CupyBackend class MpiBackend(object): + def __init__(self, cuda_aware): self.comm = MPI.COMM_WORLD self.rank = self.comm.Get_rank() @@ -31,49 +33,26 @@ class MpiBackend(object): req.append(comm.Isend(sendbuf, dest=root)) return req - def gather_cuda(self, - rank, - world_size, - comm, - cupy_sign_list_packed, - cupy_recvbuf_sign, - cupy_worker_scale, + def gather_cuda(self, rank, world_size, comm, cupy_sign_list_packed, cupy_recvbuf_sign, cupy_worker_scale, cupy_recvbuf_scale): # We do in-place operations on cupy buffers so we do not return any buffers requests = [] for idx in range(world_size): - req_sign = self.my_igather(rank, - world_size, - comm, - cupy_sign_list_packed[idx], - cupy_recvbuf_sign, - root=idx) + req_sign = self.my_igather(rank, world_size, comm, cupy_sign_list_packed[idx], cupy_recvbuf_sign, root=idx) requests += req_sign for idx in range(world_size): - req_scale = self.my_igather(rank, - world_size, - comm, - cupy_worker_scale, - cupy_recvbuf_scale, - root=idx) + req_scale = self.my_igather(rank, world_size, comm, cupy_worker_scale, cupy_recvbuf_scale, root=idx) requests += req_scale MPI.Request.Waitall(requests) - def gather_host(self, - rank, - world_size, - comm, - cupy_sign_list_packed, - cupy_recvbuf_sign, - cupy_worker_scale, + def gather_host(self, rank, world_size, comm, cupy_sign_list_packed, cupy_recvbuf_sign, cupy_worker_scale, cupy_recvbuf_scale): # In-place operations are not possible for newly created cupy arrays # so we need to return the new buffers - numpy_recvbuf_sign = np.zeros([world_size, - cupy_sign_list_packed[rank].size], + numpy_recvbuf_sign = np.zeros([world_size, cupy_sign_list_packed[rank].size], dtype=cupy_sign_list_packed[0].dtype) numpy_recvbuf_scale = np.zeros([world_size, 1], dtype=cupy_worker_scale.dtype) @@ -101,12 +80,7 @@ class MpiBackend(object): requests += req_sign for idx in range(world_size): - req_scale = self.my_igather(rank, - world_size, - comm, - numpy_worker_scale, - numpy_recvbuf_scale, - root=idx) + req_scale = self.my_igather(rank, world_size, comm, numpy_worker_scale, numpy_recvbuf_scale, root=idx) requests += req_scale 
MPI.Request.Waitall(requests) @@ -122,30 +96,18 @@ class MpiBackend(object): return cupy_sign_list_packed, cupy_recvbuf_sign, cupy_worker_scale, cupy_recvbuf_scale - def allgather_cuda(self, - comm, - cupy_server_sign_packed, - cupy_recvbuf_sign_server, - cupy_server_scale, + def allgather_cuda(self, comm, cupy_server_sign_packed, cupy_recvbuf_sign_server, cupy_server_scale, cupy_recvbuf_scale_server): comm.Allgather(cupy_server_sign_packed, cupy_recvbuf_sign_server) comm.Allgather(cupy_server_scale, cupy_recvbuf_scale_server) - def allgather_host(self, - comm, - cupy_server_sign_packed, - cupy_recvbuf_sign_server, - cupy_server_scale, + def allgather_host(self, comm, cupy_server_sign_packed, cupy_recvbuf_sign_server, cupy_server_scale, cupy_recvbuf_scale_server): # 1. Convert cupy to numpy - numpy_recvbuf_sign_server = np.zeros( - [comm.Get_size(), - cupy_server_sign_packed.size], - dtype=cupy_server_sign_packed.dtype) - numpy_recvbuf_scale_server = np.zeros([comm.Get_size(), - 1], - dtype=cupy_server_scale.dtype) + numpy_recvbuf_sign_server = np.zeros([comm.Get_size(), cupy_server_sign_packed.size], + dtype=cupy_server_sign_packed.dtype) + numpy_recvbuf_scale_server = np.zeros([comm.Get_size(), 1], dtype=cupy_server_scale.dtype) numpy_server_sign_packed = cupy.asnumpy(cupy_server_sign_packed) numpy_recvbuf_sign_server = cupy.asnumpy(cupy_recvbuf_sign_server) @@ -167,11 +129,7 @@ class MpiBackend(object): return cupy_server_sign_packed, cupy_recvbuf_sign_server, cupy_server_scale, cupy_recvbuf_scale_server - def compressed_allreduce(self, - buffer_m: torch.tensor, - worker_error, - server_error, - local_rank): + def compressed_allreduce(self, buffer_m: torch.tensor, worker_error, server_error, local_rank): all_start_time = time.time() original_shape = buffer_m.size() @@ -182,104 +140,71 @@ class MpiBackend(object): cupy.cuda.Device(local_rank).use() if original_size != worker_error_size: - empty_tensor = torch.zeros(worker_error_size - original_size, - device=buffer_m.device) + empty_tensor = torch.zeros(worker_error_size - original_size, device=buffer_m.device) buffer_m = torch.cat([buffer_m, empty_tensor]) buffer_m.add_(worker_error) worker_scale = torch.norm(buffer_m) / np.sqrt(torch.numel(buffer_m)) - worker_error.set_(buffer_m - worker_scale * - buffer_m.sign().add_(1).bool().float().add_(-0.5).mul_(2.0)) + worker_error.set_(buffer_m - worker_scale * buffer_m.sign().add_(1).bool().float().add_(-0.5).mul_(2.0)) cupy_sign_list_packed = self.compression_backend.compress_by_chunk( - self.compression_backend.torch2cupy(buffer_m.sign_().add_(1).bool()), - self.size) + self.compression_backend.torch2cupy(buffer_m.sign_().add_(1).bool()), self.size) cupy_worker_scale = self.compression_backend.torch2cupy(worker_scale) - cupy_recvbuf_sign = cupy.zeros( - [self.size, - cupy_sign_list_packed[self.rank].size], - dtype=cupy_sign_list_packed[0].dtype) + cupy_recvbuf_sign = cupy.zeros([self.size, cupy_sign_list_packed[self.rank].size], + dtype=cupy_sign_list_packed[0].dtype) cupy_recvbuf_scale = cupy.zeros([self.size, 1], dtype=cupy_worker_scale.dtype) # Communication Phase 1 gather_start = time.time() if self.cuda_aware: - self.gather_cuda(self.rank, - self.size, - self.comm, - cupy_sign_list_packed, - cupy_recvbuf_sign, - cupy_worker_scale, - cupy_recvbuf_scale) + self.gather_cuda(self.rank, self.size, self.comm, cupy_sign_list_packed, cupy_recvbuf_sign, + cupy_worker_scale, cupy_recvbuf_scale) else: - _, cupy_recvbuf_sign, _, cupy_recvbuf_scale = self.gather_host(self.rank, - self.size, - 
self.comm, - cupy_sign_list_packed, - cupy_recvbuf_sign, - cupy_worker_scale, - cupy_recvbuf_scale) + _, cupy_recvbuf_sign, _, cupy_recvbuf_scale = self.gather_host(self.rank, self.size, self.comm, + cupy_sign_list_packed, cupy_recvbuf_sign, + cupy_worker_scale, cupy_recvbuf_scale) gather_end = time.time() # cupy_sign_list_packed, cupy_worker_scale, worker_scale = None, None, None cupy_sign_list_packed = None compensated_server_m = self.compression_backend.cupy2torch( - (cupy.unpackbits(cupy_recvbuf_sign.flatten())).reshape( - self.size, - -1)).float().add_(-0.5).mul_(2.0).mul_( - self.compression_backend.cupy2torch(cupy_recvbuf_scale).mul_( - 1 / self.size)).sum(0) + (cupy.unpackbits(cupy_recvbuf_sign.flatten())).reshape(self.size, -1)).float().add_(-0.5).mul_(2.0).mul_( + self.compression_backend.cupy2torch(cupy_recvbuf_scale).mul_(1 / self.size)).sum(0) compensated_server_m.add_(server_error) - server_scale = torch.norm(compensated_server_m) / np.sqrt( - compensated_server_m.numel()) - server_error.set_( - compensated_server_m - server_scale * - compensated_server_m.sign().add_(1).bool().float().add_(-0.5).mul_(2.0)) + server_scale = torch.norm(compensated_server_m) / np.sqrt(compensated_server_m.numel()) + server_error.set_(compensated_server_m - + server_scale * compensated_server_m.sign().add_(1).bool().float().add_(-0.5).mul_(2.0)) cupy_server_scale = self.compression_backend.torch2cupy(server_scale) cupy_server_sign_packed = self.compression_backend.compress_by_chunk( - self.compression_backend.torch2cupy( - compensated_server_m.sign_().add_(1).bool()), - 1) + self.compression_backend.torch2cupy(compensated_server_m.sign_().add_(1).bool()), 1) compensated_server_m = None - cupy_recvbuf_sign_server = cupy.zeros( - [self.size, - cupy_server_sign_packed[0].size], - dtype=cupy_recvbuf_sign.dtype) - cupy_recvbuf_scale_server = cupy.zeros([self.size, - 1], - dtype=cupy_recvbuf_scale.dtype) + cupy_recvbuf_sign_server = cupy.zeros([self.size, cupy_server_sign_packed[0].size], + dtype=cupy_recvbuf_sign.dtype) + cupy_recvbuf_scale_server = cupy.zeros([self.size, 1], dtype=cupy_recvbuf_scale.dtype) # cupy_recvbuf_sign, cupy_recvbuf_scale = None, None cupy_recvbuf_sign = None # Communication Phase 2 if self.cuda_aware: - self.allgather_cuda(self.comm, - cupy_server_sign_packed[0], - cupy_recvbuf_sign_server, - cupy_server_scale, + self.allgather_cuda(self.comm, cupy_server_sign_packed[0], cupy_recvbuf_sign_server, cupy_server_scale, cupy_recvbuf_scale_server) else: - _, cupy_recvbuf_sign_server, _, cupy_recvbuf_scale_server = self.allgather_host(self.comm, - cupy_server_sign_packed[0], - cupy_recvbuf_sign_server, - cupy_server_scale, - cupy_recvbuf_scale_server) + _, cupy_recvbuf_sign_server, _, cupy_recvbuf_scale_server = self.allgather_host( + self.comm, cupy_server_sign_packed[0], cupy_recvbuf_sign_server, cupy_server_scale, + cupy_recvbuf_scale_server) # cupy_server_sign_packed, cupy_server_scale, server_scale = None, None, None cupy_server_sign_packed = None buffer_m.data.copy_( - self.compression_backend.cupy2torch( - (cupy.unpackbits(cupy_recvbuf_sign_server.flatten())).reshape( - self.size, - -1)).float().add_(-0.5).mul_(2.0).mul_( - self.compression_backend.cupy2torch( - cupy_recvbuf_scale_server)).flatten().data) + self.compression_backend.cupy2torch((cupy.unpackbits(cupy_recvbuf_sign_server.flatten())).reshape( + self.size, -1)).float().add_(-0.5).mul_(2.0).mul_( + self.compression_backend.cupy2torch(cupy_recvbuf_scale_server)).flatten().data) if original_size != worker_error_size: 
buffer_m = buffer_m[0:original_size] if len(original_shape) > 1: diff --git a/deepspeed/runtime/comm/nccl.py b/deepspeed/runtime/comm/nccl.py index fd99068dff82ba114e7c9034d6583b95f55614c2..0bd0d13619732ae0d96681890fcda35100904ba0 100644 --- a/deepspeed/runtime/comm/nccl.py +++ b/deepspeed/runtime/comm/nccl.py @@ -1,6 +1,7 @@ -''' -Copyright 2020 The Microsoft DeepSpeed Team -''' +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team import torch from deepspeed import comm as dist @@ -12,6 +13,7 @@ from deepspeed.accelerator import get_accelerator class NcclBackend(object): + def __init__(self, mpu=None): if mpu is None: self.world_group = dist.new_group(ranks=range(dist.get_world_size())) @@ -24,7 +26,7 @@ class NcclBackend(object): self.bool_not_supported = False TORCH_MAJOR = int(torch.__version__.split('.')[0]) TORCH_MINOR = int(torch.__version__.split('.')[1]) - if TORCH_MAJOR >= 1 and TORCH_MINOR >= 10: + if (TORCH_MAJOR == 1 and TORCH_MINOR >= 10) or TORCH_MAJOR == 2: self.bool_not_supported = True def my_igather(self, rank, size, group, sendbuf, recvbuf, root): @@ -49,11 +51,7 @@ class NcclBackend(object): else: dist.send(sendbuf, group=group, dst=root) - def compressed_allreduce(self, - buffer_m: torch.tensor, - worker_error, - server_error, - local_rank): + def compressed_allreduce(self, buffer_m: torch.tensor, worker_error, server_error, local_rank): # all_start_time = time.time() original_shape = buffer_m.size() @@ -64,53 +62,41 @@ class NcclBackend(object): cupy.cuda.Device(local_rank).use() if original_size != worker_error_size: - empty_tensor = torch.zeros(worker_error_size - original_size, - device=buffer_m.device) + empty_tensor = torch.zeros(worker_error_size - original_size, device=buffer_m.device) buffer_m = torch.cat([buffer_m, empty_tensor]) buffer_m.add_(worker_error) worker_scale = torch.norm(buffer_m) / np.sqrt(buffer_m.numel()) - worker_error.set_(buffer_m - worker_scale * - buffer_m.sign().add_(1).bool().float().add_(-0.5).mul_(2.0)) + worker_error.set_(buffer_m - worker_scale * buffer_m.sign().add_(1).bool().float().add_(-0.5).mul_(2.0)) if self.bool_not_supported: cupy_sign_list_packed = self.compression_backend.compress_by_chunk( - self.compression_backend.torch2cupy( - buffer_m.sign_().add_(1).bool().to(dtype=torch.uint8)), - self.size) + self.compression_backend.torch2cupy(buffer_m.sign_().add_(1).bool().to(dtype=torch.uint8)), self.size) else: cupy_sign_list_packed = self.compression_backend.compress_by_chunk( - self.compression_backend.torch2cupy(buffer_m.sign_().add_(1).bool()), - self.size) + self.compression_backend.torch2cupy(buffer_m.sign_().add_(1).bool()), self.size) cupy_worker_scale = self.compression_backend.torch2cupy(worker_scale) - cupy_recvbuf_sign = cupy.zeros( - [self.size, - cupy_sign_list_packed[self.rank].size], - dtype=cupy_sign_list_packed[0].dtype) + cupy_recvbuf_sign = cupy.zeros([self.size, cupy_sign_list_packed[self.rank].size], + dtype=cupy_sign_list_packed[0].dtype) # cupy_recvbuf_scale = cupy.zeros([self.size, 1], dtype=cupy_worker_scale.dtype) sign_list_packed = [ - self.compression_backend.cupy2torch(cupy_sign_list_packed[idx]) - for idx in range(self.size) + self.compression_backend.cupy2torch(cupy_sign_list_packed[idx]) for idx in range(self.size) ] # worker_scale = self.compression_backend.cupy2torch(cupy_worker_scale) recvbuf_sign = self.compression_backend.cupy2torch(cupy_recvbuf_sign) #recvbuf_scale = self.compression_backend.cupy2torch(cupy_recvbuf_scale) recvbuf_scale = [ - 
torch.zeros(1, - dtype=worker_scale.dtype, - device=torch.device(get_accelerator().device_name(local_rank))) + torch.zeros(1, dtype=worker_scale.dtype, device=torch.device(get_accelerator().device_name(local_rank))) for i in range(self.size) ] # communication phase 1 # gather_start = time.time() # Alltoall for sign - dist.all_to_all_single(recvbuf_sign, - torch.stack(sign_list_packed), - group=self.world_group) + dist.all_to_all_single(recvbuf_sign, torch.stack(sign_list_packed), group=self.world_group) # Allgather for scale dist.all_gather(recvbuf_scale, worker_scale, group=self.world_group) @@ -123,61 +109,44 @@ class NcclBackend(object): #cupy_recvbuf_scale = self.compression_backend.torch2cupy(torch.stack(recvbuf_scale)) compensated_server_m = self.compression_backend.cupy2torch( - (cupy.unpackbits(cupy_recvbuf_sign.flatten())).reshape( - self.size, - -1)).float().add_(-0.5).mul_(2.0).mul_( - torch.stack(recvbuf_scale).mul_(1 / self.size)).sum(0) + (cupy.unpackbits(cupy_recvbuf_sign.flatten())).reshape(self.size, -1)).float().add_(-0.5).mul_(2.0).mul_( + torch.stack(recvbuf_scale).mul_(1 / self.size)).sum(0) compensated_server_m.add_(server_error) - server_scale = torch.norm(compensated_server_m) / np.sqrt( - compensated_server_m.numel()) - server_error.set_( - compensated_server_m - server_scale * - compensated_server_m.sign().add_(1).bool().float().add_(-0.5).mul_(2.0)) + server_scale = torch.norm(compensated_server_m) / np.sqrt(compensated_server_m.numel()) + server_error.set_(compensated_server_m - + server_scale * compensated_server_m.sign().add_(1).bool().float().add_(-0.5).mul_(2.0)) # cupy_server_scale = self.compression_backend.torch2cupy(server_scale) if self.bool_not_supported: cupy_server_sign_packed = self.compression_backend.compress_by_chunk( - self.compression_backend.torch2cupy( - compensated_server_m.sign_().add_(1).bool().to(dtype=torch.uint8)), + self.compression_backend.torch2cupy(compensated_server_m.sign_().add_(1).bool().to(dtype=torch.uint8)), 1) else: cupy_server_sign_packed = self.compression_backend.compress_by_chunk( - self.compression_backend.torch2cupy( - compensated_server_m.sign_().add_(1).bool()), - 1) + self.compression_backend.torch2cupy(compensated_server_m.sign_().add_(1).bool()), 1) compensated_server_m = None - cupy_recvbuf_sign_server = cupy.zeros( - [self.size, - cupy_server_sign_packed[0].size], - dtype=cupy_recvbuf_sign.dtype) + cupy_recvbuf_sign_server = cupy.zeros([self.size, cupy_server_sign_packed[0].size], + dtype=cupy_recvbuf_sign.dtype) # cupy_recvbuf_sign, recvbuf_sign = None, None cupy_recvbuf_sign = None - server_sign_packed = [ - self.compression_backend.cupy2torch(cupy_server_sign_packed[0]) - ] + server_sign_packed = [self.compression_backend.cupy2torch(cupy_server_sign_packed[0])] recvbuf_sign_server = [ - self.compression_backend.cupy2torch(cupy_recvbuf_sign_server[idx]) - for idx in range(self.size) + self.compression_backend.cupy2torch(cupy_recvbuf_sign_server[idx]) for idx in range(self.size) ] # server_scale = self.compression_backend.cupy2torch(cupy_server_scale) - cupy_recvbuf_scale_server = cupy.zeros([self.size, - 1], - dtype=cupy_worker_scale.dtype) + cupy_recvbuf_scale_server = cupy.zeros([self.size, 1], dtype=cupy_worker_scale.dtype) # cupy_recvbuf_scale, recvbuf_scale = None, None recvbuf_scale_server = [ - self.compression_backend.cupy2torch(cupy_recvbuf_scale_server[idx]) - for idx in range(self.size) + self.compression_backend.cupy2torch(cupy_recvbuf_scale_server[idx]) for idx in range(self.size) ] # 
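# A minimal, self-contained sketch of the 1-bit compression math used in compressed_allreduce
# above, using plain torch in place of the cupy bit-packing; the function and variable names
# below are illustrative only, not part of the patch.
import torch

def one_bit_compress(buffer_m: torch.Tensor, worker_error: torch.Tensor):
    buffer_m = buffer_m + worker_error                  # error feedback from the previous round
    scale = buffer_m.norm() / buffer_m.numel() ** 0.5   # one scalar scale per worker
    sign_bits = buffer_m.sign().add_(1).bool()          # {0, 1} stand-ins for the packed sign bits
    # Reconstruction maps {0, 1} -> {-1, +1}, exactly the .float().add_(-0.5).mul_(2.0) chain above.
    reconstructed = sign_bits.float().add_(-0.5).mul_(2.0) * scale
    new_error = buffer_m - reconstructed                # what one bit per element could not capture
    return sign_bits, scale, new_error

bits, scale, err = one_bit_compress(torch.randn(16), torch.zeros(16))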
Communication Phase 2 - dist.all_gather(recvbuf_sign_server, - server_sign_packed[0], - group=self.world_group) + dist.all_gather(recvbuf_sign_server, server_sign_packed[0], group=self.world_group) dist.all_gather(recvbuf_scale_server, server_scale, group=self.world_group) cupy_server_sign_packed = None @@ -186,16 +155,12 @@ class NcclBackend(object): # dist.all_gather only provides a tensor list as the recv/output buffer recvbuf_sign_server = torch.stack(recvbuf_sign_server) - cupy_recvbuf_sign_server = self.compression_backend.torch2cupy( - recvbuf_sign_server) + cupy_recvbuf_sign_server = self.compression_backend.torch2cupy(recvbuf_sign_server) buffer_m.data.copy_( - self.compression_backend.cupy2torch( - (cupy.unpackbits(cupy_recvbuf_sign_server.flatten())).reshape( - self.size, - -1)).float().add_(-0.5).mul_(2.0).mul_( - self.compression_backend.cupy2torch( - cupy_recvbuf_scale_server)).flatten().data) + self.compression_backend.cupy2torch((cupy.unpackbits(cupy_recvbuf_sign_server.flatten())).reshape( + self.size, -1)).float().add_(-0.5).mul_(2.0).mul_( + self.compression_backend.cupy2torch(cupy_recvbuf_scale_server)).flatten().data) if original_size != worker_error_size: buffer_m = buffer_m[0:original_size] if len(original_shape) > 1: diff --git a/deepspeed/runtime/compression/__init__.py b/deepspeed/runtime/compression/__init__.py index fcb45ab2b68516814a4bfbffebf2e01cbfefd527..6c5067f71c8faf166bc78e88f9b62e8627dda7c7 100644 --- a/deepspeed/runtime/compression/__init__.py +++ b/deepspeed/runtime/compression/__init__.py @@ -1 +1,5 @@ +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team '''Copyright The Microsoft DeepSpeed Team''' diff --git a/deepspeed/runtime/compression/cupy.py b/deepspeed/runtime/compression/cupy.py index 68e56c68e9d0f31dfc275440ec49eecc2864f518..b959a9c2037203e3306755d11078e042e0a1b2be 100644 --- a/deepspeed/runtime/compression/cupy.py +++ b/deepspeed/runtime/compression/cupy.py @@ -1,6 +1,7 @@ -''' -Copyright 2020 The Microsoft DeepSpeed Team -''' +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team import cupy from torch.utils.dlpack import to_dlpack @@ -8,6 +9,7 @@ from torch.utils.dlpack import from_dlpack class CupyBackend(object): + def __init__(self): pass diff --git a/deepspeed/runtime/config.py b/deepspeed/runtime/config.py index 9da1058f2c8e91ea37873d378f4259cfa33780b8..dd9d77b6a158e90e682484f54209afed7a7fb620 100755 --- a/deepspeed/runtime/config.py +++ b/deepspeed/runtime/config.py @@ -1,10 +1,11 @@ -'''Copyright The Microsoft DeepSpeed Team''' -""" -Copyright (c) Microsoft Corporation -Licensed under the MIT license. -""" +# Copyright (c) Microsoft Corporation. 
+# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + import os from typing import Union +from enum import Enum import torch import json @@ -30,6 +31,7 @@ from ..comm.config import DeepSpeedCommsConfig from ..monitor.config import get_monitor_config from deepspeed import comm as dist +from deepspeed.runtime.config_utils import DeepSpeedConfigModel from ..git_version_info import version as __version__ from ..utils import logger @@ -71,12 +73,7 @@ ONEBIT_ADAM_OPTIMIZER = 'onebitadam' ZERO_ONE_ADAM_OPTIMIZER = 'zerooneadam' ONEBIT_LAMB_OPTIMIZER = 'onebitlamb' DEEPSPEED_OPTIMIZERS = [ - ADAGRAD_OPTIMIZER, - ADAM_OPTIMIZER, - ADAMW_OPTIMIZER, - LAMB_OPTIMIZER, - ONEBIT_ADAM_OPTIMIZER, - ONEBIT_LAMB_OPTIMIZER, + ADAGRAD_OPTIMIZER, ADAM_OPTIMIZER, ADAMW_OPTIMIZER, LAMB_OPTIMIZER, ONEBIT_ADAM_OPTIMIZER, ONEBIT_LAMB_OPTIMIZER, ZERO_ONE_ADAM_OPTIMIZER ] @@ -92,11 +89,36 @@ class DeepSpeedConfigError(Exception): pass +class DtypeEnum(Enum): + # The torch dtype must always be the first value (so we return torch.dtype) + fp16 = torch.float16, "torch.float16", "fp16", "float16", "half" + fp32 = torch.float32, "torch.float32", "fp32", "float32", "float" + int8 = torch.int8, "torch.int8", "int8" + bf16 = torch.bfloat16, "torch.bfloat16", "bf16", "bfloat16" + + # Copied from https://stackoverflow.com/a/43210118 + # Allows us to use multiple values for each Enum index and returns first + # listed value when Enum is called + def __new__(cls, *values): + obj = object.__new__(cls) + # first value is canonical value + obj._value_ = values[0] + for other_value in values[1:]: + cls._value2member_map_[other_value] = obj + obj._all_values = values + return obj + + def __repr__(self): + return "<%s.%s: %s>" % ( + self.__class__.__name__, + self._name_, + ", ".join([repr(v) for v in self._all_values]), + ) + + def get_pld_enabled(param_dict): if PROGRESSIVE_LAYER_DROP in param_dict.keys(): - return get_scalar_param(param_dict[PROGRESSIVE_LAYER_DROP], - PLD_ENABLED, - PLD_ENABLED_DEFAULT) + return get_scalar_param(param_dict[PROGRESSIVE_LAYER_DROP], PLD_ENABLED, PLD_ENABLED_DEFAULT) else: return False @@ -136,17 +158,13 @@ def get_fp16_enabled(param_dict): def get_bfloat16_enabled(param_dict): for key in [BFLOAT16, BFLOAT16_OLD]: if key in param_dict.keys(): - return get_scalar_param(param_dict[key], - BFLOAT16_ENABLED, - BFLOAT16_ENABLED_DEFAULT) + return get_scalar_param(param_dict[key], BFLOAT16_ENABLED, BFLOAT16_ENABLED_DEFAULT) return False def get_fp16_master_weights_and_grads_enabled(param_dict): if get_fp16_enabled(param_dict): - return get_scalar_param(param_dict[FP16], - FP16_MASTER_WEIGHTS_AND_GRADS, - FP16_MASTER_WEIGHTS_AND_GRADS_DEFAULT) + return get_scalar_param(param_dict[FP16], FP16_MASTER_WEIGHTS_AND_GRADS, FP16_MASTER_WEIGHTS_AND_GRADS_DEFAULT) else: return False @@ -158,9 +176,7 @@ def get_fp16_auto_cast(param_dict): def get_loss_scale(param_dict): if get_fp16_enabled(param_dict): - return get_scalar_param(param_dict[FP16], - FP16_LOSS_SCALE, - FP16_LOSS_SCALE_DEFAULT) + return get_scalar_param(param_dict[FP16], FP16_LOSS_SCALE, FP16_LOSS_SCALE_DEFAULT) elif get_bfloat16_enabled(param_dict): return 1.0 else: @@ -169,8 +185,7 @@ def get_loss_scale(param_dict): def get_initial_dynamic_scale(param_dict): if get_fp16_enabled(param_dict): - initial_scale_power = get_scalar_param(param_dict[FP16], - FP16_INITIAL_SCALE_POWER, + initial_scale_power = get_scalar_param(param_dict[FP16], FP16_INITIAL_SCALE_POWER, FP16_INITIAL_SCALE_POWER_DEFAULT) elif get_bfloat16_enabled(param_dict): 
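# The multi-alias Enum pattern used by DtypeEnum above, reduced to a self-contained sketch;
# the Color names here are illustrative and not part of the patch.
from enum import Enum

class Color(Enum):
    red = 1, "red", "r"
    blue = 2, "blue", "b"

    def __new__(cls, *values):
        obj = object.__new__(cls)
        obj._value_ = values[0]                           # first value is the canonical one
        for other_value in values[1:]:
            cls._value2member_map_[other_value] = obj     # extra values become lookup aliases
        obj._all_values = values
        return obj

assert Color("r") is Color("red") is Color(1)             # every alias resolves to the same member
# DtypeEnum above behaves the same way, e.g. DtypeEnum("half") is DtypeEnum(torch.float16).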
initial_scale_power = 0 @@ -191,18 +206,10 @@ def get_dynamic_loss_scale_args(param_dict): FP16_HYSTERESIS, ] if any(arg in list(fp16_dict.keys()) for arg in dynamic_loss_args): - init_scale = get_scalar_param(fp16_dict, - FP16_INITIAL_SCALE_POWER, - FP16_INITIAL_SCALE_POWER_DEFAULT) - scale_window = get_scalar_param(fp16_dict, - FP16_LOSS_SCALE_WINDOW, - FP16_LOSS_SCALE_WINDOW_DEFAULT) - delayed_shift = get_scalar_param(fp16_dict, - FP16_HYSTERESIS, - FP16_HYSTERESIS_DEFAULT) - min_loss_scale = get_scalar_param(fp16_dict, - FP16_MIN_LOSS_SCALE, - FP16_MIN_LOSS_SCALE_DEFAULT) + init_scale = get_scalar_param(fp16_dict, FP16_INITIAL_SCALE_POWER, FP16_INITIAL_SCALE_POWER_DEFAULT) + scale_window = get_scalar_param(fp16_dict, FP16_LOSS_SCALE_WINDOW, FP16_LOSS_SCALE_WINDOW_DEFAULT) + delayed_shift = get_scalar_param(fp16_dict, FP16_HYSTERESIS, FP16_HYSTERESIS_DEFAULT) + min_loss_scale = get_scalar_param(fp16_dict, FP16_MIN_LOSS_SCALE, FP16_MIN_LOSS_SCALE_DEFAULT) loss_scale_args = { INITIAL_LOSS_SCALE: 2**init_scale, SCALE_WINDOW: scale_window, @@ -214,9 +221,7 @@ def get_dynamic_loss_scale_args(param_dict): def get_gradient_accumulation_steps(param_dict): - return get_scalar_param(param_dict, - GRADIENT_ACCUMULATION_STEPS, - GRADIENT_ACCUMULATION_STEPS_DEFAULT) + return get_scalar_param(param_dict, GRADIENT_ACCUMULATION_STEPS, GRADIENT_ACCUMULATION_STEPS_DEFAULT) def get_sparse_gradients_enabled(param_dict): @@ -224,9 +229,7 @@ def get_sparse_gradients_enabled(param_dict): def get_communication_data_type(param_dict): - val = get_scalar_param(param_dict, - COMMUNICATION_DATA_TYPE, - COMMUNICATION_DATA_TYPE_DEFAULT) + val = get_scalar_param(param_dict, COMMUNICATION_DATA_TYPE, COMMUNICATION_DATA_TYPE_DEFAULT) val = val.lower() if val is not None else val if val is None: return val # we must determine it by other parameters @@ -237,9 +240,7 @@ def get_communication_data_type(param_dict): elif val == "bfp16": return torch.bfloat16 - raise ValueError( - f"Invalid communication_data_type. Supported data types: ['fp16', 'bfp16', 'fp32']. Got: {val}" - ) + raise ValueError(f"Invalid communication_data_type. Supported data types: ['fp16', 'bfp16', 'fp32']. 
Got: {val}") def get_prescale_gradients(param_dict): @@ -247,9 +248,7 @@ def get_prescale_gradients(param_dict): def get_gradient_predivide_factor(param_dict): - return get_scalar_param(param_dict, - GRADIENT_PREDIVIDE_FACTOR, - GRADIENT_PREDIVIDE_FACTOR_DEFAULT) + return get_scalar_param(param_dict, GRADIENT_PREDIVIDE_FACTOR, GRADIENT_PREDIVIDE_FACTOR_DEFAULT) def get_steps_per_print(param_dict): @@ -284,8 +283,7 @@ def get_sparse_attention(param_dict): elif mode == SPARSE_BSLONGFORMER_MODE: return get_sparse_bslongformer_config(sparsity) else: - raise NotImplementedError( - f"Given sparsity mode, {mode}, has not been implemented yet!") + raise NotImplementedError(f"Given sparsity mode, {mode}, has not been implemented yet!") else: return None @@ -303,15 +301,9 @@ def get_sparse_fixed_config(sparsity): SPARSE_DIFFERENT_LAYOUT_PER_HEAD, SPARSE_DIFFERENT_LAYOUT_PER_HEAD_DEFAULT, ) - num_local_blocks = get_scalar_param(sparsity, - SPARSE_NUM_LOCAL_BLOCKS, - SPARSE_NUM_LOCAL_BLOCKS_DEFAULT) - num_global_blocks = get_scalar_param(sparsity, - SPARSE_NUM_GLOBAL_BLOCKS, - SPARSE_NUM_GLOBAL_BLOCKS_DEFAULT) - attention = get_scalar_param(sparsity, - SPARSE_ATTENTION_TYPE, - SPARSE_ATTENTION_TYPE_DEFAULT) + num_local_blocks = get_scalar_param(sparsity, SPARSE_NUM_LOCAL_BLOCKS, SPARSE_NUM_LOCAL_BLOCKS_DEFAULT) + num_global_blocks = get_scalar_param(sparsity, SPARSE_NUM_GLOBAL_BLOCKS, SPARSE_NUM_GLOBAL_BLOCKS_DEFAULT) + attention = get_scalar_param(sparsity, SPARSE_ATTENTION_TYPE, SPARSE_ATTENTION_TYPE_DEFAULT) horizontal_global_attention = get_scalar_param( sparsity, SPARSE_HORIZONTAL_GLOBAL_ATTENTION, @@ -342,23 +334,15 @@ def get_sparse_variable_config(sparsity): SPARSE_DIFFERENT_LAYOUT_PER_HEAD, SPARSE_DIFFERENT_LAYOUT_PER_HEAD_DEFAULT, ) - num_random_blocks = get_scalar_param(sparsity, - SPARSE_NUM_RANDOM_BLOCKS, - SPARSE_NUM_RANDOM_BLOCKS_DEFAULT) - local_window_blocks = get_scalar_param(sparsity, - SPARSE_LOCAL_WINDOW_BLOCKS, - SPARSE_LOCAL_WINDOW_BLOCKS_DEFAULT) - global_block_indices = get_scalar_param(sparsity, - SPARSE_GLOBAL_BLOCK_INDICES, - SPARSE_GLOBAL_BLOCK_INDICES_DEFAULT) + num_random_blocks = get_scalar_param(sparsity, SPARSE_NUM_RANDOM_BLOCKS, SPARSE_NUM_RANDOM_BLOCKS_DEFAULT) + local_window_blocks = get_scalar_param(sparsity, SPARSE_LOCAL_WINDOW_BLOCKS, SPARSE_LOCAL_WINDOW_BLOCKS_DEFAULT) + global_block_indices = get_scalar_param(sparsity, SPARSE_GLOBAL_BLOCK_INDICES, SPARSE_GLOBAL_BLOCK_INDICES_DEFAULT) global_block_end_indices = get_scalar_param( sparsity, SPARSE_GLOBAL_BLOCK_END_INDICES, SPARSE_GLOBAL_BLOCK_END_INDICES_DEFAULT, ) - attention = get_scalar_param(sparsity, - SPARSE_ATTENTION_TYPE, - SPARSE_ATTENTION_TYPE_DEFAULT) + attention = get_scalar_param(sparsity, SPARSE_ATTENTION_TYPE, SPARSE_ATTENTION_TYPE_DEFAULT) horizontal_global_attention = get_scalar_param( sparsity, SPARSE_HORIZONTAL_GLOBAL_ATTENTION, @@ -385,17 +369,13 @@ def get_sparse_bigbird_config(sparsity): SPARSE_DIFFERENT_LAYOUT_PER_HEAD, SPARSE_DIFFERENT_LAYOUT_PER_HEAD_DEFAULT, ) - num_random_blocks = get_scalar_param(sparsity, - SPARSE_NUM_RANDOM_BLOCKS, - SPARSE_NUM_RANDOM_BLOCKS_DEFAULT) + num_random_blocks = get_scalar_param(sparsity, SPARSE_NUM_RANDOM_BLOCKS, SPARSE_NUM_RANDOM_BLOCKS_DEFAULT) num_sliding_window_blocks = get_scalar_param( sparsity, SPARSE_NUM_SLIDING_WINDOW_BLOCKS, SPARSE_NUM_SLIDING_WINDOW_BLOCKS_DEFAULT, ) - num_global_blocks = get_scalar_param(sparsity, - SPARSE_NUM_GLOBAL_BLOCKS, - SPARSE_NUM_GLOBAL_BLOCKS_DEFAULT) + num_global_blocks = get_scalar_param(sparsity, 
SPARSE_NUM_GLOBAL_BLOCKS, SPARSE_NUM_GLOBAL_BLOCKS_DEFAULT) return { SPARSE_MODE: SPARSE_BIGBIRD_MODE, @@ -419,9 +399,7 @@ def get_sparse_bslongformer_config(sparsity): SPARSE_NUM_SLIDING_WINDOW_BLOCKS, SPARSE_NUM_SLIDING_WINDOW_BLOCKS_DEFAULT, ) - global_block_indices = get_scalar_param(sparsity, - SPARSE_GLOBAL_BLOCK_INDICES, - SPARSE_GLOBAL_BLOCK_INDICES_DEFAULT) + global_block_indices = get_scalar_param(sparsity, SPARSE_GLOBAL_BLOCK_INDICES, SPARSE_GLOBAL_BLOCK_INDICES_DEFAULT) global_block_end_indices = get_scalar_param( sparsity, SPARSE_GLOBAL_BLOCK_END_INDICES, @@ -474,8 +452,7 @@ def get_optimizer_name(param_dict): def get_optimizer_params(param_dict): - if (get_optimizer_name(param_dict) is not None - and OPTIMIZER_PARAMS in param_dict[OPTIMIZER].keys()): + if (get_optimizer_name(param_dict) is not None and OPTIMIZER_PARAMS in param_dict[OPTIMIZER].keys()): return param_dict[OPTIMIZER][OPTIMIZER_PARAMS] else: return None @@ -497,9 +474,11 @@ def get_optimizer_legacy_fusion(param_dict): def get_zero_allow_untested_optimizer(param_dict): - return get_scalar_param(param_dict, - ZERO_ALLOW_UNTESTED_OPTIMIZER, - ZERO_ALLOW_UNTESTED_OPTIMIZER_DEFAULT) + return get_scalar_param(param_dict, ZERO_ALLOW_UNTESTED_OPTIMIZER, ZERO_ALLOW_UNTESTED_OPTIMIZER_DEFAULT) + + +def get_zero_force_ds_cpu_optimizer(param_dict): + return get_scalar_param(param_dict, ZERO_FORCE_DS_CPU_OPTIMIZER, ZERO_FORCE_DS_CPU_OPTIMIZER_DEFAULT) def get_scheduler_name(param_dict): @@ -510,8 +489,7 @@ def get_scheduler_name(param_dict): def get_scheduler_params(param_dict): - if (get_scheduler_name(param_dict) is not None - and SCHEDULER_PARAMS in param_dict[SCHEDULER].keys()): + if (get_scheduler_name(param_dict) is not None and SCHEDULER_PARAMS in param_dict[SCHEDULER].keys()): return param_dict[SCHEDULER][SCHEDULER_PARAMS] else: return None @@ -530,15 +508,28 @@ def get_train_micro_batch_size_per_gpu(param_dict): def get_wall_clock_breakdown(param_dict): - return get_scalar_param(param_dict, - WALL_CLOCK_BREAKDOWN, - WALL_CLOCK_BREAKDOWN_DEFAULT) + return get_scalar_param(param_dict, WALL_CLOCK_BREAKDOWN, WALL_CLOCK_BREAKDOWN_DEFAULT) def get_memory_breakdown(param_dict): return get_scalar_param(param_dict, MEMORY_BREAKDOWN, MEMORY_BREAKDOWN_DEFAULT) +class HybridEngineConfig(DeepSpeedConfigModel): + enabled: bool = False + max_out_tokens: int = 512 + inference_tp_size: int = 1 + release_inference_cache: bool = False + pin_parameters: bool = True + tp_gather_partition_size: int = 8 + + +def get_hybrid_engine_config(param_dict): + hybrid_engine_config_dict = param_dict.get("hybrid_engine", {}) + hybrid_engine_config = HybridEngineConfig(**hybrid_engine_config_dict) + return hybrid_engine_config + + def get_eigenvalue_config(param_dict): if get_quantize_enabled(param_dict): param_dict = param_dict[QUANTIZE_TRAINING] @@ -568,45 +559,35 @@ def get_eigenvalue_config(param_dict): def get_eigenvalue_enabled(param_dict): if EIGENVALUE in param_dict.keys(): - return get_scalar_param(param_dict[EIGENVALUE], - EIGENVALUE_ENABLED, - EIGENVALUE_ENABLED_DEFAULT) + return get_scalar_param(param_dict[EIGENVALUE], EIGENVALUE_ENABLED, EIGENVALUE_ENABLED_DEFAULT) else: return EIGENVALUE_ENABLED_DEFAULT def get_eigenvalue_verbose(param_dict): if EIGENVALUE in param_dict.keys(): - return get_scalar_param(param_dict[EIGENVALUE], - EIGENVALUE_VERBOSE, - EIGENVALUE_VERBOSE_DEFAULT) + return get_scalar_param(param_dict[EIGENVALUE], EIGENVALUE_VERBOSE, EIGENVALUE_VERBOSE_DEFAULT) else: return EIGENVALUE_VERBOSE_DEFAULT def 
get_eigenvalue_max_iter(param_dict): if EIGENVALUE in param_dict.keys(): - return get_scalar_param(param_dict[EIGENVALUE], - EIGENVALUE_MAX_ITER, - EIGENVALUE_MAX_ITER_DEFAULT) + return get_scalar_param(param_dict[EIGENVALUE], EIGENVALUE_MAX_ITER, EIGENVALUE_MAX_ITER_DEFAULT) else: return EIGENVALUE_MAX_ITER_DEFAULT def get_eigenvalue_tol(param_dict): if EIGENVALUE in param_dict.keys(): - return get_scalar_param(param_dict[EIGENVALUE], - EIGENVALUE_TOL, - EIGENVALUE_TOL_DEFAULT) + return get_scalar_param(param_dict[EIGENVALUE], EIGENVALUE_TOL, EIGENVALUE_TOL_DEFAULT) else: return EIGENVALUE_TOL_DEFAULT def get_eigenvalue_stability(param_dict): if EIGENVALUE in param_dict.keys(): - return get_scalar_param(param_dict[EIGENVALUE], - EIGENVALUE_STABILITY, - EIGENVALUE_STABILITY_DEFAULT) + return get_scalar_param(param_dict[EIGENVALUE], EIGENVALUE_STABILITY, EIGENVALUE_STABILITY_DEFAULT) else: return EIGENVALUE_STABILITY_DEFAULT @@ -624,18 +605,14 @@ def get_eigenvalue_gas_boundary_resolution(param_dict): def get_eigenvalue_layer_name(param_dict): if EIGENVALUE in param_dict.keys(): - return get_scalar_param(param_dict[EIGENVALUE], - EIGENVALUE_LAYER_NAME, - EIGENVALUE_LAYER_NAME_DEFAULT) + return get_scalar_param(param_dict[EIGENVALUE], EIGENVALUE_LAYER_NAME, EIGENVALUE_LAYER_NAME_DEFAULT) else: return EIGENVALUE_LAYER_NAME_DEFAULT def get_eigenvalue_layer_num(param_dict): if EIGENVALUE in param_dict.keys(): - return get_scalar_param(param_dict[EIGENVALUE], - EIGENVALUE_LAYER_NUM, - EIGENVALUE_LAYER_NUM_DEFAULT) + return get_scalar_param(param_dict[EIGENVALUE], EIGENVALUE_LAYER_NUM, EIGENVALUE_LAYER_NUM_DEFAULT) else: return EIGENVALUE_LAYER_NUM_DEFAULT @@ -649,35 +626,29 @@ def get_data_types_params(param_dict): def get_checkpoint_tag_validation_mode(checkpoint_params): - tag_validation_mode = checkpoint_params.get(CHECKPOINT_TAG_VALIDATION, - CHECKPOINT_TAG_VALIDATION_DEFAULT) + tag_validation_mode = checkpoint_params.get(CHECKPOINT_TAG_VALIDATION, CHECKPOINT_TAG_VALIDATION_DEFAULT) tag_validation_mode = tag_validation_mode.upper() if tag_validation_mode in CHECKPOINT_TAG_VALIDATION_MODES: return tag_validation_mode else: raise DeepSpeedConfigError( "Checkpoint config contains invalid tag_validation " - f"value of {tag_validation_mode}, expecting one of {CHECKPOINT_TAG_VALIDATION_MODES}" - ) + f"value of {tag_validation_mode}, expecting one of {CHECKPOINT_TAG_VALIDATION_MODES}") def get_checkpoint_parallel_write_pipeline(checkpoint_params): par_write_params = checkpoint_params.get(CHECKPOINT_PARALLEL_WRITE, {}) - par_write_pipeline = par_write_params.get( - CHECKPOINT_PARALLEL_WRITE_PIPELINE_STAGE, - CHECKPOINT_PARALLEL_WRITE_PIPELINE_STAGE_DEFAULT) + par_write_pipeline = par_write_params.get(CHECKPOINT_PARALLEL_WRITE_PIPELINE_STAGE, + CHECKPOINT_PARALLEL_WRITE_PIPELINE_STAGE_DEFAULT) if par_write_pipeline in [True, False]: return par_write_pipeline else: - raise DeepSpeedConfigError( - "checkpoint::parallel_write::pipeline_stage " - f"value of '{par_write_pipeline}' is invalid, expecting: true or false") + raise DeepSpeedConfigError("checkpoint::parallel_write::pipeline_stage " + f"value of '{par_write_pipeline}' is invalid, expecting: true or false") def get_dataloader_drop_last(param_dict): - return get_scalar_param(param_dict, - DATALOADER_DROP_LAST, - DATALOADER_DROP_LAST_DEFAULT) + return get_scalar_param(param_dict, DATALOADER_DROP_LAST, DATALOADER_DROP_LAST_DEFAULT) '''Write deepspeed config files by modifying basic templates. 
@@ -685,6 +656,7 @@ Can be used for quickly changing parameters via command line parameters.''' class DeepSpeedConfigWriter: + def __init__(self, data=None): self.data = data if data is not None else {} @@ -692,9 +664,7 @@ class DeepSpeedConfigWriter: self.data[key] = value def load_config(self, filename): - self.data = json.load(open(filename, - "r"), - object_pairs_hook=dict_raise_error_on_duplicate_keys) + self.data = json.load(open(filename, "r"), object_pairs_hook=dict_raise_error_on_duplicate_keys) def write_config(self, filename): with open(filename, "w") as outfile: @@ -702,15 +672,13 @@ class DeepSpeedConfigWriter: class DeepSpeedConfig(object): + def __init__(self, config: Union[str, dict], mpu=None): super(DeepSpeedConfig, self).__init__() if isinstance(config, dict): self._param_dict = config elif os.path.exists(config): - self._param_dict = hjson.load( - open(config, - "r"), - object_pairs_hook=dict_raise_error_on_duplicate_keys) + self._param_dict = hjson.load(open(config, "r"), object_pairs_hook=dict_raise_error_on_duplicate_keys) else: try: config_decoded = base64.urlsafe_b64decode(config).decode('utf-8') @@ -744,24 +712,18 @@ class DeepSpeedConfig(object): # Ensure the resource scheduler saw the same elastic config we are using at runtime ensure_immutable_elastic_config(runtime_elastic_config_dict=elastic_dict) - self.elastic_model_parallel_size = elastic_dict.get( - MODEL_PARLLEL_SIZE, - MODEL_PARLLEL_SIZE_DEFAULT) + self.elastic_model_parallel_size = elastic_dict.get(MODEL_PARLLEL_SIZE, MODEL_PARLLEL_SIZE_DEFAULT) if self.elastic_model_parallel_size < 1: - raise ElasticityConfigError( - "Model-Parallel size cannot be less than 1, " - f"given model-parallel size: {self.elastic_model_parallel_size}") + raise ElasticityConfigError("Model-Parallel size cannot be less than 1, " + f"given model-parallel size: {self.elastic_model_parallel_size}") - self.num_gpus_per_node = elastic_dict.get(NUM_GPUS_PER_NODE, - NUM_GPUS_PER_NODE_DEFAULT) + self.num_gpus_per_node = elastic_dict.get(NUM_GPUS_PER_NODE, NUM_GPUS_PER_NODE_DEFAULT) if self.num_gpus_per_node < 1: - raise ElasticityConfigError( - "NUmber of GPUs per node cannot be less than 1, " - f"given number of GPUs per node: {self.num_gpus_per_node}") + raise ElasticityConfigError("NUmber of GPUs per node cannot be less than 1, " + f"given number of GPUs per node: {self.num_gpus_per_node}") - ignore_non_elastic_batch_info = elastic_dict.get( - IGNORE_NON_ELASTIC_BATCH_INFO, - IGNORE_NON_ELASTIC_BATCH_INFO_DEFAULT) + ignore_non_elastic_batch_info = elastic_dict.get(IGNORE_NON_ELASTIC_BATCH_INFO, + IGNORE_NON_ELASTIC_BATCH_INFO_DEFAULT) if not ignore_non_elastic_batch_info: batch_params = [ @@ -779,23 +741,17 @@ class DeepSpeedConfig(object): # micro_bsz * world_size * gas = total_batch_size # gas = total_batch_size // (micro_bsz * world_size) - gradient_accu_steps = final_batch_size // (micro_batch_size * - self.world_size) + gradient_accu_steps = final_batch_size // (micro_batch_size * self.world_size) if TRAIN_BATCH_SIZE in self._param_dict: - logger.warning( - "[Elasticity] overriding training_batch_size: " - f"{self._param_dict[TRAIN_BATCH_SIZE]} -> {final_batch_size}") + logger.warning("[Elasticity] overriding training_batch_size: " + f"{self._param_dict[TRAIN_BATCH_SIZE]} -> {final_batch_size}") if TRAIN_MICRO_BATCH_SIZE_PER_GPU in self._param_dict: - logger.warning( - "[Elasticity] overriding train_micro_batch_size_per_gpu: " - f"{self._param_dict[TRAIN_MICRO_BATCH_SIZE_PER_GPU]} -> {micro_batch_size}" - ) + 
logger.warning("[Elasticity] overriding train_micro_batch_size_per_gpu: " + f"{self._param_dict[TRAIN_MICRO_BATCH_SIZE_PER_GPU]} -> {micro_batch_size}") if GRADIENT_ACCUMULATION_STEPS in self._param_dict: - logger.warning( - "[Elasticity] overriding gradient_accumulation_steps: " - f"{self._param_dict[GRADIENT_ACCUMULATION_STEPS]} -> {gradient_accu_steps}" - ) + logger.warning("[Elasticity] overriding gradient_accumulation_steps: " + f"{self._param_dict[GRADIENT_ACCUMULATION_STEPS]} -> {gradient_accu_steps}") logger.info(f"[Elasticity] valid GPU counts: {valid_gpus}") @@ -811,8 +767,7 @@ class DeepSpeedConfig(object): def _initialize_params(self, param_dict): self.train_batch_size = get_train_batch_size(param_dict) #print(f"beginning get_train_batch_size = {get_train_batch_size}") - self.train_micro_batch_size_per_gpu = get_train_micro_batch_size_per_gpu( - param_dict) + self.train_micro_batch_size_per_gpu = get_train_micro_batch_size_per_gpu(param_dict) self.gradient_accumulation_steps = get_gradient_accumulation_steps(param_dict) self.steps_per_print = get_steps_per_print(param_dict) self.dump_state = get_dump_state(param_dict) @@ -824,11 +779,12 @@ class DeepSpeedConfig(object): self.sparse_gradients_enabled = get_sparse_gradients_enabled(param_dict) self.zero_config = get_zero_config(param_dict) + self.mics_shard_size = self.zero_config.mics_shard_size + self.mics_hierarchial_params_gather = self.zero_config.mics_hierarchical_params_gather self.zero_optimization_stage = self.zero_config.stage self.zero_enabled = self.zero_optimization_stage > 0 - self.activation_checkpointing_config = DeepSpeedActivationCheckpointingConfig( - param_dict) + self.activation_checkpointing_config = DeepSpeedActivationCheckpointingConfig(param_dict) self.comms_config = DeepSpeedCommsConfig(param_dict) self.monitor_config = get_monitor_config(param_dict) @@ -837,9 +793,9 @@ class DeepSpeedConfig(object): self.fp16_enabled = get_fp16_enabled(param_dict) self.fp16_auto_cast = get_fp16_auto_cast(param_dict) self.bfloat16_enabled = get_bfloat16_enabled(param_dict) - assert not (self.fp16_enabled and self.bfloat16_enabled), 'bfloat16 and fp16 modes cannot be simultaneously enabled' - self.fp16_master_weights_and_gradients = get_fp16_master_weights_and_grads_enabled( - param_dict) + assert not (self.fp16_enabled + and self.bfloat16_enabled), 'bfloat16 and fp16 modes cannot be simultaneously enabled' + self.fp16_master_weights_and_gradients = get_fp16_master_weights_and_grads_enabled(param_dict) self.amp_enabled = get_amp_enabled(param_dict) self.amp_params = get_amp_params(param_dict) self.loss_scale = get_loss_scale(param_dict) @@ -849,22 +805,21 @@ class DeepSpeedConfig(object): self.compression_config = get_compression_config(param_dict) self.optimizer_name = get_optimizer_name(param_dict) - if (self.optimizer_name is not None - and self.optimizer_name.lower() in DEEPSPEED_OPTIMIZERS): + if (self.optimizer_name is not None and self.optimizer_name.lower() in DEEPSPEED_OPTIMIZERS): self.optimizer_name = self.optimizer_name.lower() self.optimizer_params = get_optimizer_params(param_dict) self.optimizer_legacy_fusion = get_optimizer_legacy_fusion(param_dict) - self.zero_allow_untested_optimizer = get_zero_allow_untested_optimizer( - param_dict) + self.zero_allow_untested_optimizer = get_zero_allow_untested_optimizer(param_dict) + + self.zero_force_ds_cpu_optimizer = get_zero_force_ds_cpu_optimizer(param_dict) self.scheduler_name = get_scheduler_name(param_dict) self.scheduler_params = 
get_scheduler_params(param_dict) self.flops_profiler_config = DeepSpeedFlopsProfilerConfig(param_dict) - self.wall_clock_breakdown = (get_wall_clock_breakdown(param_dict) - | self.flops_profiler_config.enabled) + self.wall_clock_breakdown = (get_wall_clock_breakdown(param_dict) | self.flops_profiler_config.enabled) self.memory_breakdown = get_memory_breakdown(param_dict) self.autotuning_config = DeepSpeedAutotuningConfig(param_dict) @@ -879,6 +834,8 @@ class DeepSpeedConfig(object): self.eigenvalue_layer_num, ) = get_eigenvalue_config(param_dict) + self.hybrid_engine = get_hybrid_engine_config(param_dict) + self.sparse_attention = get_sparse_attention(param_dict) self.pipeline = get_pipeline_config(param_dict) @@ -893,20 +850,16 @@ class DeepSpeedConfig(object): checkpoint_params = get_checkpoint_params(param_dict) validation_mode = get_checkpoint_tag_validation_mode(checkpoint_params) - self.checkpoint_tag_validation_enabled = (validation_mode != - ValidationMode.IGNORE) + self.checkpoint_tag_validation_enabled = (validation_mode != ValidationMode.IGNORE) self.checkpoint_tag_validation_fail = validation_mode == ValidationMode.FAIL - self.load_universal_checkpoint = checkpoint_params.get( - LOAD_UNIVERSAL_CHECKPOINT, - LOAD_UNIVERSAL_CHECKPOINT_DEFAULT) + self.load_universal_checkpoint = checkpoint_params.get(LOAD_UNIVERSAL_CHECKPOINT, + LOAD_UNIVERSAL_CHECKPOINT_DEFAULT) - self.use_node_local_storage = checkpoint_params.get( - USE_NODE_LOCAL_STORAGE_CHECKPOINT, - USE_NODE_LOCAL_STORAGE_CHECKPOINT_DEFAULT) + self.use_node_local_storage = checkpoint_params.get(USE_NODE_LOCAL_STORAGE_CHECKPOINT, + USE_NODE_LOCAL_STORAGE_CHECKPOINT_DEFAULT) data_types_params = get_data_types_params(param_dict) - self.grad_accum_dtype = data_types_params.get(GRAD_ACCUM_DTYPE, - GRAD_ACCUM_DTYPE_DEFAULT) + self.grad_accum_dtype = data_types_params.get(GRAD_ACCUM_DTYPE, GRAD_ACCUM_DTYPE_DEFAULT) par_write_pipe = get_checkpoint_parallel_write_pipeline(checkpoint_params) self.checkpoint_parallel_write_pipeline = par_write_pipe @@ -923,23 +876,16 @@ class DeepSpeedConfig(object): micro_batch = self.train_micro_batch_size_per_gpu grad_acc = self.gradient_accumulation_steps - assert ( - train_batch > 0 - ), f"Train batch size: {train_batch} has to be greater than 0" + assert (train_batch > 0), f"Train batch size: {train_batch} has to be greater than 0" - assert ( - micro_batch > 0 - ), f"Micro batch size per gpu: {micro_batch} has to be greater than 0" + assert (micro_batch > 0), f"Micro batch size per gpu: {micro_batch} has to be greater than 0" - assert ( - grad_acc > 0 - ), f"Gradient accumulation steps: {grad_acc} has to be greater than 0" + assert (grad_acc > 0), f"Gradient accumulation steps: {grad_acc} has to be greater than 0" assert train_batch == micro_batch * grad_acc * self.world_size, ( f"Check batch related parameters. 
train_batch_size is not equal " "to micro_batch_per_gpu * gradient_acc_step * world_size " - f"{train_batch} != {micro_batch} * {grad_acc} * {self.world_size}" - ) + f"{train_batch} != {micro_batch} * {grad_acc} * {self.world_size}") def _set_batch_related_parameters(self): @@ -1002,8 +948,7 @@ class DeepSpeedConfig(object): sort_keys=True, indent=4, cls=ScientificNotationEncoder, - separators=(",", - ":"), + separators=(",", ":"), ))) def print(self, name): @@ -1016,20 +961,16 @@ class DeepSpeedConfig(object): self.print_user_config() def _do_error_check(self): - assert ( - self.train_micro_batch_size_per_gpu - ), "DeepSpeedConfig: {} is not defined".format(TRAIN_MICRO_BATCH_SIZE_PER_GPU) + assert (self.train_micro_batch_size_per_gpu + ), "DeepSpeedConfig: {} is not defined".format(TRAIN_MICRO_BATCH_SIZE_PER_GPU) assert ( - self.gradient_accumulation_steps - ), "DeepSpeedConfig: {} is not defined".format(GRADIENT_ACCUMULATION_STEPS) + self.gradient_accumulation_steps), "DeepSpeedConfig: {} is not defined".format(GRADIENT_ACCUMULATION_STEPS) if self.zero_enabled: - assert ( - self.zero_optimization_stage <= ZeroStageEnum.max_stage - ), "DeepSpeedConfig: Maximum supported ZeRO stage is {}".format( - ZeroStageEnum.max_stage - ) + assert (self.zero_optimization_stage <= + ZeroStageEnum.max_stage), "DeepSpeedConfig: Maximum supported ZeRO stage is {}".format( + ZeroStageEnum.max_stage) if self.fp16_master_weights_and_gradients: assert self.zero_enabled and self.zero_optimization_stage == ZeroStageEnum.gradients, "Fp16_master_weights_and_grads is only supported with ZeRO Stage 2 for now." @@ -1040,19 +981,15 @@ class DeepSpeedConfig(object): vocabulary_size = self._param_dict.get(VOCABULARY_SIZE, VOCABULARY_SIZE_DEFAULT) if vocabulary_size and vocabulary_size % TENSOR_CORE_ALIGN_SIZE != 0: logger.warning( - "DeepSpeedConfig: vocabulary size {} is not aligned to {}, may import tensor core utilization." - .format(vocabulary_size, - TENSOR_CORE_ALIGN_SIZE)) + "DeepSpeedConfig: vocabulary size {} is not aligned to {}, may import tensor core utilization.".format( + vocabulary_size, TENSOR_CORE_ALIGN_SIZE)) - if (self.optimizer_params is not None - and MAX_GRAD_NORM in self.optimizer_params.keys() + if (self.optimizer_params is not None and MAX_GRAD_NORM in self.optimizer_params.keys() and self.optimizer_params[MAX_GRAD_NORM] > 0): if fp16_enabled: if self.global_rank == 0: - logger.warning( - "DeepSpeedConfig: In FP16 mode, DeepSpeed will pass {}:{} to FP16 wrapper" - .format(MAX_GRAD_NORM, - self.optimizer_params[MAX_GRAD_NORM])) + logger.warning("DeepSpeedConfig: In FP16 mode, DeepSpeed will pass {}:{} to FP16 wrapper".format( + MAX_GRAD_NORM, self.optimizer_params[MAX_GRAD_NORM])) else: if self.global_rank == 0: logger.warning( diff --git a/deepspeed/runtime/config_utils.py b/deepspeed/runtime/config_utils.py index cd262f14a6dd4ae072def8a2ecdd4a3cbc190574..0fb1372deac8661865bc7636945dd4b9474d6390 100755 --- a/deepspeed/runtime/config_utils.py +++ b/deepspeed/runtime/config_utils.py @@ -1,8 +1,7 @@ -'''Copyright The Microsoft DeepSpeed Team''' -""" -Copyright (c) Microsoft Corporation -Licensed under the MIT license. -""" +# Copyright (c) Microsoft Corporation. 
+# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team """ Collection of DeepSpeed configuration utilities """ @@ -50,15 +49,10 @@ class DeepSpeedConfigModel(BaseModel): new_param='my_new_field', new_param_fn=(lambda x: int(x))) """ + def __init__(self, strict=False, **data): - if ( - not strict - ): # This is temporary until we refactor all DS configs, allows HF to load models - data = { - k: v - for k, - v in data.items() if (v != "auto" or k == "replace_method") - } + if (not strict): # This is temporary until we refactor all DS configs, allows HF to load models + data = {k: v for k, v in data.items() if (v != "auto" or k == "replace_method")} super().__init__(**data) self._deprecated_fields_check(self) @@ -73,8 +67,7 @@ class DeepSpeedConfigModel(BaseModel): dep_msg = kwargs.get("deprecated_msg", "") if dep_param in fields_set: logger.warning(f"Config parameter {dep_param} is deprecated" + - (f" use {new_param} instead" if new_param else "") + - (f". {dep_msg}" if dep_msg else "")) + (f" use {new_param} instead" if new_param else "") + (f". {dep_msg}" if dep_msg else "")) # Check if there is a new param and if it should be set with a value if new_param and kwargs.get("set_new_param", True): # Remove the deprecate field if there is a replacing field @@ -89,9 +82,7 @@ class DeepSpeedConfigModel(BaseModel): if len(new_param_nested) > 1: # If the new param exists in a subconfig, we need to get # the fields set for that subconfig - pydantic_config = reduce(getattr, - new_param_nested[:-1], - pydantic_config) + pydantic_config = reduce(getattr, new_param_nested[:-1], pydantic_config) fields_set = pydantic_config.__fields_set__ new_param_name = new_param_nested[-1] assert ( @@ -101,9 +92,7 @@ class DeepSpeedConfigModel(BaseModel): try: setattr(pydantic_config, new_param_name, param_value) except Exception as e: - logger.error( - f"Tried setting value for '{new_param}' with value from deprecated '{dep_param}'" - ) + logger.error(f"Tried setting value for '{new_param}' with value from deprecated '{dep_param}'") raise e def _deprecated_fields_check(self, pydantic_config): @@ -121,12 +110,20 @@ class DeepSpeedConfigModel(BaseModel): arbitrary_types_allowed = True +def get_config_default(config, field_name): + assert field_name in config.__fields__, f"'{field_name}' is not a field in {config}" + assert not config.__fields__.get( + field_name).required, f"'{field_name}' is a required field and does not have a default value" + return config.__fields__.get(field_name).default + + class pp_int(int): """ A wrapper for integers that will return a custom string or comma-formatted string of the integer. For example, print(pp_int(1e5)) will return "10,000". This is useful mainly for auto-generated documentation purposes. 
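# A minimal illustration (hypothetical keys) of the non-strict filter in DeepSpeedConfigModel
# above: entries whose value is the string "auto" are dropped before pydantic validation,
# except replace_method, so those fields fall back to their defaults instead of failing on "auto".
data = {"train_batch_size": "auto", "replace_method": "auto", "stage": 2}
filtered = {k: v for k, v in data.items() if (v != "auto" or k == "replace_method")}
print(filtered)  # {'replace_method': 'auto', 'stage': 2}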
""" + def __new__(cls, val, custom_print_str=None): inst = super().__new__(cls, val) inst.custom_print_str = custom_print_str @@ -148,6 +145,7 @@ class ScientificNotationEncoder(json.JSONEncoder): Just pass ``cls=ScientificNotationEncoder`` to ``json.dumps`` to activate it """ + def iterencode(self, o, _one_shot=False, level=0): indent = self.indent if self.indent is not None else 4 prefix_close = " " * level * indent @@ -161,10 +159,7 @@ class ScientificNotationEncoder(json.JSONEncoder): else: return f"{o}" elif isinstance(o, collections.abc.Mapping): - x = [ - f'\n{prefix}"{k}": {self.iterencode(v, level=level)}' for k, - v in o.items() - ] + x = [f'\n{prefix}"{k}": {self.iterencode(v, level=level)}' for k, v in o.items()] return "{" + ", ".join(x) + f"\n{prefix_close}" + "}" elif isinstance(o, collections.abc.Sequence) and not isinstance(o, str): return f"[{ f', '.join(map(self.iterencode, o)) }]" @@ -175,6 +170,7 @@ class DeepSpeedConfigObject(object): """ For json serialization """ + def repr(self): return self.__dict__ diff --git a/deepspeed/runtime/constants.py b/deepspeed/runtime/constants.py index 6925745a8e5a022843ade33f3f19dd706e2b5771..62b46e2a6ce9ecb587fdfbe85424f7908be72f3d 100755 --- a/deepspeed/runtime/constants.py +++ b/deepspeed/runtime/constants.py @@ -1,8 +1,7 @@ -'''Copyright The Microsoft DeepSpeed Team''' -""" -Copyright (c) Microsoft Corporation -Licensed under the MIT license. -""" +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team ############################################# # Routes @@ -73,6 +72,8 @@ MAX_GRAD_NORM = 'max_grad_norm' ############################################# ZERO_ALLOW_UNTESTED_OPTIMIZER = "zero_allow_untested_optimizer" ZERO_ALLOW_UNTESTED_OPTIMIZER_DEFAULT = False +ZERO_FORCE_DS_CPU_OPTIMIZER = "zero_force_ds_cpu_optimizer" +ZERO_FORCE_DS_CPU_OPTIMIZER_DEFAULT = True # Steps STEPS_PER_PRINT = "steps_per_print" @@ -368,11 +369,7 @@ class ValidationMode: CHECKPOINT = "checkpoint" CHECKPOINT_TAG_VALIDATION = "tag_validation" CHECKPOINT_TAG_VALIDATION_DEFAULT = ValidationMode.WARN -CHECKPOINT_TAG_VALIDATION_MODES = [ - ValidationMode.WARN, - ValidationMode.IGNORE, - ValidationMode.FAIL -] +CHECKPOINT_TAG_VALIDATION_MODES = [ValidationMode.WARN, ValidationMode.IGNORE, ValidationMode.FAIL] LOAD_UNIVERSAL_CHECKPOINT = "load_universal" LOAD_UNIVERSAL_CHECKPOINT_DEFAULT = False diff --git a/deepspeed/runtime/csr_tensor.py b/deepspeed/runtime/csr_tensor.py deleted file mode 100644 index d8ee1e76c08323229f9bf58213368f293733420b..0000000000000000000000000000000000000000 --- a/deepspeed/runtime/csr_tensor.py +++ /dev/null @@ -1,59 +0,0 @@ -""" -Copyright 2020 The Microsoft DeepSpeed Team - -Implementation of a compressed sparse row (CSR) tensor. Similar in -functionality to TensorFlow's IndexedSlices implementation. 
-""" - -import torch - - -class CSRTensor(object): - """ Compressed Sparse Row (CSR) Tensor """ - def __init__(self, dense_tensor=None): - self.orig_dense_tensor = dense_tensor - if dense_tensor is not None: - result = torch.sum(dense_tensor, dim=1) - self.indices = result.nonzero().flatten() - self.values = dense_tensor[self.indices] - self.dense_size = list(dense_tensor.size()) - else: - self.indices = None - self.values = None - self.dense_size = None - - @staticmethod - def type(): - return "deepspeed.CSRTensor" - - def to_dense(self): - it = self.indices.unsqueeze(1) - full_indices = torch.cat([it for _ in range(self.dense_size[1])], dim=1) - return self.values.new_zeros(self.dense_size).scatter_add_( - 0, - full_indices, - self.values) - - def sparse_size(self): - index_size = list(self.indices.size()) - index_size = index_size[0] - value_size = list(self.values.size()) - value_size = value_size[0] * value_size[1] - dense_size = self.dense_size[0] * self.dense_size[1] - return index_size + value_size, dense_size - - def add(self, b): - assert self.dense_size == b.dense_size - self.indices = torch.cat([self.indices, b.indices]) - self.values = torch.cat([self.values, b.values]) - - def __str__(self): - sparse_size, dense_size = self.sparse_size() - return "DeepSpeed.CSRTensor(indices_size={}, values_size={}, " \ - "dense_size={}, device={}, reduction_factor={})".format( - self.indices.size(), self.values.size(), self.dense_size, - self.indices.get_device(), dense_size / sparse_size - ) - - def __repr__(self): - return self.__str__() diff --git a/deepspeed/runtime/data_pipeline/__init__.py b/deepspeed/runtime/data_pipeline/__init__.py index fcb45ab2b68516814a4bfbffebf2e01cbfefd527..6c5067f71c8faf166bc78e88f9b62e8627dda7c7 100644 --- a/deepspeed/runtime/data_pipeline/__init__.py +++ b/deepspeed/runtime/data_pipeline/__init__.py @@ -1 +1,5 @@ +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team '''Copyright The Microsoft DeepSpeed Team''' diff --git a/deepspeed/runtime/data_pipeline/config.py b/deepspeed/runtime/data_pipeline/config.py index eefa1402e9aa6e3e585a4661a7b886d9bc3fa037..6234805189254ba4fe3d7d8aac8e7062de721c51 100644 --- a/deepspeed/runtime/data_pipeline/config.py +++ b/deepspeed/runtime/data_pipeline/config.py @@ -1,6 +1,8 @@ -''' -Copyright 2022 The Microsoft DeepSpeed Team -''' +# Copyright (c) Microsoft Corporation. 
+# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + from .constants import * import copy from ..config_utils import get_scalar_param @@ -24,18 +26,14 @@ def get_data_efficiency_config(param_dict): def get_data_efficiency_enabled(param_dict): if DATA_EFFICIENCY in param_dict.keys(): - return get_scalar_param(param_dict[DATA_EFFICIENCY], - DATA_EFFICIENCY_ENABLED, - DATA_EFFICIENCY_ENABLED_DEFAULT) + return get_scalar_param(param_dict[DATA_EFFICIENCY], DATA_EFFICIENCY_ENABLED, DATA_EFFICIENCY_ENABLED_DEFAULT) else: return False def get_data_efficiency_seed(param_dict): if DATA_EFFICIENCY in param_dict.keys(): - return get_scalar_param(param_dict[DATA_EFFICIENCY], - DATA_EFFICIENCY_SEED, - DATA_EFFICIENCY_SEED_DEFAULT) + return get_scalar_param(param_dict[DATA_EFFICIENCY], DATA_EFFICIENCY_SEED, DATA_EFFICIENCY_SEED_DEFAULT) else: return DATA_EFFICIENCY_SEED_DEFAULT @@ -55,26 +53,21 @@ def get_data_sampling(param_dict): def get_data_sampling_enabled(param_dict): if DATA_SAMPLING in param_dict.keys(): - return get_scalar_param(param_dict[DATA_SAMPLING], - DATA_SAMPLING_ENABLED, - DATA_SAMPLING_ENABLED_DEFAULT) + return get_scalar_param(param_dict[DATA_SAMPLING], DATA_SAMPLING_ENABLED, DATA_SAMPLING_ENABLED_DEFAULT) else: return False def get_data_sampling_num_epochs(param_dict): if DATA_SAMPLING in param_dict.keys(): - return get_scalar_param(param_dict[DATA_SAMPLING], - DATA_SAMPLING_NUM_EPOCHS, - DATA_SAMPLING_NUM_EPOCHS_DEFAULT) + return get_scalar_param(param_dict[DATA_SAMPLING], DATA_SAMPLING_NUM_EPOCHS, DATA_SAMPLING_NUM_EPOCHS_DEFAULT) else: return DATA_SAMPLING_NUM_EPOCHS_DEFAULT def get_data_sampling_num_workers(param_dict): if DATA_SAMPLING in param_dict.keys(): - return get_scalar_param(param_dict[DATA_SAMPLING], - DATA_SAMPLING_NUM_WORKERS, + return get_scalar_param(param_dict[DATA_SAMPLING], DATA_SAMPLING_NUM_WORKERS, DATA_SAMPLING_NUM_WORKERS_DEFAULT) else: return DATA_SAMPLING_NUM_WORKERS_DEFAULT @@ -87,7 +80,8 @@ def get_curriculum_learning(param_dict): param_dict[CURRICULUM_LEARNING] = {} sub_param_dict = param_dict[CURRICULUM_LEARNING] if output[CURRICULUM_LEARNING_ENABLED]: - assert CURRICULUM_LEARNING_METRICS in sub_param_dict.keys(), f"Curriculum learning is enabled, {CURRICULUM_LEARNING_METRICS} must be specified" + assert CURRICULUM_LEARNING_METRICS in sub_param_dict.keys( + ), f"Curriculum learning is enabled, {CURRICULUM_LEARNING_METRICS} must be specified" for key, val in get_curriculum_learning_params(param_dict).items(): output[key] = val return output @@ -95,8 +89,7 @@ def get_curriculum_learning(param_dict): def get_curriculum_learning_enabled(param_dict): if CURRICULUM_LEARNING in param_dict.keys(): - return get_scalar_param(param_dict[CURRICULUM_LEARNING], - CURRICULUM_LEARNING_ENABLED, + return get_scalar_param(param_dict[CURRICULUM_LEARNING], CURRICULUM_LEARNING_ENABLED, CURRICULUM_LEARNING_ENABLED_DEFAULT) else: return False @@ -113,8 +106,7 @@ def get_curriculum_learning_params(param_dict): def get_curriculum_enabled_legacy(param_dict): if CURRICULUM_LEARNING_LEGACY in param_dict.keys(): - return get_scalar_param(param_dict[CURRICULUM_LEARNING_LEGACY], - CURRICULUM_ENABLED_LEGACY, + return get_scalar_param(param_dict[CURRICULUM_LEARNING_LEGACY], CURRICULUM_ENABLED_LEGACY, CURRICULUM_ENABLED_DEFAULT_LEGACY) else: return False @@ -142,9 +134,7 @@ def get_data_routing(param_dict): def get_data_routing_enabled(param_dict): if DATA_ROUTING in param_dict.keys(): - return get_scalar_param(param_dict[DATA_ROUTING], - DATA_ROUTING_ENABLED, - 
DATA_ROUTING_ENABLED_DEFAULT) + return get_scalar_param(param_dict[DATA_ROUTING], DATA_ROUTING_ENABLED, DATA_ROUTING_ENABLED_DEFAULT) else: return False @@ -164,9 +154,7 @@ def get_random_ltd(param_dict): def get_random_ltd_enabled(param_dict): if RANDOM_LTD in param_dict.keys(): - return get_scalar_param(param_dict[RANDOM_LTD], - RANDOM_LTD_ENABLED, - RANDOM_LTD_ENABLED_DEFAULT) + return get_scalar_param(param_dict[RANDOM_LTD], RANDOM_LTD_ENABLED, RANDOM_LTD_ENABLED_DEFAULT) else: return False diff --git a/deepspeed/runtime/data_pipeline/constants.py b/deepspeed/runtime/data_pipeline/constants.py index b801d2e956fc935d92698ac127051327164f202d..1ade640e38d949da8caa051ec40d9f096d83f064 100644 --- a/deepspeed/runtime/data_pipeline/constants.py +++ b/deepspeed/runtime/data_pipeline/constants.py @@ -1,10 +1,11 @@ -''' -Copyright 2022 The Microsoft DeepSpeed Team -''' -######################################### -# Data efficiency library -# See sample config at https://www.deepspeed.ai/docs/config-json/#data-efficiency -######################################### +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team +""" +Data efficiency library + See sample config at https://www.deepspeed.ai/docs/config-json/data-efficiency +""" DATA_EFFICIENCY = "data_efficiency" DATA_EFFICIENCY_ENABLED = "enabled" DATA_EFFICIENCY_ENABLED_DEFAULT = False diff --git a/deepspeed/runtime/data_pipeline/curriculum_scheduler.py b/deepspeed/runtime/data_pipeline/curriculum_scheduler.py index b4cb18c4476b98556cc72cce5ea9f7bd0b571f4a..23d747957dc4647e06fad0a94e5e4b071b6f6e23 100644 --- a/deepspeed/runtime/data_pipeline/curriculum_scheduler.py +++ b/deepspeed/runtime/data_pipeline/curriculum_scheduler.py @@ -1,12 +1,15 @@ -''' -Copyright 2021 The Microsoft DeepSpeed Team -''' +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + import math from deepspeed.utils import logger from .constants import * class CurriculumScheduler(object): + def __init__(self, config): super().__init__() self.state = {} @@ -16,17 +19,12 @@ class CurriculumScheduler(object): f"Curriculum learning requires the config '{CURRICULUM_LEARNING_MAX_DIFFICULTY}'" assert CURRICULUM_LEARNING_SCHEDULE_TYPE in config, \ f"Curriculum learning requires the config '{CURRICULUM_LEARNING_SCHEDULE_TYPE}'" - self.state[CURRICULUM_LEARNING_MIN_DIFFICULTY] = config[ - CURRICULUM_LEARNING_MIN_DIFFICULTY] - self.state[CURRICULUM_LEARNING_MAX_DIFFICULTY] = config[ - CURRICULUM_LEARNING_MAX_DIFFICULTY] - self.state[CURRICULUM_LEARNING_CURRENT_DIFFICULTY] = config[ - CURRICULUM_LEARNING_MIN_DIFFICULTY] - self.state[CURRICULUM_LEARNING_SCHEDULE_TYPE] = config[ - CURRICULUM_LEARNING_SCHEDULE_TYPE] + self.state[CURRICULUM_LEARNING_MIN_DIFFICULTY] = config[CURRICULUM_LEARNING_MIN_DIFFICULTY] + self.state[CURRICULUM_LEARNING_MAX_DIFFICULTY] = config[CURRICULUM_LEARNING_MAX_DIFFICULTY] + self.state[CURRICULUM_LEARNING_CURRENT_DIFFICULTY] = config[CURRICULUM_LEARNING_MIN_DIFFICULTY] + self.state[CURRICULUM_LEARNING_SCHEDULE_TYPE] = config[CURRICULUM_LEARNING_SCHEDULE_TYPE] self.first_step = True - if config[ - CURRICULUM_LEARNING_SCHEDULE_TYPE] == CURRICULUM_LEARNING_SCHEDULE_FIXED_DISCRETE: + if config[CURRICULUM_LEARNING_SCHEDULE_TYPE] == CURRICULUM_LEARNING_SCHEDULE_FIXED_DISCRETE: """ The schedule_config is a list of difficulty and a list of max step belonging to each difficulty. 
Example json config: @@ -43,18 +41,12 @@ class CurriculumScheduler(object): f"Curriculum learning with fixed_discrete schedule requires the schedule_config '{CURRICULUM_LEARNING_SCHEDULE_DIFFICULTY}'" assert CURRICULUM_LEARNING_SCHEDULE_MAX_STEP in config[CURRICULUM_LEARNING_SCHEDULE_CONFIG], \ f"Curriculum learning with fixed_discrete schedule requires the schedule_config '{CURRICULUM_LEARNING_SCHEDULE_MAX_STEP}'" - assert len(config[CURRICULUM_LEARNING_SCHEDULE_CONFIG] - [CURRICULUM_LEARNING_SCHEDULE_MAX_STEP]) > 0 - assert len(config[CURRICULUM_LEARNING_SCHEDULE_CONFIG] - [CURRICULUM_LEARNING_SCHEDULE_DIFFICULTY]) > 0 - assert len(config[CURRICULUM_LEARNING_SCHEDULE_CONFIG] - [CURRICULUM_LEARNING_SCHEDULE_DIFFICULTY]) == len( - config[CURRICULUM_LEARNING_SCHEDULE_CONFIG] - [CURRICULUM_LEARNING_SCHEDULE_MAX_STEP]) + 1 - self.state[CURRICULUM_LEARNING_SCHEDULE_CONFIG] = config[ - CURRICULUM_LEARNING_SCHEDULE_CONFIG] - elif config[ - CURRICULUM_LEARNING_SCHEDULE_TYPE] == CURRICULUM_LEARNING_SCHEDULE_FIXED_ROOT: + assert len(config[CURRICULUM_LEARNING_SCHEDULE_CONFIG][CURRICULUM_LEARNING_SCHEDULE_MAX_STEP]) > 0 + assert len(config[CURRICULUM_LEARNING_SCHEDULE_CONFIG][CURRICULUM_LEARNING_SCHEDULE_DIFFICULTY]) > 0 + assert len(config[CURRICULUM_LEARNING_SCHEDULE_CONFIG][CURRICULUM_LEARNING_SCHEDULE_DIFFICULTY]) == len( + config[CURRICULUM_LEARNING_SCHEDULE_CONFIG][CURRICULUM_LEARNING_SCHEDULE_MAX_STEP]) + 1 + self.state[CURRICULUM_LEARNING_SCHEDULE_CONFIG] = config[CURRICULUM_LEARNING_SCHEDULE_CONFIG] + elif config[CURRICULUM_LEARNING_SCHEDULE_TYPE] == CURRICULUM_LEARNING_SCHEDULE_FIXED_ROOT: """ The schedule_config includes: total_curriculum_step: how many steps the curriculum learning takes to go @@ -79,15 +71,12 @@ class CurriculumScheduler(object): f"Curriculum learning with fixed_root schedule requires the schedule_config '{CURRICULUM_LEARNING_SCHEDULE_DIFFICULTY_STEP}'" assert CURRICULUM_LEARNING_SCHEDULE_ROOT_DEGREE in config[CURRICULUM_LEARNING_SCHEDULE_CONFIG], \ f"Curriculum learning with fixed_root schedule requires the schedule_config '{CURRICULUM_LEARNING_SCHEDULE_ROOT_DEGREE}'" - if config[CURRICULUM_LEARNING_SCHEDULE_CONFIG][ - CURRICULUM_LEARNING_SCHEDULE_DIFFICULTY_STEP] % 8 != 0: + if config[CURRICULUM_LEARNING_SCHEDULE_CONFIG][CURRICULUM_LEARNING_SCHEDULE_DIFFICULTY_STEP] % 8 != 0: logger.warning( f'When using seqlen metric, the difficulty_step for curriculum learning has to be multiple of 8 (for FP16 data) or 16 (for INT8 data) to enable NVIDIA Tensor Core acceleration. Disregard this warning if this is unrelated to your metric/hardware.' ) - self.state[CURRICULUM_LEARNING_SCHEDULE_CONFIG] = config[ - CURRICULUM_LEARNING_SCHEDULE_CONFIG] - elif config[ - CURRICULUM_LEARNING_SCHEDULE_TYPE] == CURRICULUM_LEARNING_SCHEDULE_FIXED_LINEAR: + self.state[CURRICULUM_LEARNING_SCHEDULE_CONFIG] = config[CURRICULUM_LEARNING_SCHEDULE_CONFIG] + elif config[CURRICULUM_LEARNING_SCHEDULE_TYPE] == CURRICULUM_LEARNING_SCHEDULE_FIXED_LINEAR: """ The schedule_config is the same as CURRICULUM_LEARNING_SCHEDULE_FIXED_ROOT but without the root_degree. 
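# A worked sketch of the fixed_root difficulty computation described here (fixed_linear is the
# same computation with root_degree = 1); the numbers are hypothetical, not from a shipped config.
import math

def fixed_root_difficulty(step, total_step, min_diff, max_diff, difficulty_step, root_degree):
    next_diff = (float(step) / total_step) ** (1.0 / root_degree)
    next_diff = math.floor(next_diff * (max_diff - min_diff) + min_diff)
    next_diff -= next_diff % difficulty_step       # keep the difficulty a multiple of difficulty_step
    return min(next_diff, max_diff)

# e.g. a seqlen curriculum from 8 to 1024 over 10,000 steps with root_degree 2:
print(fixed_root_difficulty(2500, 10000, 8, 1024, 8, root_degree=2))  # -> 512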
@@ -100,15 +89,12 @@ class CurriculumScheduler(object): f"Curriculum learning with fixed_linear schedule requires the schedule_config '{CURRICULUM_LEARNING_SCHEDULE_TOTAL_STEP}'" assert CURRICULUM_LEARNING_SCHEDULE_DIFFICULTY_STEP in config[CURRICULUM_LEARNING_SCHEDULE_CONFIG], \ f"Curriculum learning with fixed_linear schedule requires the schedule_config '{CURRICULUM_LEARNING_SCHEDULE_DIFFICULTY_STEP}'" - if config[CURRICULUM_LEARNING_SCHEDULE_CONFIG][ - CURRICULUM_LEARNING_SCHEDULE_DIFFICULTY_STEP] % 8 != 0: + if config[CURRICULUM_LEARNING_SCHEDULE_CONFIG][CURRICULUM_LEARNING_SCHEDULE_DIFFICULTY_STEP] % 8 != 0: logger.warning( f'When using seqlen metric, the difficulty_step for curriculum learning has to be multiple of 8 (for FP16 data) or 16 (for INT8 data) to enable NVIDIA Tensor Core acceleration. Disregard this warning if this is unrelated to your metric/hardware.' ) - self.state[CURRICULUM_LEARNING_SCHEDULE_CONFIG] = config[ - CURRICULUM_LEARNING_SCHEDULE_CONFIG] - elif config[ - CURRICULUM_LEARNING_SCHEDULE_TYPE] == CURRICULUM_LEARNING_SCHEDULE_CUSTOM: + self.state[CURRICULUM_LEARNING_SCHEDULE_CONFIG] = config[CURRICULUM_LEARNING_SCHEDULE_CONFIG] + elif config[CURRICULUM_LEARNING_SCHEDULE_TYPE] == CURRICULUM_LEARNING_SCHEDULE_CUSTOM: """ Fully customized schedule. User need to provide a custom schedule function by using the set_custom_curriculum_learning_schedule API @@ -145,38 +131,28 @@ class CurriculumScheduler(object): s_state = self.state[CURRICULUM_LEARNING_SCHEDULE_CONFIG] if root_degree is None: root_degree = s_state[CURRICULUM_LEARNING_SCHEDULE_ROOT_DEGREE] - next_difficulty = (float(global_steps) / - s_state[CURRICULUM_LEARNING_SCHEDULE_TOTAL_STEP])**( - 1.0 / root_degree) - next_difficulty = math.floor(next_difficulty * - (self.state[CURRICULUM_LEARNING_MAX_DIFFICULTY] - - self.state[CURRICULUM_LEARNING_MIN_DIFFICULTY]) + - self.state[CURRICULUM_LEARNING_MIN_DIFFICULTY]) - next_difficulty -= (next_difficulty % - s_state[CURRICULUM_LEARNING_SCHEDULE_DIFFICULTY_STEP]) - next_difficulty = min(next_difficulty, - self.state[CURRICULUM_LEARNING_MAX_DIFFICULTY]) + next_difficulty = (float(global_steps) / s_state[CURRICULUM_LEARNING_SCHEDULE_TOTAL_STEP])**(1.0 / root_degree) + next_difficulty = math.floor( + next_difficulty * + (self.state[CURRICULUM_LEARNING_MAX_DIFFICULTY] - self.state[CURRICULUM_LEARNING_MIN_DIFFICULTY]) + + self.state[CURRICULUM_LEARNING_MIN_DIFFICULTY]) + next_difficulty -= (next_difficulty % s_state[CURRICULUM_LEARNING_SCHEDULE_DIFFICULTY_STEP]) + next_difficulty = min(next_difficulty, self.state[CURRICULUM_LEARNING_MAX_DIFFICULTY]) return next_difficulty def get_difficulty(self, global_steps): - if self.state[ - CURRICULUM_LEARNING_SCHEDULE_TYPE] == CURRICULUM_LEARNING_SCHEDULE_FIXED_DISCRETE: + if self.state[CURRICULUM_LEARNING_SCHEDULE_TYPE] == CURRICULUM_LEARNING_SCHEDULE_FIXED_DISCRETE: return self.__fixed_discrete_get_difficulty(global_steps) - elif self.state[ - CURRICULUM_LEARNING_SCHEDULE_TYPE] == CURRICULUM_LEARNING_SCHEDULE_FIXED_LINEAR: + elif self.state[CURRICULUM_LEARNING_SCHEDULE_TYPE] == CURRICULUM_LEARNING_SCHEDULE_FIXED_LINEAR: return self.__fixed_root_get_difficulty(global_steps, 1) - elif self.state[ - CURRICULUM_LEARNING_SCHEDULE_TYPE] == CURRICULUM_LEARNING_SCHEDULE_FIXED_ROOT: + elif self.state[CURRICULUM_LEARNING_SCHEDULE_TYPE] == CURRICULUM_LEARNING_SCHEDULE_FIXED_ROOT: return self.__fixed_root_get_difficulty(global_steps) - elif self.state[ - CURRICULUM_LEARNING_SCHEDULE_TYPE] == CURRICULUM_LEARNING_SCHEDULE_CUSTOM: + elif 
self.state[CURRICULUM_LEARNING_SCHEDULE_TYPE] == CURRICULUM_LEARNING_SCHEDULE_CUSTOM: return self.custom_get_difficulty(global_steps) else: raise RuntimeError('Unsupported curriculum schedule type') def update_difficulty(self, global_steps): - if self.state[CURRICULUM_LEARNING_CURRENT_DIFFICULTY] < self.state[ - CURRICULUM_LEARNING_MAX_DIFFICULTY]: - self.state[CURRICULUM_LEARNING_CURRENT_DIFFICULTY] = self.get_difficulty( - global_steps) + if self.state[CURRICULUM_LEARNING_CURRENT_DIFFICULTY] < self.state[CURRICULUM_LEARNING_MAX_DIFFICULTY]: + self.state[CURRICULUM_LEARNING_CURRENT_DIFFICULTY] = self.get_difficulty(global_steps) return self.state[CURRICULUM_LEARNING_CURRENT_DIFFICULTY] diff --git a/deepspeed/runtime/data_pipeline/data_routing/__init__.py b/deepspeed/runtime/data_pipeline/data_routing/__init__.py index fcb45ab2b68516814a4bfbffebf2e01cbfefd527..6c5067f71c8faf166bc78e88f9b62e8627dda7c7 100644 --- a/deepspeed/runtime/data_pipeline/data_routing/__init__.py +++ b/deepspeed/runtime/data_pipeline/data_routing/__init__.py @@ -1 +1,5 @@ +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team '''Copyright The Microsoft DeepSpeed Team''' diff --git a/deepspeed/runtime/data_pipeline/data_routing/basic_layer.py b/deepspeed/runtime/data_pipeline/data_routing/basic_layer.py index 436da95380e7e200dc53aea268a6334a21dbf33b..780a88c3d5a2fb016f2c59bbe3a88fbaff6499b0 100644 --- a/deepspeed/runtime/data_pipeline/data_routing/basic_layer.py +++ b/deepspeed/runtime/data_pipeline/data_routing/basic_layer.py @@ -1,6 +1,7 @@ -''' -Copyright 2022 The Microsoft DeepSpeed Team -''' +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team from deepspeed.utils import logger from torch import Tensor @@ -14,6 +15,7 @@ class RandomLayerTokenDrop(Module): """ A layer wrapper for random LTD """ + def __init__(self, layer: Module): super(RandomLayerTokenDrop, self).__init__() self.random_ltd_layer = layer @@ -52,9 +54,7 @@ class RandomLayerTokenDrop(Module): elif self.model_type == 'decoder': self.index_generator = gpt_sample_tokens else: - logger.warning( - "************For now, we only support encoder-only or decoder-only models************" - ) + logger.warning("************For now, we only support encoder-only or decoder-only models************") raise NotImplementedError def get_bsh(self, hidden_stats): @@ -78,40 +78,36 @@ class RandomLayerTokenDrop(Module): self.curr_micro_batch, \ self.random_ltd_num_layer, \ hidden_states.device, mask) - self.random_ltd_scheduler.state[ - RANDOM_LTD_SAMPLE_INDEX] = sampled_indices - self.random_ltd_scheduler.state[ - RANDOM_LTD_ATTENTION_MASK] = part_attention_mask + self.random_ltd_scheduler.state[RANDOM_LTD_SAMPLE_INDEX] = sampled_indices + self.random_ltd_scheduler.state[RANDOM_LTD_ATTENTION_MASK] = part_attention_mask else: - sampled_indices = self.random_ltd_scheduler.state[ - RANDOM_LTD_SAMPLE_INDEX] - part_attention_mask = self.random_ltd_scheduler.state[ - RANDOM_LTD_ATTENTION_MASK] - + sampled_indices = self.random_ltd_scheduler.state[RANDOM_LTD_SAMPLE_INDEX] + part_attention_mask = self.random_ltd_scheduler.state[RANDOM_LTD_ATTENTION_MASK] - hidden_states, part_hidden_states = GatherTokens.apply(hidden_states, sampled_indices[self.random_ltd_layer_id,:,:], self.batch_first) + hidden_states, part_hidden_states = GatherTokens.apply(hidden_states, + sampled_indices[self.random_ltd_layer_id, :, :], + self.batch_first) if self.mask_name is not None: if self.model_type == 
'encoder': - kwargs[self.mask_name] = part_attention_mask[ - self.random_ltd_layer_id] + kwargs[self.mask_name] = part_attention_mask[self.random_ltd_layer_id] else: kwargs[self.mask_name] = part_attention_mask outputs = self.random_ltd_layer(part_hidden_states, **kwargs) if isinstance(outputs, tuple): - hidden_states = ScatterTokens.apply(hidden_states, outputs[0], sampled_indices[self.random_ltd_layer_id,:,:], self.batch_first) + hidden_states = ScatterTokens.apply(hidden_states, outputs[0], + sampled_indices[self.random_ltd_layer_id, :, :], self.batch_first) my_list = list(outputs) my_list[0] = hidden_states return tuple(my_list) elif isinstance(outputs, Tensor): - hidden_states = ScatterTokens.apply(hidden_states, outputs, sampled_indices[self.random_ltd_layer_id,:,:], self.batch_first) + hidden_states = ScatterTokens.apply(hidden_states, outputs, + sampled_indices[self.random_ltd_layer_id, :, :], self.batch_first) return hidden_states else: - logger.warning( - "************For now, we only support tuple and tensor output. \ - You need to adjust the output according to the layer in your model************" - ) + logger.warning("************For now, we only support tuple and tensor output. \ + You need to adjust the output according to the layer in your model************") raise NotImplementedError else: return self.random_ltd_layer(hidden_states, **kwargs) diff --git a/deepspeed/runtime/data_pipeline/data_routing/helper.py b/deepspeed/runtime/data_pipeline/data_routing/helper.py index 0f3791cf6e57b96246c960880bc33594bcef20ae..150182d77bcfda20b1aa1aabd4b8785542ca9d1b 100644 --- a/deepspeed/runtime/data_pipeline/data_routing/helper.py +++ b/deepspeed/runtime/data_pipeline/data_routing/helper.py @@ -1,6 +1,7 @@ -''' -Copyright 2022 The Microsoft DeepSpeed Team -''' +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team from .basic_layer import RandomLayerTokenDrop from collections import OrderedDict diff --git a/deepspeed/runtime/data_pipeline/data_routing/scheduler.py b/deepspeed/runtime/data_pipeline/data_routing/scheduler.py index db0a7d4bce99c5aedad08035f777c134b4205e52..8a0b031d4f633976f438d5151973e58afa77712e 100644 --- a/deepspeed/runtime/data_pipeline/data_routing/scheduler.py +++ b/deepspeed/runtime/data_pipeline/data_routing/scheduler.py @@ -1,6 +1,7 @@ -''' -Copyright 2022 The Microsoft DeepSpeed Team -''' +# Copyright (c) Microsoft Corporation. 
+# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team import math @@ -12,6 +13,7 @@ from ..constants import * class BaseScheduler(object): + def __init__(self): self.state = {} @@ -19,12 +21,9 @@ class BaseScheduler(object): s_state = self.state[RANDOM_LTD_SCHEDULE_CONFIG] if root_degree is None: root_degree = s_state['root_degree'] - next_seq = (float(global_steps) / - s_state[RANDOM_LTD_REQUIRE_STEP])**(1.0 / root_degree) - next_seq = math.floor( - next_seq * - (self.state[RANDOM_LTD_MAX_VALUE] - self.state[RANDOM_LTD_MIN_VALUE]) + - self.state[RANDOM_LTD_MIN_VALUE]) + next_seq = (float(global_steps) / s_state[RANDOM_LTD_REQUIRE_STEP])**(1.0 / root_degree) + next_seq = math.floor(next_seq * (self.state[RANDOM_LTD_MAX_VALUE] - self.state[RANDOM_LTD_MIN_VALUE]) + + self.state[RANDOM_LTD_MIN_VALUE]) next_seq -= (next_seq % s_state[RANDOM_LTD_INCREASE_STEP]) next_seq = min(next_seq, self.state[RANDOM_LTD_MAX_VALUE]) return next_seq @@ -37,6 +36,7 @@ class BaseScheduler(object): class RandomLTDScheduler(BaseScheduler): + def __init__(self, config): super().__init__() self.model_layer_num = config[RANDOM_LTD_TOTAL_LAYER_NUM] @@ -61,12 +61,9 @@ class RandomLTDScheduler(BaseScheduler): if self.config_schedule is not None: self.state[RANDOM_LTD_MIN_VALUE] = self.config_schedule[RANDOM_LTD_MIN_VALUE] self.state[RANDOM_LTD_MAX_VALUE] = self.config_schedule[RANDOM_LTD_MAX_VALUE] - self.state[RANDOM_LTD_CURRENT_VALUE] = self.config_schedule[ - RANDOM_LTD_MIN_VALUE] - self.state[RANDOM_LTD_SCHEDULE_CONFIG] = self.config_schedule[ - RANDOM_LTD_SCHEDULE_CONFIG] - self.state[RANDOM_LTD_SCHEDULER_TYPE] = self.config_schedule[ - RANDOM_LTD_SCHEDULER_TYPE] + self.state[RANDOM_LTD_CURRENT_VALUE] = self.config_schedule[RANDOM_LTD_MIN_VALUE] + self.state[RANDOM_LTD_SCHEDULE_CONFIG] = self.config_schedule[RANDOM_LTD_SCHEDULE_CONFIG] + self.state[RANDOM_LTD_SCHEDULER_TYPE] = self.config_schedule[RANDOM_LTD_SCHEDULER_TYPE] self.state[RANDOM_LTD_CONSUMED_LAYER_TOKENS] = 0 self.state[RANDOM_LTD_CURR_STEP] = -1 @@ -95,8 +92,7 @@ class RandomLTDScheduler(BaseScheduler): def state_dict(self): return { - RANDOM_LTD_CONSUMED_LAYER_TOKENS: - self.state[RANDOM_LTD_CONSUMED_LAYER_TOKENS], + RANDOM_LTD_CONSUMED_LAYER_TOKENS: self.state[RANDOM_LTD_CONSUMED_LAYER_TOKENS], RANDOM_LTD_CURR_STEP: self.state[RANDOM_LTD_CURR_STEP], RANDOM_LTD_CURRENT_VALUE: self.state[RANDOM_LTD_CURRENT_VALUE], RANDOM_LTD_MIN_VALUE: self.state[RANDOM_LTD_MIN_VALUE], @@ -104,8 +100,7 @@ class RandomLTDScheduler(BaseScheduler): } def load_state_dict(self, state_dict): - self.state[RANDOM_LTD_CONSUMED_LAYER_TOKENS] = state_dict[ - RANDOM_LTD_CONSUMED_LAYER_TOKENS] + self.state[RANDOM_LTD_CONSUMED_LAYER_TOKENS] = state_dict[RANDOM_LTD_CONSUMED_LAYER_TOKENS] self.state[RANDOM_LTD_CURR_STEP] = state_dict[RANDOM_LTD_CURR_STEP] self.state[RANDOM_LTD_CURRENT_VALUE] = state_dict[RANDOM_LTD_CURRENT_VALUE] self.state[RANDOM_LTD_MIN_VALUE] = state_dict[RANDOM_LTD_MIN_VALUE] diff --git a/deepspeed/runtime/data_pipeline/data_routing/utils.py b/deepspeed/runtime/data_pipeline/data_routing/utils.py index 8b7bd501c75f4b8f11b6b991d7835536dd303206..afcfef2ef4dc04279b7fa3ee49916519f33ffbec 100644 --- a/deepspeed/runtime/data_pipeline/data_routing/utils.py +++ b/deepspeed/runtime/data_pipeline/data_routing/utils.py @@ -1,6 +1,7 @@ -''' -Copyright 2022 The Microsoft DeepSpeed Team -''' +# Copyright (c) Microsoft Corporation. 
+# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team import torch @@ -10,8 +11,7 @@ def bsh_decoder_gather(reserved_length, hidden_states, mask): rand_list = [] part_hidden_states = [] # batch, seq, hidden ## different from megatron for k in range(hidden_states.size(0)): - B_tmp = torch.randperm(hidden_states.size(1), - device=hidden_states.device)[:reserved_length] + B_tmp = torch.randperm(hidden_states.size(1), device=hidden_states.device)[:reserved_length] B = B_tmp.sort()[0] rand_list.append(B) part_hidden_states.append(hidden_states[k:k + 1, B, :]) diff --git a/deepspeed/runtime/data_pipeline/data_sampling/__init__.py b/deepspeed/runtime/data_pipeline/data_sampling/__init__.py index fcb45ab2b68516814a4bfbffebf2e01cbfefd527..6c5067f71c8faf166bc78e88f9b62e8627dda7c7 100644 --- a/deepspeed/runtime/data_pipeline/data_sampling/__init__.py +++ b/deepspeed/runtime/data_pipeline/data_sampling/__init__.py @@ -1 +1,5 @@ +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team '''Copyright The Microsoft DeepSpeed Team''' diff --git a/deepspeed/runtime/data_pipeline/data_sampling/data_analyzer.py b/deepspeed/runtime/data_pipeline/data_sampling/data_analyzer.py index efd198f1fb041c307afdde0712f79c53da1a5533..556a6fd1ddcacf1623a6aff67bd7d8d7f32e7fc4 100644 --- a/deepspeed/runtime/data_pipeline/data_sampling/data_analyzer.py +++ b/deepspeed/runtime/data_pipeline/data_sampling/data_analyzer.py @@ -1,6 +1,8 @@ -''' -Copyright 2022 The Microsoft DeepSpeed Team -''' +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + import os from collections import defaultdict import csv @@ -16,6 +18,7 @@ from .utils import split_dataset, split_index, create_mmap_dataset_builder, clos class DataAnalyzer(object): + def __init__(self, dataset, num_workers=1, @@ -53,25 +56,19 @@ class DataAnalyzer(object): self.custom_map_finalize = custom_map_finalize self.custom_reduce = custom_reduce - def init_metric_results(self, - thread_id, - metric_names, - metric_types, - metric_dtypes, - save_path, - worker_id): + def init_metric_results(self, thread_id, metric_names, metric_types, metric_dtypes, save_path, worker_id): metric_results = [] for m_idx in range(len(metric_names)): metric_name, metric_type, metric_dtype = metric_names[m_idx], \ metric_types[m_idx], metric_dtypes[m_idx] - assert metric_dtype not in [np.float64, np.double], "Currently floating point metric values are not supported. Please change your metric into integer values (and potentially multiply a larger coefficient to keep the precision)." + assert metric_dtype not in [ + np.float64, np.double + ], "Currently floating point metric values are not supported. Please change your metric into integer values (and potentially multiply a larger coefficient to keep the precision)." 
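The reflowed assert above rules out floating-point metric dtypes, so the metric_functions fed to DataAnalyzer are expected to emit one integer value per sample. A minimal sketch of such a metric follows; the 'input_ids' batch field and padding id 0 are assumptions for illustration, not part of this diff.

import numpy as np
import torch

def seqlen_metric(data):
    # One integer value per sample: the count of non-padding tokens.
    # 'input_ids' is an assumed field of the collated batch; 0 is assumed to be the pad id.
    return torch.count_nonzero(data['input_ids'], dim=1).to(torch.long)

metric_names = ['seqlen']
metric_functions = [seqlen_metric]
metric_types = ['single_value_per_sample']  # literal handled by init/update_metric_results above
metric_dtypes = [np.int64]                  # any integer dtype satisfies the assert above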
metric_save_path = f"{save_path}/{metric_name}/worker{worker_id}_thread{thread_id}/" os.makedirs(metric_save_path, exist_ok=True) if metric_type == 'single_value_per_sample': sample_to_metric_fname = f"{metric_save_path}/{metric_name}_sample_to_metric" - sample_to_metric_builder = create_mmap_dataset_builder( - sample_to_metric_fname, - metric_dtype) + sample_to_metric_builder = create_mmap_dataset_builder(sample_to_metric_fname, metric_dtype) metric_to_sample_fname = f"{metric_save_path}/{metric_name}_metric_to_sample" os.system(f"rm -rf {metric_to_sample_fname}*") metric_to_sample_dict = defaultdict(list) @@ -84,34 +81,25 @@ class DataAnalyzer(object): elif metric_type == 'accumulate_value_over_samples': metric_value = None metric_value_fname = f"{metric_save_path}/{metric_name}_metric_value" - metric_results.append({ - "metric_value": metric_value, - "metric_value_fname": metric_value_fname - }) + metric_results.append({"metric_value": metric_value, "metric_value_fname": metric_value_fname}) return metric_results - def update_metric_results(self, - data, - metric_types, - metric_functions, - metric_results): + def update_metric_results(self, data, metric_types, metric_functions, metric_results): for m_idx in range(len(metric_types)): metric_type, metric_function, metric_result = metric_types[m_idx], \ metric_functions[m_idx], metric_results[m_idx] if metric_type == 'single_value_per_sample': metric_values = metric_function(data) for row in range(metric_values.size()[0]): - metric_result["sample_to_metric_builder"].add_item( - metric_values[row].reshape(-1)) - metric_result["metric_to_sample_dict"][ - metric_values[row].item()].append(data['index'][row][0].item()) + metric_result["sample_to_metric_builder"].add_item(metric_values[row].reshape(-1)) + metric_result["metric_to_sample_dict"][metric_values[row].item()].append( + data['index'][row][0].item()) for m_value in metric_result["metric_to_sample_dict"]: if len(metric_result["metric_to_sample_dict"][m_value]) > 100: metric_fname = metric_result["metric_to_sample_fname"] with open(f"{metric_fname}_{m_value}.csv", 'a') as f: writer = csv.writer(f) - writer.writerows( - [metric_result["metric_to_sample_dict"][m_value]]) + writer.writerows([metric_result["metric_to_sample_dict"][m_value]]) metric_result["metric_to_sample_dict"][m_value] = [] elif metric_type == 'accumulate_value_over_samples': metric_values = metric_function(data) @@ -126,25 +114,20 @@ class DataAnalyzer(object): metric_dtypes[m_idx], metric_results[m_idx] if metric_type == 'single_value_per_sample': metric_fname = metric_result["sample_to_metric_fname"] - close_mmap_dataset_builder(metric_result["sample_to_metric_builder"], - metric_fname) + close_mmap_dataset_builder(metric_result["sample_to_metric_builder"], metric_fname) for m_value in metric_result["metric_to_sample_dict"]: if len(metric_result["metric_to_sample_dict"][m_value]) > 0: metric_fname = metric_result["metric_to_sample_fname"] with open(f"{metric_fname}_{m_value}.csv", 'a') as f: writer = csv.writer(f) - writer.writerows( - [metric_result["metric_to_sample_dict"][m_value]]) + writer.writerows([metric_result["metric_to_sample_dict"][m_value]]) metric_result["metric_to_sample_dict"][m_value] = [] elif metric_type == 'accumulate_value_over_samples': if metric_result["metric_value"] is not None: - metric_value_builder = create_mmap_dataset_builder( - metric_result["metric_value_fname"], - metric_dtype) - metric_value_builder.add_item( - metric_result["metric_value"].reshape(-1)) - 
close_mmap_dataset_builder(metric_value_builder, - metric_result["metric_value_fname"]) + metric_value_builder = create_mmap_dataset_builder(metric_result["metric_value_fname"], + metric_dtype) + metric_value_builder.add_item(metric_result["metric_value"].reshape(-1)) + close_mmap_dataset_builder(metric_value_builder, metric_result["metric_value_fname"]) def run_map_helper(self, thread_id): start_idx, end_idx = self.thread_splits[thread_id][0], \ @@ -152,15 +135,9 @@ class DataAnalyzer(object): logger.info(f"worker {self.worker_id} thread {thread_id}: start working " \ f"on data subset {start_idx} to {end_idx}") thread_dataset = Subset(self.dataset, list(range(start_idx, end_idx))) - sampler = BatchSampler(SequentialSampler(thread_dataset), - batch_size=self.batch_size, - drop_last=False) + sampler = BatchSampler(SequentialSampler(thread_dataset), batch_size=self.batch_size, drop_last=False) if self.collate_fn is None: - iterator = iter( - DataLoader(thread_dataset, - batch_sampler=sampler, - num_workers=0, - pin_memory=False)) + iterator = iter(DataLoader(thread_dataset, batch_sampler=sampler, num_workers=0, pin_memory=False)) else: iterator = iter( DataLoader(thread_dataset, @@ -169,19 +146,11 @@ class DataAnalyzer(object): collate_fn=self.collate_fn, pin_memory=False)) if self.custom_map_init is None: - metric_results = self.init_metric_results(thread_id, - self.metric_names, - self.metric_types, - self.metric_dtypes, - self.save_path, - self.worker_id) + metric_results = self.init_metric_results(thread_id, self.metric_names, self.metric_types, + self.metric_dtypes, self.save_path, self.worker_id) else: - metric_results = self.custom_map_init(thread_id, - self.metric_names, - self.metric_types, - self.metric_dtypes, - self.save_path, - self.worker_id) + metric_results = self.custom_map_init(thread_id, self.metric_names, self.metric_types, self.metric_dtypes, + self.save_path, self.worker_id) total_sample = len(thread_dataset) processed_sample = 0 start = time.time() @@ -189,15 +158,9 @@ class DataAnalyzer(object): try: data = next(iterator) if self.custom_map_update is None: - self.update_metric_results(data, - self.metric_types, - self.metric_functions, - metric_results) + self.update_metric_results(data, self.metric_types, self.metric_functions, metric_results) else: - self.custom_map_update(data, - self.metric_types, - self.metric_functions, - metric_results) + self.custom_map_update(data, self.metric_types, self.metric_functions, metric_results) processed_sample += self.batch_size duration = (time.time() - start) / 3600.0 remain_duration = duration * total_sample / processed_sample - duration @@ -206,22 +169,17 @@ class DataAnalyzer(object): f"out of {total_sample} processed in {duration:.2f} hr, " \ f"estimated to finish in {remain_duration:.2f} hr") except StopIteration: - logger.info( - f"worker {self.worker_id} thread {thread_id}: reach end of file") + logger.info(f"worker {self.worker_id} thread {thread_id}: reach end of file") break if self.custom_map_finalize is None: - self.finalize_metric_results(self.metric_types, - self.metric_dtypes, - metric_results) + self.finalize_metric_results(self.metric_types, self.metric_dtypes, metric_results) else: - self.custom_map_finalize(self.metric_types, - self.metric_dtypes, - metric_results) + self.custom_map_finalize(self.metric_types, self.metric_dtypes, metric_results) logger.info(f"worker {self.worker_id} thread {thread_id}: finished") def run_map(self): - self.worker_splits, self.thread_splits = split_dataset(self.dataset, - 
self.num_workers, self.worker_id, self.num_threads) + self.worker_splits, self.thread_splits = split_dataset(self.dataset, self.num_workers, self.worker_id, + self.num_threads) if len(self.specific_threads) > 0: threads_to_run = self.specific_threads else: @@ -238,81 +196,50 @@ class DataAnalyzer(object): assert self.num_threads == 1 self.run_map_helper(0) - def get_metric_value_percentiles(self, - metric_name, - num_sample_per_value, - total_num_samples): + def get_metric_value_percentiles(self, metric_name, num_sample_per_value, total_num_samples): logger.info(f"Checking the value percentiles of metric {metric_name}...") processed_samples = 0 current_percentile = 5 for key in sorted(num_sample_per_value.keys()): processed_samples += num_sample_per_value[key] if processed_samples >= total_num_samples * current_percentile / 100.0: - logger.info( - f"Metric {metric_name} {current_percentile}th percentile: {key}") + logger.info(f"Metric {metric_name} {current_percentile}th percentile: {key}") current_percentile += 5 - def merge_gather_map_stats(self, - num_workers, - num_threads, - num_threads_reduce, - t_idx_reduce, - metric_save_path, - metric_name, - return_dict): + def merge_gather_map_stats(self, num_workers, num_threads, num_threads_reduce, t_idx_reduce, metric_save_path, + metric_name, return_dict): results = [] for w_idx in range(num_workers): for t_idx in range(num_threads): if (w_idx * num_threads + t_idx) % num_threads_reduce == t_idx_reduce: w_metric_save_path = f"{metric_save_path}/worker{w_idx}_thread{t_idx}/" w_sample_to_metric_fname = f"{w_metric_save_path}/{metric_name}_sample_to_metric" - w_sample_to_metric = MMapIndexedDataset(w_sample_to_metric_fname, - skip_warmup=True) + w_sample_to_metric = MMapIndexedDataset(w_sample_to_metric_fname, skip_warmup=True) unique_v = list(np.unique(w_sample_to_metric)) sample_to_metric_count = len(w_sample_to_metric) - logger.info( - f"Finished gathering map stats from worker {w_idx} thread {t_idx}." 
- ) + logger.info(f"Finished gathering map stats from worker {w_idx} thread {t_idx}.") results.append([unique_v, sample_to_metric_count]) return_dict[t_idx_reduce] = results - def merge_sample_to_metric(self, - t_idx_reduce, - metric_save_path, - metric_name, - metric_value_dtype, + def merge_sample_to_metric(self, t_idx_reduce, metric_save_path, metric_name, metric_value_dtype, map_worker_thread): sample_to_metric_fname = f"{metric_save_path}/{metric_name}_sample_to_metric_thread{t_idx_reduce}" - sample_to_metric_builder = create_mmap_dataset_builder( - sample_to_metric_fname, - metric_value_dtype) + sample_to_metric_builder = create_mmap_dataset_builder(sample_to_metric_fname, metric_value_dtype) for w_t in map_worker_thread: w_metric_save_path = f"{metric_save_path}/worker{w_t[0]}_thread{w_t[1]}/" w_sample_to_metric_fname = f"{w_metric_save_path}/{metric_name}_sample_to_metric" w_data = MMapIndexedDataset(w_sample_to_metric_fname, skip_warmup=True) for row in range(len(w_data)): - sample_to_metric_builder.add_item( - torch.tensor(w_data[row].astype(np.int64), - dtype=torch.long)) - logger.info( - f"Finished merge_sample_to_metric from worker {w_t[0]} thread {w_t[1]}.") + sample_to_metric_builder.add_item(torch.tensor(w_data[row].astype(np.int64), dtype=torch.long)) + logger.info(f"Finished merge_sample_to_metric from worker {w_t[0]} thread {w_t[1]}.") close_mmap_dataset_builder(sample_to_metric_builder, sample_to_metric_fname) - def merge_metric_to_sample(self, - t_idx_reduce, - metric_save_path, - metric_name, - sample_idx_dtype, - metric_value_dtype, - unique_metric_values, - num_workers, - num_threads): + def merge_metric_to_sample(self, t_idx_reduce, metric_save_path, metric_name, sample_idx_dtype, metric_value_dtype, + unique_metric_values, num_workers, num_threads): index_to_sample_fname = f"{metric_save_path}/{metric_name}_index_to_sample_thread{t_idx_reduce}" - index_to_sample_builder = create_mmap_dataset_builder(index_to_sample_fname, - sample_idx_dtype) + index_to_sample_builder = create_mmap_dataset_builder(index_to_sample_fname, sample_idx_dtype) index_to_metric_fname = f"{metric_save_path}/{metric_name}_index_to_metric_thread{t_idx_reduce}" - index_to_metric_builder = create_mmap_dataset_builder(index_to_metric_fname, - metric_value_dtype) + index_to_metric_builder = create_mmap_dataset_builder(index_to_metric_fname, metric_value_dtype) for unique_v in unique_metric_values: samples = [] for w_idx in range(num_workers): @@ -330,13 +257,7 @@ class DataAnalyzer(object): close_mmap_dataset_builder(index_to_sample_builder, index_to_sample_fname) close_mmap_dataset_builder(index_to_metric_builder, index_to_metric_fname) - def merge_map_results(self, - dataset, - metric_names, - metric_types, - save_path, - num_workers, - num_threads, + def merge_map_results(self, dataset, metric_names, metric_types, save_path, num_workers, num_threads, num_threads_reduce): total_num_samples = len(dataset) sample_idx_dtype = find_fit_int_dtype(0, total_num_samples - 1) @@ -385,9 +306,7 @@ class DataAnalyzer(object): for w_idx in range(num_workers): for t_idx in range(num_threads): map_worker_thread.append([w_idx, t_idx]) - thread_splits = split_index(0, - len(map_worker_thread), - num_threads_reduce) + thread_splits = split_index(0, len(map_worker_thread), num_threads_reduce) p = [] for t_idx_reduce in range(num_threads_reduce): start_idx, end_idx = thread_splits[t_idx_reduce][0], thread_splits[t_idx_reduce][1] @@ -405,24 +324,18 @@ class DataAnalyzer(object): p[t_idx_reduce].join() 
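The merge helpers being reformatted here serve a two-phase map/reduce flow: run_map writes per-worker, per-thread mmap shards, and run_reduce merges them into the sample_to_metric / index_to_sample / index_to_metric files. A rough driver sketch is below; the keyword arguments mirror the attribute names used in this diff, but the exact constructor signature and the dataset are assumptions.

import numpy as np
from deepspeed.runtime.data_pipeline.data_sampling.data_analyzer import DataAnalyzer

# Assumed user-provided map-style dataset whose collated batches also carry an
# 'index' field, which update_metric_results above reads via data['index'].
dataset = ...

analyzer = DataAnalyzer(dataset,
                        num_workers=4,        # total map workers across the job
                        worker_id=0,          # this worker's id
                        num_threads=2,
                        num_threads_reduce=8,
                        batch_size=64,
                        metric_names=['seqlen'],
                        metric_functions=[seqlen_metric],  # e.g. the integer metric sketched earlier
                        metric_types=['single_value_per_sample'],
                        metric_dtypes=[np.int64],
                        save_path='./data_analysis')

analyzer.run_map()     # each worker/thread writes worker{i}_thread{j} shards under save_path
# After every worker's run_map() has completed, a single process merges the shards:
analyzer.run_reduce()  # builds sample_to_metric, index_to_sample and index_to_metric files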
sample_to_metric_fname = f"{metric_save_path}/{metric_name}_sample_to_metric" - sample_to_metric_builder = create_mmap_dataset_builder( - sample_to_metric_fname, - metric_value_dtype) + sample_to_metric_builder = create_mmap_dataset_builder(sample_to_metric_fname, metric_value_dtype) for t_idx_reduce in range(num_threads_reduce): chunk_fname = f"{metric_save_path}/{metric_name}_sample_to_metric_thread{t_idx_reduce}" logger.info(f"Merging file {chunk_fname}") sample_to_metric_builder.merge_file_(chunk_fname) - close_mmap_dataset_builder(sample_to_metric_builder, - sample_to_metric_fname) - sample_to_metric = MMapIndexedDataset(sample_to_metric_fname, - skip_warmup=True) + close_mmap_dataset_builder(sample_to_metric_builder, sample_to_metric_fname) + sample_to_metric = MMapIndexedDataset(sample_to_metric_fname, skip_warmup=True) assert len(sample_to_metric) == total_num_samples # metric_to_sample unique_metric_values = list(sorted(unique_metric_values)) - thread_splits = split_index(0, - len(unique_metric_values), - num_threads_reduce) + thread_splits = split_index(0, len(unique_metric_values), num_threads_reduce) p = [] for t_idx_reduce in range(num_threads_reduce): start_idx, end_idx = thread_splits[t_idx_reduce][0], thread_splits[t_idx_reduce][1] @@ -442,13 +355,9 @@ class DataAnalyzer(object): for t_idx_reduce in range(num_threads_reduce): p[t_idx_reduce].join() index_to_sample_fname = f"{metric_save_path}/{metric_name}_index_to_sample" - index_to_sample_builder = create_mmap_dataset_builder( - index_to_sample_fname, - sample_idx_dtype) + index_to_sample_builder = create_mmap_dataset_builder(index_to_sample_fname, sample_idx_dtype) index_to_metric_fname = f"{metric_save_path}/{metric_name}_index_to_metric" - index_to_metric_builder = create_mmap_dataset_builder( - index_to_metric_fname, - metric_value_dtype) + index_to_metric_builder = create_mmap_dataset_builder(index_to_metric_fname, metric_value_dtype) for t_idx_reduce in range(num_threads_reduce): chunk_is_fname = f"{metric_save_path}/{metric_name}_index_to_sample_thread{t_idx_reduce}" logger.info(f"Merging file {chunk_is_fname}") @@ -456,43 +365,29 @@ class DataAnalyzer(object): chunk_im_fname = f"{metric_save_path}/{metric_name}_index_to_metric_thread{t_idx_reduce}" logger.info(f"Merging file {chunk_im_fname}") index_to_metric_builder.merge_file_(chunk_im_fname) - close_mmap_dataset_builder(index_to_sample_builder, - index_to_sample_fname) - close_mmap_dataset_builder(index_to_metric_builder, - index_to_metric_fname) + close_mmap_dataset_builder(index_to_sample_builder, index_to_sample_fname) + close_mmap_dataset_builder(index_to_metric_builder, index_to_metric_fname) num_sample_per_value = {} - index_to_sample = MMapIndexedDataset(index_to_sample_fname, - skip_warmup=True) - index_to_metric = MMapIndexedDataset(index_to_metric_fname, - skip_warmup=True) + index_to_sample = MMapIndexedDataset(index_to_sample_fname, skip_warmup=True) + index_to_metric = MMapIndexedDataset(index_to_metric_fname, skip_warmup=True) index_to_sample_merged_fname = f"{metric_save_path}/{metric_name}_index_to_sample_percentile_merged" - index_to_sample_merged_builder = create_mmap_dataset_builder( - index_to_sample_merged_fname, - sample_idx_dtype) + index_to_sample_merged_builder = create_mmap_dataset_builder(index_to_sample_merged_fname, + sample_idx_dtype) for v_idx in range(len(index_to_sample)): if v_idx > 0: assert index_to_metric[v_idx] > index_to_metric[v_idx - 1] - num_sample_per_value[index_to_metric[v_idx][0]] = len( - index_to_sample[v_idx]) + 
num_sample_per_value[index_to_metric[v_idx][0]] = len(index_to_sample[v_idx]) assert sum(num_sample_per_value.values()) == total_num_samples merge_step = len(index_to_sample) // 100 for v_idx in range(0, len(index_to_sample), merge_step): merged_samples = np.copy( - np.concatenate( - index_to_sample[v_idx:min(len(index_to_sample), - (v_idx + merge_step))], - axis=None)) + np.concatenate(index_to_sample[v_idx:min(len(index_to_sample), (v_idx + merge_step))], + axis=None)) index_to_sample_merged_builder.add_item( - torch.tensor(merged_samples.astype(np.int64), - dtype=torch.long)) - logger.info( - f"Finished merging index_to_sample {v_idx} to {v_idx+merge_step}." - ) - close_mmap_dataset_builder(index_to_sample_merged_builder, - index_to_sample_merged_fname) - self.get_metric_value_percentiles(metric_name, - num_sample_per_value, - total_num_samples) + torch.tensor(merged_samples.astype(np.int64), dtype=torch.long)) + logger.info(f"Finished merging index_to_sample {v_idx} to {v_idx+merge_step}.") + close_mmap_dataset_builder(index_to_sample_merged_builder, index_to_sample_merged_fname) + self.get_metric_value_percentiles(metric_name, num_sample_per_value, total_num_samples) elif metric_type == 'accumulate_value_over_samples': metric_save_path = f"{save_path}/{metric_name}/" metric_value = None @@ -500,8 +395,7 @@ class DataAnalyzer(object): for t_idx in range(num_threads): w_metric_save_path = f"{metric_save_path}/worker{w_idx}_thread{t_idx}/" w_metric_value_fname = f"{w_metric_save_path}/{metric_name}_metric_value" - w_metric_value = MMapIndexedDataset(w_metric_value_fname, - skip_warmup=True) + w_metric_value = MMapIndexedDataset(w_metric_value_fname, skip_warmup=True) if metric_value is None: metric_value = np.copy(w_metric_value[0]) else: @@ -510,28 +404,14 @@ class DataAnalyzer(object): value_min = int(min(metric_value)) metric_value_dtype = find_fit_int_dtype(value_min, value_max) metric_value_fname = f"{metric_save_path}/{metric_name}_metric_value" - metric_value_builder = create_mmap_dataset_builder( - metric_value_fname, - metric_value_dtype) - metric_value_builder.add_item( - torch.tensor(metric_value.astype(np.int64), - dtype=torch.long)) + metric_value_builder = create_mmap_dataset_builder(metric_value_fname, metric_value_dtype) + metric_value_builder.add_item(torch.tensor(metric_value.astype(np.int64), dtype=torch.long)) close_mmap_dataset_builder(metric_value_builder, metric_value_fname) def run_reduce(self): if self.custom_reduce is None: - self.merge_map_results(self.dataset, - self.metric_names, - self.metric_types, - self.save_path, - self.num_workers, - self.num_threads, - self.num_threads_reduce) + self.merge_map_results(self.dataset, self.metric_names, self.metric_types, self.save_path, + self.num_workers, self.num_threads, self.num_threads_reduce) else: - self.custom_reduce(self.dataset, - self.metric_names, - self.metric_types, - self.save_path, - self.num_workers, - self.num_threads, - self.num_threads_reduce) + self.custom_reduce(self.dataset, self.metric_names, self.metric_types, self.save_path, self.num_workers, + self.num_threads, self.num_threads_reduce) diff --git a/deepspeed/runtime/data_pipeline/data_sampling/data_sampler.py b/deepspeed/runtime/data_pipeline/data_sampling/data_sampler.py index 24dbcfa93a0beac651dda165ac2c823c20eb5a05..ef845e4bc4907f746faa06665cf752088c111e8c 100644 --- a/deepspeed/runtime/data_pipeline/data_sampling/data_sampler.py +++ b/deepspeed/runtime/data_pipeline/data_sampling/data_sampler.py @@ -1,21 +1,24 @@ -''' -Copyright 2022 The 
Microsoft DeepSpeed Team +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team +""" +coding=utf-8 + Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. Part of this code was adopted from https://github.com/NVIDIA/Megatron-LM/blob/main/megatron/data/data_samplers.py -''' -# coding=utf-8 -# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. +""" import torch import os @@ -31,6 +34,7 @@ from .utils import create_mmap_dataset_builder, close_mmap_dataset_builder, find class DeepSpeedDataSampler(object): + def __init__(self, data_efficiency_config, one_epoch_total_samples, @@ -45,8 +49,8 @@ class DeepSpeedDataSampler(object): self.data_efficiency_config = data_efficiency_config self.one_epoch_total_samples = one_epoch_total_samples self.index_dtype = find_fit_int_dtype(0, one_epoch_total_samples) - self.total_samples = one_epoch_total_samples * self.data_efficiency_config[ - DATA_SAMPLING][DATA_SAMPLING_NUM_EPOCHS] + self.total_samples = one_epoch_total_samples * self.data_efficiency_config[DATA_SAMPLING][ + DATA_SAMPLING_NUM_EPOCHS] self.micro_batch_size = micro_batch_size self.data_parallel_rank = data_parallel_rank self.data_parallel_group = data_parallel_group @@ -57,13 +61,11 @@ class DeepSpeedDataSampler(object): self.gradient_accumulation_steps self.global_rank = global_rank self.drop_last = drop_last - self.np_rng = np.random.default_rng( - self.data_efficiency_config[DATA_EFFICIENCY_SEED]) + self.np_rng = np.random.default_rng(self.data_efficiency_config[DATA_EFFICIENCY_SEED]) self.state = {} self.batch = [] self.consumed_samples = 0 - if self.data_efficiency_config[DATA_SAMPLING][CURRICULUM_LEARNING][ - CURRICULUM_LEARNING_ENABLED]: + if self.data_efficiency_config[DATA_SAMPLING][CURRICULUM_LEARNING][CURRICULUM_LEARNING_ENABLED]: self.curriculum_step = 0 self.current_difficulties = {} self.data_cluster_paths = [] @@ -77,33 +79,26 @@ class DeepSpeedDataSampler(object): if self.global_rank == 0: self.data_clusters = [] self.data_cluster_sizes = [] - cluster_path = self.data_efficiency_config[DATA_SAMPLING][ - CURRICULUM_LEARNING][CURRICULUM_LEARNING_CLUSTER_PATH] + cluster_path = self.data_efficiency_config[DATA_SAMPLING][CURRICULUM_LEARNING][ + CURRICULUM_LEARNING_CLUSTER_PATH] if not os.path.exists(cluster_path): os.makedirs(cluster_path) - for metric in self.data_efficiency_config[DATA_SAMPLING][ - 
CURRICULUM_LEARNING][CURRICULUM_LEARNING_METRICS]: + for metric in self.data_efficiency_config[DATA_SAMPLING][CURRICULUM_LEARNING][CURRICULUM_LEARNING_METRICS]: self.curriculum_schedulers[metric] = CurriculumScheduler( - data_efficiency_config[DATA_SAMPLING][CURRICULUM_LEARNING] - [CURRICULUM_LEARNING_METRICS][metric]) - self.difficulty_type[metric] = data_efficiency_config[DATA_SAMPLING][ - CURRICULUM_LEARNING][CURRICULUM_LEARNING_METRICS][metric][ - CURRICULUM_LEARNING_DIFFICULTY_TYPE] - self.clustering_type[metric] = data_efficiency_config[DATA_SAMPLING][ - CURRICULUM_LEARNING][CURRICULUM_LEARNING_METRICS][metric][ - CURRICULUM_LEARNING_CLUSTERING_TYPE] + data_efficiency_config[DATA_SAMPLING][CURRICULUM_LEARNING][CURRICULUM_LEARNING_METRICS][metric]) + self.difficulty_type[metric] = data_efficiency_config[DATA_SAMPLING][CURRICULUM_LEARNING][ + CURRICULUM_LEARNING_METRICS][metric][CURRICULUM_LEARNING_DIFFICULTY_TYPE] + self.clustering_type[metric] = data_efficiency_config[DATA_SAMPLING][CURRICULUM_LEARNING][ + CURRICULUM_LEARNING_METRICS][metric][CURRICULUM_LEARNING_CLUSTERING_TYPE] if self.global_rank == 0: if self.clustering_type[metric] != CURRICULUM_LEARNING_SINGLE_CLUSTER: self.curriculum_index_to_sample[metric] = MMapIndexedDataset( - data_efficiency_config[DATA_SAMPLING][CURRICULUM_LEARNING] - [CURRICULUM_LEARNING_METRICS][metric] - [CURRICULUM_LEARNING_SAMPLE_PATH], + data_efficiency_config[DATA_SAMPLING][CURRICULUM_LEARNING][CURRICULUM_LEARNING_METRICS] + [metric][CURRICULUM_LEARNING_SAMPLE_PATH], skip_warmup=True) - if self.difficulty_type[ - metric] == CURRICULUM_LEARNING_VALUE_BASED: + if self.difficulty_type[metric] == CURRICULUM_LEARNING_VALUE_BASED: self.curriculum_index_to_metric[metric] = MMapIndexedDataset( - data_efficiency_config[DATA_SAMPLING] - [CURRICULUM_LEARNING][CURRICULUM_LEARNING_METRICS] + data_efficiency_config[DATA_SAMPLING][CURRICULUM_LEARNING][CURRICULUM_LEARNING_METRICS] [metric][CURRICULUM_LEARNING_METRIC_PATH], skip_warmup=True) @@ -122,8 +117,7 @@ class DeepSpeedDataSampler(object): def set_custom_curriculum_learning_schedule(self, schedule_func_dict): for metric in self.curriculum_schedulers: if metric in schedule_func_dict: - self.curriculum_schedulers[metric].set_custom_get_difficulty( - schedule_func_dict[metric]) + self.curriculum_schedulers[metric].set_custom_get_difficulty(schedule_func_dict[metric]) def get_start_end_idx(self): start_idx = self.data_parallel_rank * self.micro_batch_size @@ -133,26 +127,19 @@ class DeepSpeedDataSampler(object): def get_sample_based_on_metric_value(self, metric, value_start, value_end): new_samples = None for row in range(len(self.curriculum_index_to_sample[metric])): - if self.curriculum_index_to_metric[metric][ - row] <= value_end and self.curriculum_index_to_metric[metric][ - row] > value_start: + if self.curriculum_index_to_metric[metric][row] <= value_end and self.curriculum_index_to_metric[metric][ + row] > value_start: row_samples = np.copy(self.curriculum_index_to_sample[metric][row]) new_samples = row_samples if new_samples is None else np.concatenate( - (new_samples, - row_samples), - axis=None) + (new_samples, row_samples), axis=None) return new_samples - def get_sample_based_on_metric_percentile(self, - metric, - percentile_start, - percentile_end): + def get_sample_based_on_metric_percentile(self, metric, percentile_start, percentile_end): new_samples = None if self.data_1epoch_size is None: - self.data_1epoch_size = sum( - len(x) for x in self.curriculum_index_to_sample[metric]) - max_percentile = 
self.data_efficiency_config[DATA_SAMPLING][CURRICULUM_LEARNING][ - CURRICULUM_LEARNING_METRICS][metric][CURRICULUM_LEARNING_MAX_DIFFICULTY] + self.data_1epoch_size = sum(len(x) for x in self.curriculum_index_to_sample[metric]) + max_percentile = self.data_efficiency_config[DATA_SAMPLING][CURRICULUM_LEARNING][CURRICULUM_LEARNING_METRICS][ + metric][CURRICULUM_LEARNING_MAX_DIFFICULTY] sample_per_percentile = self.data_1epoch_size // max_percentile start_count = sample_per_percentile * percentile_start end_count = sample_per_percentile * percentile_end @@ -167,12 +154,9 @@ class DeepSpeedDataSampler(object): row_end = row_size else: row_end = end_count - current_count - row_samples = np.copy( - self.curriculum_index_to_sample[metric][row][row_start:row_end]) + row_samples = np.copy(self.curriculum_index_to_sample[metric][row][row_start:row_end]) new_samples = row_samples if new_samples is None else np.concatenate( - (new_samples, - row_samples), - axis=None) + (new_samples, row_samples), axis=None) current_count += row_size if current_count >= end_count: break @@ -193,63 +177,42 @@ class DeepSpeedDataSampler(object): need_clustering += 1 if need_clustering > 1: for metric in self.curriculum_schedulers: - if self.clustering_type[ - metric] == CURRICULUM_LEARNING_SINGLE_CLUSTER: + if self.clustering_type[metric] == CURRICULUM_LEARNING_SINGLE_CLUSTER: metric_cluster = np.arange(start=0, stop=self.one_epoch_total_samples, step=1, dtype=self.index_dtype) else: - if self.difficulty_type[ - metric] == CURRICULUM_LEARNING_VALUE_BASED: - metric_cluster = self.get_sample_based_on_metric_value( - metric, - float('-inf'), - self.current_difficulties[metric]) - elif self.difficulty_type[ - metric] == CURRICULUM_LEARNING_PERCENTILE_BASED: + if self.difficulty_type[metric] == CURRICULUM_LEARNING_VALUE_BASED: + metric_cluster = self.get_sample_based_on_metric_value(metric, float('-inf'), + self.current_difficulties[metric]) + elif self.difficulty_type[metric] == CURRICULUM_LEARNING_PERCENTILE_BASED: metric_cluster = self.get_sample_based_on_metric_percentile( - metric, - 0, - self.current_difficulties[metric]) + metric, 0, self.current_difficulties[metric]) new_cluster = metric_cluster if new_cluster is None else \ np.intersect1d(new_cluster, metric_cluster, assume_unique=True) for cluster in self.data_clusters: - new_cluster = np.setdiff1d(new_cluster, - cluster[0], - assume_unique=True) + new_cluster = np.setdiff1d(new_cluster, cluster[0], assume_unique=True) else: if len(self.data_clusters) == 0: - new_cluster = np.arange(start=0, - stop=self.one_epoch_total_samples, - step=1, - dtype=self.index_dtype) + new_cluster = np.arange(start=0, stop=self.one_epoch_total_samples, step=1, dtype=self.index_dtype) for metric in self.curriculum_schedulers: if self.clustering_type[metric] != CURRICULUM_LEARNING_SINGLE_CLUSTER: - if self.difficulty_type[ - metric] == CURRICULUM_LEARNING_VALUE_BASED: - new_cluster = self.get_sample_based_on_metric_value( - metric, - previous_difficulties[metric], - self.current_difficulties[metric]) - elif self.difficulty_type[ - metric] == CURRICULUM_LEARNING_PERCENTILE_BASED: + if self.difficulty_type[metric] == CURRICULUM_LEARNING_VALUE_BASED: + new_cluster = self.get_sample_based_on_metric_value(metric, previous_difficulties[metric], + self.current_difficulties[metric]) + elif self.difficulty_type[metric] == CURRICULUM_LEARNING_PERCENTILE_BASED: new_cluster = self.get_sample_based_on_metric_percentile( - metric, - previous_difficulties[metric], - self.current_difficulties[metric]) + 
metric, previous_difficulties[metric], self.current_difficulties[metric]) if new_cluster is not None and len(new_cluster) > 0: logger.info( f"new data cluster (previous_difficulties {previous_difficulties}, current_difficulties {self.current_difficulties}) with size {len(new_cluster)} generated." ) self.np_rng.shuffle(new_cluster) - cluster_builder = create_mmap_dataset_builder(cluster_path, - self.index_dtype) + cluster_builder = create_mmap_dataset_builder(cluster_path, self.index_dtype) cluster_builder.add_item_numpy(new_cluster) close_mmap_dataset_builder(cluster_builder, cluster_path) - self.data_clusters.append( - MMapIndexedDataset(cluster_path, - skip_warmup=True)) + self.data_clusters.append(MMapIndexedDataset(cluster_path, skip_warmup=True)) self.data_cluster_sizes.append(len(self.data_clusters[-1][0])) else: logger.info( @@ -264,10 +227,7 @@ class DeepSpeedDataSampler(object): num_clusters = len(self.data_clusters) weight_sum = sum(self.data_cluster_sizes) weights = [x / weight_sum for x in self.data_cluster_sizes] - samples = self.np_rng.choice(num_clusters, - self.global_batch_size, - replace=True, - p=weights) + samples = self.np_rng.choice(num_clusters, self.global_batch_size, replace=True, p=weights) samples = np.bincount(samples, minlength=num_clusters) return samples @@ -285,8 +245,7 @@ class DeepSpeedDataSampler(object): def get_sample_from_cluster(self, cidx, num_samples): start_idx = self.data_cluster_current_position[cidx] - samples = list( - np.copy(self.data_clusters[cidx][0][start_idx:(start_idx + num_samples)])) + samples = list(np.copy(self.data_clusters[cidx][0][start_idx:(start_idx + num_samples)])) self.data_cluster_current_position[cidx] += num_samples if len(samples) < num_samples: num_samples_remained = num_samples - len(samples) @@ -297,14 +256,12 @@ class DeepSpeedDataSampler(object): return samples def get_next_global_batch(self): - if self.data_efficiency_config[DATA_SAMPLING][CURRICULUM_LEARNING][ - CURRICULUM_LEARNING_ENABLED]: + if self.data_efficiency_config[DATA_SAMPLING][CURRICULUM_LEARNING][CURRICULUM_LEARNING_ENABLED]: self.curriculum_step += 1 new_cluster = False previous_difficulties = {} for metric in self.curriculum_schedulers: - next_difficulty = self.curriculum_schedulers[metric].update_difficulty( - self.curriculum_step) + next_difficulty = self.curriculum_schedulers[metric].update_difficulty(self.curriculum_step) if metric not in self.current_difficulties or \ next_difficulty != self.current_difficulties[metric]: new_cluster = True @@ -313,8 +270,7 @@ class DeepSpeedDataSampler(object): else: if self.difficulty_type[metric] == CURRICULUM_LEARNING_VALUE_BASED: previous_difficulties[metric] = float('-inf') - elif self.difficulty_type[ - metric] == CURRICULUM_LEARNING_PERCENTILE_BASED: + elif self.difficulty_type[metric] == CURRICULUM_LEARNING_PERCENTILE_BASED: previous_difficulties[metric] = 0 self.current_difficulties[metric] = next_difficulty if new_cluster: @@ -323,12 +279,9 @@ class DeepSpeedDataSampler(object): samples_per_cluster = self.sample_from_clusters() batch = [] for cidx in range(len(samples_per_cluster)): - batch += self.get_sample_from_cluster(cidx, - samples_per_cluster[cidx]) + batch += self.get_sample_from_cluster(cidx, samples_per_cluster[cidx]) self.np_rng.shuffle(batch) - batch = torch.tensor(batch, - device=get_accelerator().current_device_name(), - dtype=torch.long).view(-1) + batch = torch.tensor(batch, device=get_accelerator().current_device_name(), dtype=torch.long).view(-1) else: batch = 
torch.empty(self.global_batch_size, device=get_accelerator().current_device_name(), @@ -356,8 +309,7 @@ class DeepSpeedDataSampler(object): CURRICULUM_LEARNING_STEP: self.curriculum_step, CURRICULUM_LEARNING_CURRENT_DIFFICULTIES: self.current_difficulties, CURRICULUM_LEARNING_DATA_CLUSTER_PATHS: self.data_cluster_paths, - CURRICULUM_LEARNING_DATA_CLUSTER_CURRENT_POSITION: - self.data_cluster_current_position, + CURRICULUM_LEARNING_DATA_CLUSTER_CURRENT_POSITION: self.data_cluster_current_position, CURRICULUM_LEARNING_NP_RNG_STATE: np.random.get_state() } @@ -367,11 +319,10 @@ class DeepSpeedDataSampler(object): self.curriculum_step = state_dict[CURRICULUM_LEARNING_STEP] self.current_difficulties = state_dict[CURRICULUM_LEARNING_CURRENT_DIFFICULTIES] self.data_cluster_paths = state_dict[CURRICULUM_LEARNING_DATA_CLUSTER_PATHS] - self.data_cluster_current_position = state_dict[ - CURRICULUM_LEARNING_DATA_CLUSTER_CURRENT_POSITION] + self.data_cluster_current_position = state_dict[CURRICULUM_LEARNING_DATA_CLUSTER_CURRENT_POSITION] np.random.set_state(state_dict[CURRICULUM_LEARNING_NP_RNG_STATE]) - cluster_root_path = self.data_efficiency_config[DATA_SAMPLING][ - CURRICULUM_LEARNING][CURRICULUM_LEARNING_CLUSTER_PATH] + cluster_root_path = self.data_efficiency_config[DATA_SAMPLING][CURRICULUM_LEARNING][ + CURRICULUM_LEARNING_CLUSTER_PATH] # Backward compatibility: previously data_cluster_paths were stored as # absolute paths. Now we changed it to just the file name so that even # if user moved the cluster files, the checkpoint loading still works @@ -379,12 +330,9 @@ class DeepSpeedDataSampler(object): # in deepspeed json config. for idx in range(len(self.data_cluster_paths)): if '/' in self.data_cluster_paths[idx]: - self.data_cluster_paths[idx] = self.data_cluster_paths[idx].split( - '/')[-1] + self.data_cluster_paths[idx] = self.data_cluster_paths[idx].split('/')[-1] if self.global_rank == 0: for cluster_fname in self.data_cluster_paths: cluster_path = f"{cluster_root_path}/{cluster_fname}" - self.data_clusters.append( - MMapIndexedDataset(cluster_path, - skip_warmup=True)) + self.data_clusters.append(MMapIndexedDataset(cluster_path, skip_warmup=True)) self.data_cluster_sizes.append(len(self.data_clusters[-1][0])) diff --git a/deepspeed/runtime/data_pipeline/data_sampling/indexed_dataset.py b/deepspeed/runtime/data_pipeline/data_sampling/indexed_dataset.py index b2a31663e923df62404f879fa6287075dadbd23d..60115fa6efef9ea20b4068607eedd36869e9a954 100644 --- a/deepspeed/runtime/data_pipeline/data_sampling/indexed_dataset.py +++ b/deepspeed/runtime/data_pipeline/data_sampling/indexed_dataset.py @@ -1,7 +1,10 @@ -''' -Copyright 2022 The Microsoft DeepSpeed Team +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team +""" Part of this code was adopted from https://github.com/NVIDIA/Megatron-LM/blob/main/megatron/data/indexed_dataset.py -''' +""" # Copyright (c) Facebook, Inc. and its affiliates. # @@ -50,16 +53,13 @@ def infer_dataset_impl(path): return None else: print(f"Dataset does not exist: {path}") - print( - "Path should be a basename that both .idx and .bin can be appended to get full filenames." 
- ) + print("Path should be a basename that both .idx and .bin can be appended to get full filenames.") return None def make_builder(out_file, impl, vocab_size=None): if impl == 'mmap': - return MMapIndexedDatasetBuilder(out_file, - dtype=__best_fitting_dtype(vocab_size)) + return MMapIndexedDatasetBuilder(out_file, dtype=__best_fitting_dtype(vocab_size)) else: return IndexedDatasetBuilder(out_file) @@ -67,9 +67,7 @@ def make_builder(out_file, impl, vocab_size=None): def make_dataset(path, impl, skip_warmup=False): if not IndexedDataset.exists(path): print(f"Dataset does not exist: {path}") - print( - "Path should be a basename that both .idx and .bin can be appended to get full filenames." - ) + print("Path should be a basename that both .idx and .bin can be appended to get full filenames.") return None if impl == 'infer': impl = infer_dataset_impl(path) @@ -150,10 +148,8 @@ class IndexedDataset(torch.utils.data.Dataset): def read_index(self, path): with open(index_file_path(path), 'rb') as f: magic = f.read(8) - assert magic == self._HDR_MAGIC, ( - 'Index file doesn\'t match expected format. ' - 'Make sure that --dataset-impl is configured properly.' - ) + assert magic == self._HDR_MAGIC, ('Index file doesn\'t match expected format. ' + 'Make sure that --dataset-impl is configured properly.') version = f.read(8) assert struct.unpack('= 0: if data_sampler is None: - data_sampler = DistributedSampler( - dataset=dataset, - num_replicas=data_parallel_world_size, - rank=data_parallel_rank) + data_sampler = DistributedSampler(dataset=dataset, + num_replicas=data_parallel_world_size, + rank=data_parallel_rank) device_count = 1 else: if data_sampler is None: diff --git a/deepspeed/runtime/eigenvalue.py b/deepspeed/runtime/eigenvalue.py index 618ac00caff43f728c421d6b28bddca46dde6bb1..df63854dd1ca05610fca7fc454eb30c46c19d4b3 100755 --- a/deepspeed/runtime/eigenvalue.py +++ b/deepspeed/runtime/eigenvalue.py @@ -1,4 +1,7 @@ -'''Copyright The Microsoft DeepSpeed Team''' +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team import torch from deepspeed.utils import log_dist @@ -7,6 +10,7 @@ import logging class Eigenvalue(object): + def __init__(self, verbose=False, max_iter=100, @@ -77,8 +81,7 @@ class Eigenvalue(object): ] else: v = [ - torch.randn(p.size(), - device=device) for p in model_block.parameters() + torch.randn(p.size(), device=device) for p in model_block.parameters() if p.grad is not None and p.grad.grad_fn is not None ] torch.random.set_rng_state(rng_state) @@ -100,24 +103,18 @@ class Eigenvalue(object): # Disable eigenvalue if the model doesn't support second order gradients computation, # e.g. when enabling DS transformer kernel. if len(grads) == 0 or len(params) == 0: - log_dist(f'The model does NOT support eigenvalue computation.', - ranks=[0], - level=logging.WARNING) + log_dist(f'The model does NOT support eigenvalue computation.', ranks=[0], level=logging.WARNING) return [] i = 0 eigenvalue_current, eigenvalue_previous = 1., 0. 
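For context on the loop that follows: compute_eigenvalue runs power iteration on the loss Hessian, obtaining Hessian-vector products Hv via torch.autograd.grad on the gradients (double backprop), and the condition being reflowed below is a relative-change stopping rule. In the usual formulation,

\[
v_{k+1} = \frac{H v_k}{\lVert H v_k \rVert}, \qquad
\lambda_k = v_k^{\top} H v_k, \qquad
\text{stop when } \left|\frac{\lambda_k - \lambda_{k-1}}{\lambda_k}\right| < \text{tol}
\ \text{ or } k \ge \text{max\_iter},
\]

where tol and max_iter correspond to the constructor arguments of Eigenvalue shown earlier in this file's diff.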
while (i < self.max_iter) and abs(eigenvalue_current) > 0 and (abs( - (eigenvalue_current - eigenvalue_previous) / - eigenvalue_current) >= self.tol): # test convergence criteria + (eigenvalue_current - eigenvalue_previous) / eigenvalue_current) >= + self.tol): # test convergence criteria eigenvalue_previous = eigenvalue_current - Hv = torch.autograd.grad(grads, - params, - grad_outputs=v, - only_inputs=True, - retain_graph=True) + Hv = torch.autograd.grad(grads, params, grad_outputs=v, only_inputs=True, retain_graph=True) #Hv = [hv.float() for hv in Hv] Hv = [self.nan_to_num(hv).float() for hv in Hv] @@ -131,9 +128,7 @@ class Eigenvalue(object): block_eigenvalue.append(eigenvalue_current) if self.verbose: - log_dist( - f'block: {block}, power iteration: {i}, eigenvalue: {eigenvalue_current}', - ranks=[0]) + log_dist(f'block: {block}, power iteration: {i}, eigenvalue: {eigenvalue_current}', ranks=[0]) block_eigenvalue = self.post_process(block_eigenvalue) diff --git a/deepspeed/runtime/engine.py b/deepspeed/runtime/engine.py index 08cb1fd7276a1dc288881522769020e15f05096a..29223423d2f40e927e1ac976b29ba07d1b0c2008 100644 --- a/deepspeed/runtime/engine.py +++ b/deepspeed/runtime/engine.py @@ -1,6 +1,7 @@ -""" -Copyright 2019 The Microsoft DeepSpeed Team -""" +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team import os import re @@ -9,6 +10,7 @@ import torch import hashlib from collections import defaultdict, OrderedDict, deque from shutil import copyfile +import gc from torch.nn.modules import Module from torch.nn.parameter import Parameter @@ -19,6 +21,7 @@ from typing import Callable, Dict, Union, Iterable import deepspeed +from deepspeed import comm as dist from deepspeed.runtime.utils import see_memory_usage, DummyOptim from .zero.offload_config import OffloadDeviceEnum from deepspeed.runtime.zero.stage_1_and_2 import DeepSpeedZeroOptimizer @@ -31,7 +34,7 @@ from deepspeed.runtime.fp16.fused_optimizer import FP16_Optimizer from deepspeed.runtime.fp16.unfused_optimizer import FP16_UnfusedOptimizer from deepspeed.runtime.bf16_optimizer import BF16_Optimizer -from deepspeed.runtime.config import DeepSpeedConfig, DEEPSPEED_OPTIMIZERS, \ +from deepspeed.runtime.config import DEEPSPEED_OPTIMIZERS, \ ADAGRAD_OPTIMIZER, ADAM_OPTIMIZER, ADAMW_OPTIMIZER, LAMB_OPTIMIZER, ONEBIT_ADAM_OPTIMIZER, ONEBIT_LAMB_OPTIMIZER, \ TORCH_ADAM_PARAM, ADAM_W_MODE, ADAM_W_MODE_DEFAULT, ZERO_ONE_ADAM_OPTIMIZER @@ -53,7 +56,7 @@ from deepspeed.compression.constants import \ WEIGHT_QUANTIZE_ROUNDING, \ WEIGHT_QUANTIZE_VERBOSE, \ WEIGHT_QUANTIZE_KERNEL -from deepspeed.checkpoint.constants import OPTIMIZER_STATE_DICT +from deepspeed.checkpoint.constants import OPTIMIZER_STATE_DICT, FROZEN_PARAM_FRAGMENTS from deepspeed.runtime.sparse_tensor import SparseTensor from deepspeed.runtime import lr_schedules @@ -79,7 +82,7 @@ from deepspeed.runtime.data_pipeline.data_routing.basic_layer import RandomLayer from deepspeed.runtime.checkpoint_engine.torch_checkpoint_engine import TorchCheckpointEngine from .pipe.module import PipelineModule -from .utils import ensure_directory_exists, get_ma_status +from .utils import get_ma_status from ..ops.adam import FusedAdam from ..moe.sharded_moe import TopKGate, MOELayer from ..moe.layer import MoE @@ -92,10 +95,7 @@ from deepspeed.utils.logging import print_json_dist, print_configuration from deepspeed.accelerator import get_accelerator from deepspeed.ops.op_builder import UtilsBuilder -from deepspeed.inference.config import DtypeEnum - -# 
Set to torch's distributed package or deepspeed.comm based inside DeepSpeedEngine init -dist = None +from deepspeed.runtime.config import DtypeEnum MEMORY_OPT_ALLREDUCE_SIZE = 500000000 @@ -110,16 +110,13 @@ try: except ImportError: # Fail silently so we don't spam logs unnecessarily if user isn't using amp APEX_INSTALLED = False - pass def split_half_float_double_sparse(tensors): device_type = get_accelerator().device_name() supported_types = [ - "torch.{}.HalfTensor".format(device_type), - "torch.{}.FloatTensor".format(device_type), - "torch.{}.DoubleTensor".format(device_type), - "torch.{}.BFloat16Tensor".format(device_type), + "torch.{}.HalfTensor".format(device_type), "torch.{}.FloatTensor".format(device_type), + "torch.{}.DoubleTensor".format(device_type), "torch.{}.BFloat16Tensor".format(device_type), SparseTensor.type() ] @@ -148,6 +145,7 @@ STEP_GLOBAL_TIMER = 'step' class EngineTimers(object): r"""Wallclock timers for DeepSpeedEngine""" + def __init__(self, enable_micro_timers, enable_global_timers): self.forward_timers = [] self.backward_timers = [] @@ -164,10 +162,7 @@ class EngineTimers(object): self.backward_reduce_timers += [BACKWARD_REDUCE_MICRO_TIMER] self.step_timers += [STEP_MICRO_TIMER] self.micro_timers += [ - FORWARD_MICRO_TIMER, - BACKWARD_MICRO_TIMER, - BACKWARD_INNER_MICRO_TIMER, - BACKWARD_REDUCE_MICRO_TIMER, + FORWARD_MICRO_TIMER, BACKWARD_MICRO_TIMER, BACKWARD_INNER_MICRO_TIMER, BACKWARD_REDUCE_MICRO_TIMER, STEP_MICRO_TIMER ] @@ -178,16 +173,14 @@ class EngineTimers(object): self.backward_reduce_timers += [BACKWARD_REDUCE_GLOBAL_TIMER] self.step_timers += [STEP_GLOBAL_TIMER] self.global_timers += [ - FORWARD_GLOBAL_TIMER, - BACKWARD_GLOBAL_TIMER, - BACKWARD_INNER_GLOBAL_TIMER, - BACKWARD_REDUCE_GLOBAL_TIMER, + FORWARD_GLOBAL_TIMER, BACKWARD_GLOBAL_TIMER, BACKWARD_INNER_GLOBAL_TIMER, BACKWARD_REDUCE_GLOBAL_TIMER, STEP_GLOBAL_TIMER ] class DeepSpeedEngine(Module): r"""DeepSpeed engine for training.""" + def __init__( self, args, @@ -200,7 +193,7 @@ class DeepSpeedEngine(Module): dist_init_required=None, collate_fn=None, config=None, - config_params=None, + config_class=None, dont_change_device=False, ): super(DeepSpeedEngine, self).__init__() @@ -218,6 +211,7 @@ class DeepSpeedEngine(Module): self.gradient_average = True self.warn_unscaled_loss = True self.config = config + self._config = config_class self.loaded_checkpoint_mp_world_size = None self.loaded_checkpoint_dp_world_size = None self.enable_backward_allreduce = True @@ -236,8 +230,6 @@ class DeepSpeedEngine(Module): self.checkpoint_engine = None - global dist - from deepspeed import comm as dist self._is_gradient_accumulation_boundary = None self.scale_wrt_gas = None @@ -247,38 +239,15 @@ class DeepSpeedEngine(Module): # needed for zero_to_fp32 weights reconstruction to remap nameless data to state_dict self.param_names = {param: name for name, param in model.named_parameters()} - # Set config using config_params for backwards compat - if self.config is None and config_params is not None: - self.config = config_params - - from deepspeed.comm import supported_torch_version - # This supported_torch_version check is for torch1.2 compatibility only - if supported_torch_version: - dist.init_distributed(dist_backend=self.dist_backend, - dist_init_required=dist_init_required) - else: - if dist_init_required is None: - dist_init_required = not dist.is_initialized() - - if dist_init_required is False: - assert ( - dist.is_initialized() is True - ), "Torch distributed not initialized. 
Please set dist_init_required to True or initialize before calling deepspeed.initialize()" - else: - if not dist.is_initialized(): - dist.init_process_group(backend=self.dist_backend) - self._do_args_sanity_check(args) self._configure_with_arguments(args, mpu) self._do_sanity_check() - see_memory_usage(f"DeepSpeed Engine: After args sanity test", - force=self.memory_breakdown()) + see_memory_usage(f"DeepSpeed Engine: After args sanity test", force=self.memory_breakdown()) if mpu is not None: if self.elasticity_enabled(): if not self.is_elastic_model_parallel_supported(): - assert not self.elasticity_enabled(), ( - "Elasticity is not currently supported" " with model parallelism." - ) + assert not self.elasticity_enabled(), ("Elasticity is not currently supported" + " with model parallelism.") self._set_distributed_vars(args) @@ -309,8 +278,7 @@ class DeepSpeedEngine(Module): monitor_memory=False, ) - log_dist(f"DeepSpeed Flops Profiler Enabled: {self.flops_profiler_enabled()}", - ranks=[0]) + log_dist(f"DeepSpeed Flops Profiler Enabled: {self.flops_profiler_enabled()}", ranks=[0]) if self.flops_profiler_enabled(): self.flops_profiler = FlopsProfiler(self.module, self) @@ -332,6 +300,10 @@ class DeepSpeedEngine(Module): if model_parameters is None: model_parameters = self.module.parameters() + # Convert model parameters from generator to list + if not isinstance(model_parameters, list): + model_parameters = list(model_parameters) + if has_optimizer: self._configure_optimizer(optimizer, model_parameters) self._configure_lr_scheduler(lr_scheduler) @@ -346,12 +318,9 @@ class DeepSpeedEngine(Module): self.sparse_tensor_module_names = set() # if self.sparse_gradients_enabled(): for name, module in self.module.named_modules(): - if isinstance(module, - (torch.nn.Embedding, - torch.nn.EmbeddingBag)) and self.sparse_gradients_enabled(): + if isinstance(module, (torch.nn.Embedding, torch.nn.EmbeddingBag)) and self.sparse_gradients_enabled(): self.sparse_tensor_module_names.add(name + ".weight") - logger.info( - "Will convert {} to sparse tensor during training".format(name)) + logger.info("Will convert {} to sparse tensor during training".format(name)) self.save_non_zero_checkpoint = False self.save_zero_checkpoint = False @@ -365,23 +334,19 @@ class DeepSpeedEngine(Module): self.progressive_layer_drop = self._configure_progressive_layer_drop() if self.curriculum_enabled_legacy(): - self.curriculum_scheduler_legacy = self._configure_curriculum_scheduler_legacy( - ) + self.curriculum_scheduler_legacy = self._configure_curriculum_scheduler_legacy() if self.random_ltd_enabled(): random_ltd_config = self.random_ltd_config() random_ltd_config[RANDOM_LTD_GLOBAL_BATCH_SIZE] = self.train_batch_size() - random_ltd_config[ - RANDOM_LTD_MICRO_BATCH_SIZE] = self.train_micro_batch_size_per_gpu() - self.random_ltd_scheduler = self._configure_random_ltd_scheduler( - random_ltd_config) + random_ltd_config[RANDOM_LTD_MICRO_BATCH_SIZE] = self.train_micro_batch_size_per_gpu() + self.random_ltd_scheduler = self._configure_random_ltd_scheduler(random_ltd_config) # Engine timers - self.engine_timers = EngineTimers( - enable_micro_timers=self.wall_clock_breakdown(), - enable_global_timers=self.wall_clock_breakdown() - or self.flops_profiler_enabled()) + self.engine_timers = EngineTimers(enable_micro_timers=self.wall_clock_breakdown(), + enable_global_timers=self.wall_clock_breakdown() + or self.flops_profiler_enabled()) if self.global_rank == 0: self._config.print("DeepSpeedEngine configuration") @@ -414,10 +379,8 @@ 
class DeepSpeedEngine(Module): if p.requires_grad: trainable_num_params += n if self.global_rank == 0: - self.autotuning_model_info[ - "num_params"] = num_params * self.mp_world_size - self.autotuning_model_info[ - "trainable_num_params"] = trainable_num_params * self.mp_world_size + self.autotuning_model_info["num_params"] = num_params * self.mp_world_size + self.autotuning_model_info["trainable_num_params"] = trainable_num_params * self.mp_world_size logger.info(f"model parameter = {num_params}") @@ -447,13 +410,10 @@ class DeepSpeedEngine(Module): ValueError: if ``train_batch_size`` is not divisible by the configured micro-batch size and data parallelism. """ - if train_batch_size % (self.train_micro_batch_size_per_gpu() * - self.dp_world_size) != 0: + if train_batch_size % (self.train_micro_batch_size_per_gpu() * self.dp_world_size) != 0: #print(f'{train_batch_size=} {self.train_micro_batch_size_per_gpu()=} {self.dp_world_size=}') - raise ValueError( - f'Train batch size must be divisible by micro-batch data parallelism') - new_gas = train_batch_size // (self.train_micro_batch_size_per_gpu() * - self.dp_world_size) + raise ValueError(f'Train batch size must be divisible by micro-batch data parallelism') + new_gas = train_batch_size // (self.train_micro_batch_size_per_gpu() * self.dp_world_size) # overwrite config self._config.train_batch_size = train_batch_size self._config.gradient_accumulation_steps = new_gas @@ -464,8 +424,7 @@ class DeepSpeedEngine(Module): def set_custom_curriculum_learning_schedule(self, schedule_func_dict): if self.training_dataloader is not None and self.curriculum_learning_enabled(): - self.training_dataloader.data_sampler.set_custom_curriculum_learning_schedule( - schedule_func_dict) + self.training_dataloader.data_sampler.set_custom_curriculum_learning_schedule(schedule_func_dict) def get_global_grad_norm(self) -> float: """Return the 2-norm of all gradients. 
If there is model parallelism, @@ -492,8 +451,7 @@ class DeepSpeedEngine(Module): elif name in dir(_module): return getattr(_module, name) else: - raise AttributeError( - f"'{type(self).__name__}' object has no attribute '{name}'") + raise AttributeError(f"'{type(self).__name__}' object has no attribute '{name}'") def checkpoint_tag_validation_enabled(self): return self._config.checkpoint_tag_validation_enabled @@ -567,15 +525,13 @@ class DeepSpeedEngine(Module): return self._config.data_efficiency_config[DATA_SAMPLING] def curriculum_learning_enabled(self): - return self._config.data_efficiency_config[DATA_SAMPLING][CURRICULUM_LEARNING][ - CURRICULUM_LEARNING_ENABLED] + return self._config.data_efficiency_config[DATA_SAMPLING][CURRICULUM_LEARNING][CURRICULUM_LEARNING_ENABLED] def curriculum_learning_config(self): return self._config.data_efficiency_config[DATA_SAMPLING][CURRICULUM_LEARNING] def random_ltd_enabled(self): - return self._config.data_efficiency_config[DATA_ROUTING][RANDOM_LTD][ - RANDOM_LTD_ENABLED] + return self._config.data_efficiency_config[DATA_ROUTING][RANDOM_LTD][RANDOM_LTD_ENABLED] def random_ltd_config(self): return self._config.data_efficiency_config[DATA_ROUTING][RANDOM_LTD] @@ -583,26 +539,20 @@ class DeepSpeedEngine(Module): def random_ltd_initialize(self): assert self.random_ltd_enabled() random_ltd_config = self.random_ltd_config() - random_ltd_queue = deque( - [x for x in sorted(random_ltd_config[RANDOM_LTD_LAYER_ID])]) + random_ltd_queue = deque([x for x in sorted(random_ltd_config[RANDOM_LTD_LAYER_ID])]) count = 0 for name, layer in self.module.named_modules(): if isinstance(layer, RandomLayerTokenDrop): - if len(random_ltd_queue) != 0 and str( - random_ltd_queue[0]) in name: ###[1,2,3] - layer.init_config(random_ltd_config, - self.random_ltd_scheduler, - count) + if len(random_ltd_queue) != 0 and str(random_ltd_queue[0]) in name: ###[1,2,3] + layer.init_config(random_ltd_config, self.random_ltd_scheduler, count) random_ltd_queue.popleft() count += 1 if random_ltd_config[RANDOM_LTD_LAYER_NUM] != count: - raise ValueError( - f'random_ltd_layer_num {random_ltd_config[RANDOM_LTD_LAYER_NUM]} must be \ + raise ValueError(f'random_ltd_layer_num {random_ltd_config[RANDOM_LTD_LAYER_NUM]} must be \ equivalent to the len of random_ltd_layer_id {count}') - if random_ltd_config[RANDOM_LTD_LAYER_TOKEN_LR_SCHEDULE][ - RANDOM_LTD_LAYER_TOKEN_LR_ENABLED]: + if random_ltd_config[RANDOM_LTD_LAYER_TOKEN_LR_SCHEDULE][RANDOM_LTD_LAYER_TOKEN_LR_ENABLED]: assert self.client_lr_scheduler is None raise ValueError(f'not yet support') #self.lr_scheduler = lr_schedules.WarmupLayerTokenDecayLR(self.optimizer, self.random_ltd_scheduler) @@ -663,8 +613,7 @@ class DeepSpeedEngine(Module): def autotuning_profile_model_info(self): return self.autotuning_enabled( ) and self._config.autotuning_config.model_info and self._config.autotuning_config.model_info.get( - "profile", - False) + "profile", False) def sparse_gradients_enabled(self): return self._config.sparse_gradients_enabled @@ -676,8 +625,7 @@ class DeepSpeedEngine(Module): return self._config.train_micro_batch_size_per_gpu def optimizer_name(self): - return (self.client_optimizer.__class__.__name__ - if self.client_optimizer else self._config.optimizer_name) + return (self.client_optimizer.__class__.__name__ if self.client_optimizer else self._config.optimizer_name) def optimizer_params(self): return self._config.optimizer_params @@ -695,22 +643,15 @@ class DeepSpeedEngine(Module): return ( 
self._config.compression_config[WEIGHT_QUANTIZATION][SHARED_PARAMETERS] [WEIGHT_QUANTIZE_IN_FORWARD_ENABLED], - self._config.compression_config[WEIGHT_QUANTIZATION][SHARED_PARAMETERS] - [WEIGHT_QUANTIZE_ENABLED], - self._config.compression_config[WEIGHT_QUANTIZATION][SHARED_PARAMETERS] - [WEIGHT_QUANTIZE_GROUPS], + self._config.compression_config[WEIGHT_QUANTIZATION][SHARED_PARAMETERS][WEIGHT_QUANTIZE_ENABLED], + self._config.compression_config[WEIGHT_QUANTIZATION][SHARED_PARAMETERS][WEIGHT_QUANTIZE_GROUPS], self._config.compression_config[WEIGHT_QUANTIZATION][SHARED_PARAMETERS] [WEIGHT_QUANTIZE_FP16_MIXED_QUANTIZE], - self._config.compression_config[WEIGHT_QUANTIZATION][SHARED_PARAMETERS] - [WEIGHT_QUANTIZE_CHANGE_RATIO], - self._config.compression_config[WEIGHT_QUANTIZATION][SHARED_PARAMETERS] - [WEIGHT_QUANTIZE_TYPE], - self._config.compression_config[WEIGHT_QUANTIZATION][SHARED_PARAMETERS] - [WEIGHT_QUANTIZE_ROUNDING], - self._config.compression_config[WEIGHT_QUANTIZATION][SHARED_PARAMETERS] - [WEIGHT_QUANTIZE_VERBOSE], - self._config.compression_config[WEIGHT_QUANTIZATION][SHARED_PARAMETERS] - [WEIGHT_QUANTIZE_KERNEL], + self._config.compression_config[WEIGHT_QUANTIZATION][SHARED_PARAMETERS][WEIGHT_QUANTIZE_CHANGE_RATIO], + self._config.compression_config[WEIGHT_QUANTIZATION][SHARED_PARAMETERS][WEIGHT_QUANTIZE_TYPE], + self._config.compression_config[WEIGHT_QUANTIZATION][SHARED_PARAMETERS][WEIGHT_QUANTIZE_ROUNDING], + self._config.compression_config[WEIGHT_QUANTIZATION][SHARED_PARAMETERS][WEIGHT_QUANTIZE_VERBOSE], + self._config.compression_config[WEIGHT_QUANTIZATION][SHARED_PARAMETERS][WEIGHT_QUANTIZE_KERNEL], ) def zero_optimization(self): @@ -719,6 +660,9 @@ class DeepSpeedEngine(Module): def zero_allow_untested_optimizer(self): return self._config.zero_allow_untested_optimizer + def zero_force_ds_cpu_optimizer(self): + return self._config.zero_force_ds_cpu_optimizer + def zero_reduce_scatter(self): return self._config.zero_config.reduce_scatter @@ -733,10 +677,7 @@ class DeepSpeedEngine(Module): def zero_use_cpu_optimizer(self): if self._config.zero_config.offload_optimizer is not None: - return self._config.zero_config.offload_optimizer.device in [ - OffloadDeviceEnum.cpu, - OffloadDeviceEnum.nvme - ] + return self._config.zero_config.offload_optimizer.device in [OffloadDeviceEnum.cpu, OffloadDeviceEnum.nvme] return False def zero_cpu_offload(self): @@ -750,6 +691,9 @@ class DeepSpeedEngine(Module): def zero_optimization_stage(self): return self._config.zero_optimization_stage + def mics_shard_size(self): + return self._config.mics_shard_size + def zero_reduce_bucket_size(self): return self._config.zero_config.reduce_bucket_size @@ -833,9 +777,11 @@ class DeepSpeedEngine(Module): res = self._config.communication_data_type if res is not None: return res - elif self.fp16_enabled() or self.zero_optimization_stage(): + + if self.fp16_enabled(): return torch.float16 - elif self.bfloat16_enabled(): + + if self.bfloat16_enabled(): return torch.bfloat16 return torch.float32 @@ -897,14 +843,11 @@ class DeepSpeedEngine(Module): # First check for scheduler in json configuration lr_scheduler = self._scheduler_from_config(self.optimizer) if lr_scheduler: - log_dist( - f"DeepSpeed using configured LR scheduler = {self.scheduler_name()}", - ranks=[0]) + log_dist(f"DeepSpeed using configured LR scheduler = {self.scheduler_name()}", ranks=[0]) self.lr_scheduler = lr_scheduler else: if isinstance(client_lr_scheduler, Callable): - log_dist('DeepSpeed using client callable to create LR scheduler', - 
ranks=[0]) + log_dist('DeepSpeed using client callable to create LR scheduler', ranks=[0]) self.lr_scheduler = client_lr_scheduler(self.basic_optimizer) else: log_dist('DeepSpeed using client LR scheduler', ranks=[0]) @@ -919,12 +862,9 @@ class DeepSpeedEngine(Module): try: from deepspeed.runtime.checkpoint_engine.nebula_checkpoint_engine import \ NebulaCheckpointEngine - self.checkpoint_engine = NebulaCheckpointEngine( - config_params=self._config.nebula_config) + self.checkpoint_engine = NebulaCheckpointEngine(config_params=self._config.nebula_config) except ImportError as err: - logger.error( - f"No torch_nebula was found! Will fall back to torch.save. Details: {err}" - ) + logger.error(f"No torch_nebula was found! Will fall back to torch.save. Details: {err}") self.checkpoint_engine = TorchCheckpointEngine() dp_rank = self.global_rank @@ -936,8 +876,7 @@ class DeepSpeedEngine(Module): # only the first data parallel process needs to store the model checkpoint # if you want to use node local storage this must be done by rank 0 on each # node - self.save_non_zero_checkpoint = ( - rank == 0) or self.zero_optimization_partition_weights() + self.save_non_zero_checkpoint = (rank == 0) or self.zero_optimization_partition_weights() if self.zero_optimization() or self.bfloat16_enabled(): param_rank = dist.get_rank(group=self.optimizer.dp_process_group) @@ -952,9 +891,8 @@ class DeepSpeedEngine(Module): if hasattr(lr_schedules, scheduler_name): scheduler = getattr(lr_schedules, scheduler_name) else: - assert hasattr( - torch.optim.lr_scheduler, scheduler_name - ), f"DeepSpeed does not recognize LR scheduler {scheduler_name}" + assert hasattr(torch.optim.lr_scheduler, + scheduler_name), f"DeepSpeed does not recognize LR scheduler {scheduler_name}" scheduler = getattr(torch.optim.lr_scheduler, scheduler_name) @@ -965,9 +903,7 @@ class DeepSpeedEngine(Module): return None def _set_distributed_vars(self, args): - device_rank = args.device_rank if args is not None and hasattr( - args, - 'device_rank') else self.local_rank + device_rank = args.device_rank if args is not None and hasattr(args, 'device_rank') else self.local_rank if device_rank >= 0: get_accelerator().set_device(device_rank) self.device = torch.device(get_accelerator().device_name(), device_rank) @@ -996,48 +932,23 @@ class DeepSpeedEngine(Module): if hasattr(args, 'local_rank'): args.local_rank = self.local_rank - if self.config is None: - self.config = (args.deepspeed_config - if hasattr(args, - "deepspeed_config") else None) - self._config = DeepSpeedConfig(self.config, mpu) - # Validate command line arguments def _do_args_sanity_check(self, args): - if hasattr(args, "deepscale_config") and args.deepscale_config is not None: - logger.warning( - "************ --deepscale_config is deprecated, please use --deepspeed_config ************" - ) - if hasattr(args, "deepspeed_config"): - assert ( - args.deepspeed_config is None - ), "Not sure how to proceed, we were given both a deepscale_config and deepspeed_config" - args.deepspeed_config = args.deepscale_config - assert "LOCAL_RANK" in os.environ or "OMPI_COMM_WORLD_LOCAL_RANK" in os.environ, "DeepSpeed requires the LOCAL_RANK environment " \ "variable, it is set by the deepspeed launcher, deepspeed.init_distributed, or the torch's launcher. If using a " \ "different launcher please ensure LOCAL_RANK is set prior to initializing deepspeed." 
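# Hedged sketch of how the LOCAL_RANK requirement asserted above is usually satisfied
# when a script is not started via the deepspeed launcher; the environment values are
# illustrative single-process defaults, not part of this diff.
import os
import deepspeed

os.environ.setdefault("LOCAL_RANK", "0")
os.environ.setdefault("RANK", "0")
os.environ.setdefault("WORLD_SIZE", "1")
os.environ.setdefault("MASTER_ADDR", "127.0.0.1")
os.environ.setdefault("MASTER_PORT", "29500")
deepspeed.init_distributed()  # sets up the default process group before deepspeed.initialize()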
if hasattr(args, 'local_rank') and args.local_rank != None: - assert isinstance( - args.local_rank, int), f"args.local_rank of {args.local_rank} is an unknown type {type(args.local_rank)}" + assert isinstance(args.local_rank, + int), f"args.local_rank of {args.local_rank} is an unknown type {type(args.local_rank)}" if args.local_rank >= 0: env_local_rank = int(os.environ.get("LOCAL_RANK")) assert ( env_local_rank == args.local_rank ), f"Mismatch in local rank setting, args.local_rank={args.local_rank} but env['LOCAL_RANK']={env_local_rank}." - if self.config is None: - assert ( - hasattr( - args, "deepspeed_config") and args.deepspeed_config is not None - ), "DeepSpeed requires --deepspeed_config to specify configuration file" - def _is_supported_optimizer(self, optimizer_name): - return (optimizer_name in DEEPSPEED_OPTIMIZERS - or getattr(torch.optim, - optimizer_name, - None) is not None) + return (optimizer_name in DEEPSPEED_OPTIMIZERS or getattr(torch.optim, optimizer_name, None) is not None) def _supported_optims(self): FairseqOptimizer = None @@ -1062,18 +973,11 @@ class DeepSpeedEngine(Module): if not self.client_optimizer: if self.optimizer_name() is not None: assert self._is_supported_optimizer( - self.optimizer_name() - ), "{} is not a supported DeepSpeed Optimizer".format( - self.optimizer_name() - ) + self.optimizer_name()), "{} is not a supported DeepSpeed Optimizer".format(self.optimizer_name()) - if (self.optimizer_name() == LAMB_OPTIMIZER - or self.optimizer_name() == ONEBIT_LAMB_OPTIMIZER): - assert ( - self.dynamic_loss_scale() - ), "DeepSpeed {} optimizer requires dynamic loss scaling".format( - self.optimizer_name() - ) + if (self.optimizer_name() == LAMB_OPTIMIZER or self.optimizer_name() == ONEBIT_LAMB_OPTIMIZER): + assert (self.dynamic_loss_scale()), "DeepSpeed {} optimizer requires dynamic loss scaling".format( + self.optimizer_name()) # Detect invalid combinations of client optimizer and client scheduler if isinstance(self.client_lr_scheduler, _LRScheduler): @@ -1081,6 +985,7 @@ class DeepSpeedEngine(Module): f'Client Optimizer (type = {type(self.client_optimizer)} is not instantiated but Client LR Scheduler is instantiated' def _broadcast_model(self): + def is_replicated(p): if hasattr(p, "ds_status") and p.ds_status is not ZeroParamStatus.AVAILABLE: return False @@ -1095,20 +1000,15 @@ class DeepSpeedEngine(Module): group=self.expert_data_parallel_group[p.group_name]) else: if torch.is_tensor(p) and is_replicated(p): - dist.broadcast(p, - groups._get_broadcast_src_rank(), - group=self.data_parallel_group) + dist.broadcast(p, groups._get_broadcast_src_rank(), group=self.data_parallel_group) @staticmethod def __check_params(model: Module, dtype: torch.dtype) -> None: return - if not all(param.dtype == dtype - for param in model.parameters()) and dist.get_rank() == 0: - raise ValueError( - f"{dtype} is enabled but the following parameters have dtype that is " - f"not {dtype}: " - f"{[(n, p.dtype) for n, p in model.named_parameters() if p.dtype != dtype]}" - ) + if not all(param.dtype == dtype for param in model.parameters()) and dist.get_rank() == 0: + raise ValueError(f"{dtype} is enabled but the following parameters have dtype that is " + f"not {dtype}: " + f"{[(n, p.dtype) for n, p in model.named_parameters() if p.dtype != dtype]}") def _set_client_model(self, model): # register client model in _modules so that nn.module methods work correctly @@ -1122,14 +1022,12 @@ class DeepSpeedEngine(Module): if self.fp16_enabled(): if 
self.zero_optimization_partition_weights() and any( - [hasattr(param, - "ds_id") for param in self.module.parameters()]): + [hasattr(param, "ds_id") for param in self.module.parameters()]): self.__check_params(self.module, torch.half) self.module.half() elif self.bfloat16_enabled(): if self.zero_optimization_partition_weights() and any( - hasattr(param, - 'ds_id') for param in self.module.parameters()): + hasattr(param, 'ds_id') for param in self.module.parameters()): self.__check_params(self.module, torch.bfloat16) self.module.bfloat16() else: @@ -1183,8 +1081,7 @@ class DeepSpeedEngine(Module): return [id(param) for param in group] occurrence = sum([ - ids_list(group['params']).count(param_id) - if param_id in ids_list(group['params']) else 0 + ids_list(group['params']).count(param_id) if param_id in ids_list(group['params']) else 0 for group in optimizer.param_groups ]) assert occurrence <= 1, f"Parameter with name: {name} occurs multiple times in optimizer.param_groups. Make sure it only appears once to prevent undefined behaviour." @@ -1204,9 +1101,7 @@ class DeepSpeedEngine(Module): ), 'You are using an untested ZeRO Optimizer. Please add <"zero_allow_untested_optimizer": true> in the configuration file to use it.' if self.global_rank == 0: - logger.warning( - "**** You are using ZeRO with an untested optimizer, proceed with caution *****" - ) + logger.warning("**** You are using ZeRO with an untested optimizer, proceed with caution *****") if model_dtype == torch.bfloat16 and grad_accum_dtype == torch.float32 and self.zero_optimization_stage( ) == 1: @@ -1214,23 +1109,19 @@ class DeepSpeedEngine(Module): if model_dtype != grad_accum_dtype: raise NotImplementedError( - "Model data type and gradient accumulation data type must be equal to use ZeRO" - ) + "Model data type and gradient accumulation data type must be equal to use ZeRO") return ZERO_OPTIMIZATION elif amp_enabled: if model_dtype != grad_accum_dtype: raise NotImplementedError( - "Model data type and gradient accumulation data type must be equal to use Amp" - ) + "Model data type and gradient accumulation data type must be equal to use Amp") if model_dtype == torch.bfloat16 or model_dtype == torch.float16: - raise NotImplementedError( - "Cannot enable both amp with (legacy) fp16 or bfloat16 mode") + raise NotImplementedError("Cannot enable both amp with (legacy) fp16 or bfloat16 mode") try: logger.info("Initializing Apex amp from: {}".format(amp.__path__)) except NameError: # If apex/amp is available it will be imported above - raise RuntimeError( - "Unable to import apex/amp, please make sure it is installed") + raise RuntimeError("Unable to import apex/amp, please make sure it is installed") return AMP # data type checks elif model_dtype == grad_accum_dtype: @@ -1244,8 +1135,7 @@ class DeepSpeedEngine(Module): elif model_dtype == torch.bfloat16 and grad_accum_dtype == torch.float32: return BFLOAT16 else: - raise NotImplementedError( - "unsupported mix of model dtype and gradient accummulation type") + raise NotImplementedError("unsupported mix of model dtype and gradient accummulation type") return None @@ -1256,27 +1146,26 @@ class DeepSpeedEngine(Module): client_optimizer.param_groups[:] = [ pg for pg in client_optimizer.param_groups if len(pg["params"]) != 0 ] - log_dist( - "Removing param_group that has no 'params' in the client Optimizer", - ranks=[0]) + log_dist("Removing param_group that has no 'params' in the client Optimizer", ranks=[0]) basic_optimizer = client_optimizer log_dist('Using client Optimizer as basic 
optimizer', ranks=[0]) else: basic_optimizer = client_optimizer(model_parameters) log_dist('Using client callable to create basic optimizer', ranks=[0]) + + if self.zero_use_cpu_optimizer() and not isinstance(basic_optimizer, deepspeed.ops.adam.DeepSpeedCPUAdam): + if self.zero_force_ds_cpu_optimizer(): + msg = f'You are using ZeRO-Offload with a client provided optimizer ({type(basic_optimizer)}) which in most cases will yield poor performance. Please either use deepspeed.ops.adam.DeepSpeedCPUAdam or set an optimizer in your ds-config (https://www.deepspeed.ai/docs/config-json/#optimizer-parameters). If you really want to use a custom optimizer w. ZeRO-Offload and understand the performance impacts you can also set <"zero_force_ds_cpu_optimizer": false> in your configuration file.' + raise ZeRORuntimeException(msg) else: basic_optimizer = self._configure_basic_optimizer(model_parameters) - log_dist( - f"Using DeepSpeed Optimizer param name {self.optimizer_name()} as basic optimizer", - ranks=[0]) + log_dist(f"Using DeepSpeed Optimizer param name {self.optimizer_name()} as basic optimizer", ranks=[0]) self._check_for_duplicates(basic_optimizer) self.basic_optimizer = basic_optimizer - log_dist("DeepSpeed Basic Optimizer = {}".format( - basic_optimizer.__class__.__name__), - ranks=[0]) + log_dist("DeepSpeed Basic Optimizer = {}".format(basic_optimizer.__class__.__name__), ranks=[0]) optimizer_wrapper = self._do_optimizer_sanity_check(basic_optimizer) @@ -1285,9 +1174,7 @@ class DeepSpeedEngine(Module): elif optimizer_wrapper == AMP: amp_params = self.amp_params() log_dist(f"Initializing AMP with these params: {amp_params}", ranks=[0]) - model, self.optimizer = amp.initialize( - self.module, basic_optimizer, **amp_params - ) + model, self.optimizer = amp.initialize(self.module, basic_optimizer, **amp_params) self._set_client_model(model) self._broadcast_model() # TODO: maybe need to broadcast experts differently? 
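# Hedged config sketch for the zero_force_ds_cpu_optimizer guard introduced above:
# with ZeRO optimizer offload, either let DeepSpeed build DeepSpeedCPUAdam from the
# "optimizer" section of the config, or explicitly opt out of the check. All values
# are illustrative.
ds_config = {
    "train_batch_size": 16,
    "optimizer": {"type": "Adam", "params": {"lr": 1e-4}},
    "zero_optimization": {
        "stage": 2,
        "offload_optimizer": {"device": "cpu"},
    },
    # Keep a client-provided optimizer despite the performance warning:
    # "zero_force_ds_cpu_optimizer": False,
}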
@@ -1298,8 +1185,7 @@ class DeepSpeedEngine(Module): else: self.optimizer = basic_optimizer - log_dist("DeepSpeed Final Optimizer = {}".format(self.optimizer_name()), - ranks=[0]) + log_dist("DeepSpeed Final Optimizer = {}".format(self.optimizer_name()), ranks=[0]) self.compression_scheduler = self._configure_compression_scheduler() self.quantizer = self._configure_quantization() @@ -1314,32 +1200,24 @@ class DeepSpeedEngine(Module): "'max_grad_norm' is not supported as an optimizer parameter, please switch to using the deepspeed parameter 'gradient_clipping' see: https://www.deepspeed.ai/docs/config-json/#gradient-clipping for more details" ) - if self.optimizer_name() in [ADAGRAD_OPTIMIZER, ADAM_OPTIMIZER, ADAMW_OPTIMIZER]: + if self.optimizer_name() in [ADAM_OPTIMIZER, ADAMW_OPTIMIZER]: torch_adam = optimizer_parameters.pop(TORCH_ADAM_PARAM, False) adam_w_mode = optimizer_parameters.pop(ADAM_W_MODE, ADAM_W_MODE_DEFAULT) # Optimizer name of Adam forces AdamW logic unless adam_w_mode is explicitly set - effective_adam_w_mode = self.optimizer_name( - ) == ADAMW_OPTIMIZER or adam_w_mode + effective_adam_w_mode = self.optimizer_name() == ADAMW_OPTIMIZER or adam_w_mode if torch_adam: if not effective_adam_w_mode: - optimizer = torch.optim.Adam(model_parameters, - **optimizer_parameters) + optimizer = torch.optim.Adam(model_parameters, **optimizer_parameters) else: - optimizer = torch.optim.AdamW(model_parameters, - **optimizer_parameters) + optimizer = torch.optim.AdamW(model_parameters, **optimizer_parameters) else: if self.zero_use_cpu_optimizer(): - if self.optimizer_name() == ADAGRAD_OPTIMIZER: - from deepspeed.ops.adagrad import DeepSpeedCPUAdagrad - optimizer = DeepSpeedCPUAdagrad(model_parameters, - **optimizer_parameters) - else: - from deepspeed.ops.adam import DeepSpeedCPUAdam - optimizer = DeepSpeedCPUAdam(model_parameters, - **optimizer_parameters, - adamw_mode=effective_adam_w_mode) + from deepspeed.ops.adam import DeepSpeedCPUAdam + optimizer = DeepSpeedCPUAdam(model_parameters, + **optimizer_parameters, + adamw_mode=effective_adam_w_mode) else: from deepspeed.ops.adam import FusedAdam @@ -1349,6 +1227,12 @@ class DeepSpeedEngine(Module): adam_w_mode=effective_adam_w_mode, ) + elif self.optimizer_name() == ADAGRAD_OPTIMIZER: + if self.zero_use_cpu_optimizer(): + from deepspeed.ops.adagrad import DeepSpeedCPUAdagrad + optimizer = DeepSpeedCPUAdagrad(model_parameters, **optimizer_parameters) + else: + optimizer = torch.optim.Adagrad(model_parameters, **optimizer_parameters) elif self.optimizer_name() == LAMB_OPTIMIZER: from deepspeed.ops.lamb import FusedLamb @@ -1359,26 +1243,21 @@ class DeepSpeedEngine(Module): optimizer = OnebitAdam(model_parameters, self, **optimizer_parameters) if not self.fp16_enabled(): - logger.warning( - f"Currently the convergence of 1-bit Adam is only verified under FP16" - ) + logger.warning(f"Currently the convergence of 1-bit Adam is only verified under FP16") elif self.optimizer_name() == ZERO_ONE_ADAM_OPTIMIZER: assert not self.zero_optimization(), "0/1 Adam is not compatible with ZeRO" from deepspeed.runtime.fp16.onebit.zoadam import ZeroOneAdam optimizer = ZeroOneAdam(model_parameters, self, **optimizer_parameters) if not self.fp16_enabled(): - logger.warning( - f'Currently the convergence of 0/1 Adam is only verified under FP16') + logger.warning(f'Currently the convergence of 0/1 Adam is only verified under FP16') elif self.optimizer_name() == ONEBIT_LAMB_OPTIMIZER: assert not self.zero_optimization(), "1bit-Lamb is not compatible with ZeRO" 
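# Hedged sketch of how the optimizer branches above map to config knobs: "torch_adam"
# and "adam_w_mode" correspond to TORCH_ADAM_PARAM / ADAM_W_MODE, and the restructured
# Adagrad branch uses DeepSpeedCPUAdagrad only when optimizer offload is enabled,
# otherwise torch.optim.Adagrad. Values are illustrative.
adam_config = {
    "optimizer": {
        "type": "Adam",
        "params": {"lr": 1e-4, "torch_adam": False, "adam_w_mode": True},
    },
}
adagrad_offload_config = {
    "optimizer": {"type": "Adagrad", "params": {"lr": 1e-2}},
    "zero_optimization": {"stage": 2, "offload_optimizer": {"device": "cpu"}},
}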
from deepspeed.runtime.fp16.onebit.lamb import OnebitLamb optimizer = OnebitLamb(model_parameters, self, **optimizer_parameters) if not self.fp16_enabled(): - logger.warning( - f"Currently the convergence of 1-bit Lamb is only verified under FP16" - ) + logger.warning(f"Currently the convergence of 1-bit Lamb is only verified under FP16") else: torch_optimizer = getattr(torch.optim, self.optimizer_name()) optimizer = torch_optimizer(model_parameters, **optimizer_parameters) @@ -1403,7 +1282,8 @@ class DeepSpeedEngine(Module): use_quantizer_kernel, ) = self.quantize_training() if quantize_enabled and not quantize_weight_in_forward: - assert self.fp16_enabled(), "MoQ (quantize in optimization step) weight quantization is only supported for FP16" + assert self.fp16_enabled( + ), "MoQ (quantize in optimization step) weight quantization is only supported for FP16" quantizer = None if quantize_enabled and not quantize_weight_in_forward: from deepspeed.runtime.quantize import Quantizer @@ -1447,9 +1327,7 @@ class DeepSpeedEngine(Module): has_moe_layers=self.has_moe_layers, ) else: - log_dist( - f'Creating fp16 optimizer with static loss scale: {self.loss_scale()}', - ranks=[0]) + log_dist(f'Creating fp16 optimizer with static loss scale: {self.loss_scale()}', ranks=[0]) optimizer = FP16_Optimizer( optimizer, deepspeed=self, @@ -1460,8 +1338,7 @@ class DeepSpeedEngine(Module): has_moe_layers=self.has_moe_layers, ) else: - log_dist(f'Creating fp16 unfused optimizer with dynamic loss scale', - ranks=[0]) + log_dist(f'Creating fp16 unfused optimizer with dynamic loss scale', ranks=[0]) optimizer = FP16_UnfusedOptimizer( optimizer, deepspeed=self, @@ -1484,19 +1361,20 @@ class DeepSpeedEngine(Module): log_dist('Creating BF16 optimizer', ranks=[0]) timers = self.timers if self.wall_clock_breakdown() else None - optimizer = BF16_Optimizer( - optimizer, - self.param_names, - mpu=self.mpu, - clip_grad=clip_grad, - allgather_bucket_size=self.zero_allgather_bucket_size(), - dp_process_group=self.data_parallel_group, - timers=timers) + optimizer = BF16_Optimizer(optimizer, + self.param_names, + mpu=self.mpu, + clip_grad=clip_grad, + allgather_bucket_size=self.zero_allgather_bucket_size(), + dp_process_group=self.data_parallel_group, + timers=timers) return optimizer def _configure_zero_optimizer(self, optimizer): zero_stage = self.zero_optimization_stage() + mics_shard_size = self.mics_shard_size() + model_dtype, grad_accum_dtype = self.get_data_types() timers = self.timers if self.wall_clock_breakdown() else None @@ -1514,8 +1392,7 @@ class DeepSpeedEngine(Module): round_robin_gradients = self.zero_round_robin_gradients() assert not isinstance(optimizer, DummyOptim), "zero stage {} requires an optimizer".format(zero_stage) - log_dist(f'Creating {model_dtype} ZeRO stage {zero_stage} optimizer', - ranks=[0]) + log_dist(f'Creating {model_dtype} ZeRO stage {zero_stage} optimizer', ranks=[0]) # Overlap and contiguous grads are meaningless in stage 1 and are ignored if zero_stage == ZeroStageEnum.optimizer_states: overlap_comm = False @@ -1526,9 +1403,7 @@ class DeepSpeedEngine(Module): if isinstance(self.module, PipelineModule): if overlap_comm: - logger.warning( - "Pipeline parallelism does not support overlapped communication, will be disabled." 
- ) + logger.warning("Pipeline parallelism does not support overlapped communication, will be disabled.") overlap_comm = False optimizer = DeepSpeedZeroOptimizer( optimizer, @@ -1542,10 +1417,8 @@ class DeepSpeedEngine(Module): reduce_bucket_size=self.zero_reduce_bucket_size(), allgather_bucket_size=self.zero_allgather_bucket_size(), dp_process_group=self.data_parallel_group, - expert_parallel_group=self.expert_parallel_group - if self.has_moe_layers else None, - expert_data_parallel_group=self.expert_data_parallel_group - if self.has_moe_layers else None, + expert_parallel_group=self.expert_parallel_group if self.has_moe_layers else None, + expert_data_parallel_group=self.expert_data_parallel_group if self.has_moe_layers else None, reduce_scatter=self.zero_reduce_scatter(), overlap_comm=overlap_comm, cpu_offload=self.zero_cpu_offload(), @@ -1557,8 +1430,7 @@ class DeepSpeedEngine(Module): partition_grads=zero_stage == ZeroStageEnum.gradients, round_robin_gradients=round_robin_gradients, has_moe_layers=self.has_moe_layers, - fp16_master_weights_and_gradients=self.fp16_master_weights_and_gradients( - ), + fp16_master_weights_and_gradients=self.fp16_master_weights_and_gradients(), communication_data_type=self.communication_data_type, elastic_checkpoint=self.zero_elastic_checkpoint()) @@ -1566,21 +1438,27 @@ class DeepSpeedEngine(Module): assert not self.has_moe_layers, "MoE not supported with Stage 3" if isinstance(optimizer, DummyOptim): log_dist("Creating ZeRO Offload", ranks=[0]) - optimizer = DeepSpeedZeRoOffload( - self.module, - timers=timers, - ds_config=self.config, - overlap_comm=self.zero_overlap_comm(), - prefetch_bucket_size=self.zero_prefetch_bucket_size(), - max_reuse_distance=self.zero_max_reuse_distance(), - max_live_parameters=self.zero_max_live_parameters(), - param_persistence_threshold=self.zero_param_persistence_threshold(), - model_persistence_threshold=self.zero_model_persistence_threshold(), - offload_param_config=self.zero_offload_param(), - mpu=self.mpu) + optimizer = DeepSpeedZeRoOffload(self.module, + timers=timers, + ds_config=self.config, + overlap_comm=self.zero_overlap_comm(), + prefetch_bucket_size=self.zero_prefetch_bucket_size(), + max_reuse_distance=self.zero_max_reuse_distance(), + max_live_parameters=self.zero_max_live_parameters(), + param_persistence_threshold=self.zero_param_persistence_threshold(), + model_persistence_threshold=self.zero_model_persistence_threshold(), + offload_param_config=self.zero_offload_param(), + mpu=self.mpu) else: - log_dist(f'Creating {model_dtype} ZeRO stage {zero_stage} optimizer', - ranks=[0]) + log_dist( + f'Creating fp16 ZeRO stage {zero_stage} optimizer,' + f' MiCS is enabled {mics_shard_size>0},' + f' Hierarchical params gather {self._config.mics_hierarchial_params_gather}', + ranks=[0]) + if mics_shard_size > 0: + return self._return_mics_optimizer(optimizer, timers) + + log_dist(f'Creating {model_dtype} ZeRO stage {zero_stage} optimizer', ranks=[0]) from deepspeed.runtime.zero.stage3 import DeepSpeedZeroOptimizer_Stage3 optimizer = DeepSpeedZeroOptimizer_Stage3( self.module, @@ -1616,6 +1494,37 @@ class DeepSpeedEngine(Module): return optimizer + def _return_mics_optimizer(self, basic_optimizer, timers): + from deepspeed.runtime.zero.mics import MiCS_Optimizer + optimizer = MiCS_Optimizer(self.module, + basic_optimizer, + timers=timers, + ds_config=self.config, + static_loss_scale=self.loss_scale(), + dynamic_loss_scale=self.dynamic_loss_scale(), + dynamic_loss_args=self.dynamic_loss_scale_args(), + 
clip_grad=self.gradient_clipping(), + contiguous_gradients=self.zero_contiguous_gradients(), + reduce_bucket_size=self.zero_reduce_bucket_size(), + prefetch_bucket_size=self.zero_prefetch_bucket_size(), + max_reuse_distance=self.zero_max_reuse_distance(), + max_live_parameters=self.zero_max_live_parameters(), + param_persistence_threshold=self.zero_param_persistence_threshold(), + model_persistence_threshold=self.zero_model_persistence_threshold(), + dp_process_group=self.data_parallel_group, + reduce_scatter=self.zero_reduce_scatter(), + overlap_comm=self.zero_overlap_comm(), + offload_optimizer_config=self.zero_offload_optimizer(), + offload_param_config=self.zero_offload_param(), + sub_group_size=self.zero_sub_group_size(), + mpu=self.mpu, + postscale_gradients=self.postscale_gradients(), + gradient_predivide_factor=self.gradient_predivide_factor(), + gradient_accumulation_steps=self.gradient_accumulation_steps(), + aio_config=self.aio_config(), + communication_data_type=self.communication_data_type) + return optimizer + def _configure_eigenvalue(self): eigenvalue = Eigenvalue( verbose=self.eigenvalue_verbose(), @@ -1644,9 +1553,7 @@ class DeepSpeedEngine(Module): @staticmethod def is_iterable_style_dataset(obj): - return isinstance(obj, - torch.utils.data.IterableDataset - ) # hasattr(obj, "__iter__") should work as well + return isinstance(obj, torch.utils.data.IterableDataset) # hasattr(obj, "__iter__") should work as well def dataloader_drop_last(self): return self._config.dataloader_drop_last @@ -1669,8 +1576,7 @@ class DeepSpeedEngine(Module): data_sampler=None, collate_fn=None, num_local_io_workers=None): - if not (self.is_map_style_dataset(dataset) - or self.is_iterable_style_dataset(dataset)): + if not (self.is_map_style_dataset(dataset) or self.is_iterable_style_dataset(dataset)): raise ValueError("Training data must be a torch Dataset") if batch_size is None: @@ -1702,33 +1608,26 @@ class DeepSpeedEngine(Module): deepspeed_dataloader_config = {} if self.curriculum_learning_enabled(): deepspeed_dataloader_config = { - CURRICULUM_LEARNING: - self.curriculum_learning_enabled(), - DATA_EFFICIENCY: - self.data_efficiency_config(), - DATA_PARALLEL_GROUP: - self.data_parallel_group, - GRADIENT_ACCUMULATION_STEPS: - self.gradient_accumulation_steps(), - GLOBAL_RANK: - self.global_rank, - DATA_SAMPLING_NUM_WORKERS: - self.data_sampling_config()[DATA_SAMPLING_NUM_WORKERS] + CURRICULUM_LEARNING: self.curriculum_learning_enabled(), + DATA_EFFICIENCY: self.data_efficiency_config(), + DATA_PARALLEL_GROUP: self.data_parallel_group, + GRADIENT_ACCUMULATION_STEPS: self.gradient_accumulation_steps(), + GLOBAL_RANK: self.global_rank, + DATA_SAMPLING_NUM_WORKERS: self.data_sampling_config()[DATA_SAMPLING_NUM_WORKERS] } - return DeepSpeedDataLoader( - dataset=dataset, - batch_size=batch_size, - pin_memory=pin_memory, - collate_fn=collate_fn, - local_rank=self.local_rank, - tput_timer=deepspeed_io_timer, - num_local_io_workers=num_local_io_workers, - data_sampler=data_sampler, - data_parallel_world_size=data_parallel_world_size, - data_parallel_rank=data_parallel_rank, - dataloader_drop_last=self.dataloader_drop_last(), - deepspeed_dataloader_config=deepspeed_dataloader_config) + return DeepSpeedDataLoader(dataset=dataset, + batch_size=batch_size, + pin_memory=pin_memory, + collate_fn=collate_fn, + local_rank=self.local_rank, + tput_timer=deepspeed_io_timer, + num_local_io_workers=num_local_io_workers, + data_sampler=data_sampler, + data_parallel_world_size=data_parallel_world_size, + 
data_parallel_rank=data_parallel_rank, + dataloader_drop_last=self.dataloader_drop_last(), + deepspeed_dataloader_config=deepspeed_dataloader_config) def train(self, mode=True): r"""""" @@ -1755,9 +1654,7 @@ class DeepSpeedEngine(Module): else: scaled_loss = prescaled_loss if self.warn_unscaled_loss: - logger.warning( - f"DeepSpeed unable to scale loss because of type: {type(prescaled_loss)}" - ) + logger.warning(f"DeepSpeed unable to scale loss because of type: {type(prescaled_loss)}") self.warn_unscaled_loss = False return scaled_loss @@ -1775,9 +1672,8 @@ class DeepSpeedEngine(Module): else: see_memory_usage("Engine before forward", force=self.memory_breakdown()) - flops_profiler_active = (self.flops_profiler_enabled() and self.global_steps - == self.flops_profiler_profile_step() - and self.global_rank == 0) + flops_profiler_active = (self.flops_profiler_enabled() + and self.global_steps == self.flops_profiler_profile_step() and self.global_rank == 0) # used to check quantization happens at step 0! if self.global_steps == 0 and hasattr(self, "compression_scheduler"): @@ -1806,10 +1702,7 @@ class DeepSpeedEngine(Module): if self.module.training and self.curriculum_enabled_legacy(): self.curriculum_scheduler_legacy.update_difficulty(self.global_steps + 1) if self.curriculum_params_legacy()["curriculum_type"] == "seqlen": - kwargs.update({ - "curriculum_seqlen": - self.curriculum_scheduler_legacy.get_current_difficulty() - }) + kwargs.update({"curriculum_seqlen": self.curriculum_scheduler_legacy.get_current_difficulty()}) if self.module.training and self.random_ltd_enabled(): self.random_ltd_scheduler.update_seq(self.global_steps) @@ -1819,7 +1712,6 @@ class DeepSpeedEngine(Module): # we are in a forward pass. for module in self.module.modules(): module._parameters._in_forward = True - pass self._start_timers(self.engine_timers.forward_timers) @@ -1844,9 +1736,7 @@ class DeepSpeedEngine(Module): if self.autotuning_profile_model_info(): activation_mem = get_ma_status() - ma self.autotuning_model_info["activation_mem_per_gpu"] = activation_mem - print_json_dist(self.autotuning_model_info, - [0], - path=self.autotuning_model_info_path()) + print_json_dist(self.autotuning_model_info, [0], path=self.autotuning_model_info_path()) exit() else: see_memory_usage("Engine after forward", force=self.memory_breakdown()) @@ -1897,27 +1787,21 @@ class DeepSpeedEngine(Module): f'allreduce_gradients() is not valid when bfloat+pipeline_parallelism is enabled' # Pass (PP) gas boundary flag to optimizer (required for zero) - self.optimizer.is_gradient_accumulation_boundary = self.is_gradient_accumulation_boundary( - ) + self.optimizer.is_gradient_accumulation_boundary = self.is_gradient_accumulation_boundary() # ZeRO stage >= 2 communicates during non gradient accumulation boundaries as well if self.zero_optimization_partition_gradients(): self.optimizer.overlapping_partition_gradients_reduce_epilogue() # Communicate only at gradient accumulation boundaries elif self.is_gradient_accumulation_boundary(): - if self.zero_optimization_stage() == ZeroStageEnum.optimizer_states: - self.optimizer.reduce_gradients( - pipeline_parallel=self.pipeline_parallelism) + if self.zero_optimization_stage() == ZeroStageEnum.optimizer_states and hasattr( + self.optimizer, 'reduce_gradients'): + self.optimizer.reduce_gradients(pipeline_parallel=self.pipeline_parallelism) else: self.buffered_allreduce_fallback(elements_per_buffer=bucket_size) @instrument_w_nvtx - def backward(self, - loss, - allreduce_gradients=True, - 
release_loss=False, - retain_graph=False, - scale_wrt_gas=True): + def backward(self, loss, allreduce_gradients=True, release_loss=False, retain_graph=False, scale_wrt_gas=True): r"""Execute backward pass on the loss Arguments: loss: Torch tensor on which to execute backward propagation @@ -1932,9 +1816,7 @@ class DeepSpeedEngine(Module): scale_wrt_gas = self.scale_wrt_gas if not allreduce_gradients: - logger.warning( - f"Argument `allreduce_gradients` is deprecated, ignored, and will soon be removed" - ) + logger.warning(f"Argument `allreduce_gradients` is deprecated, ignored, and will soon be removed") # scale loss w.r.t. gradient accumulation if needed if self.gradient_accumulation_steps() > 1 and scale_wrt_gas: @@ -1959,16 +1841,13 @@ class DeepSpeedEngine(Module): self._start_timers(self.engine_timers.backward_inner_timers) if self.zero_optimization(): - self.optimizer.is_gradient_accumulation_boundary = self.is_gradient_accumulation_boundary( - ) + self.optimizer.is_gradient_accumulation_boundary = self.is_gradient_accumulation_boundary() self.optimizer.backward(loss, retain_graph=retain_graph) elif self.amp_enabled(): # AMP requires delaying unscale when inside gradient accumulation boundaries # https://nvidia.github.io/apex/advanced.html#gradient-accumulation-across-iterations delay_unscale = not self.is_gradient_accumulation_boundary() - with amp.scale_loss(loss, - self.optimizer, - delay_unscale=delay_unscale) as scaled_loss: + with amp.scale_loss(loss, self.optimizer, delay_unscale=delay_unscale) as scaled_loss: scaled_loss.backward(retain_graph=retain_graph) elif self.fp16_enabled(): if self.eigenvalue_enabled(): @@ -2051,22 +1930,17 @@ class DeepSpeedEngine(Module): param.grad = None def clip_fp32_gradients(self): - clip_grad_norm_(parameters=self.module.parameters(), - max_norm=self.gradient_clipping(), - mpu=self.mpu) + clip_grad_norm_(parameters=self.module.parameters(), max_norm=self.gradient_clipping(), mpu=self.mpu) def _take_model_step(self, lr_kwargs, block_eigenvalue={}): if self.gradient_clipping() > 0.0: - if not (self.fp16_enabled() or self.bfloat16_enabled() or self.amp_enabled() - or self.zero_optimization()): + if not (self.fp16_enabled() or self.bfloat16_enabled() or self.amp_enabled() or self.zero_optimization()): self.clip_fp32_gradients() elif self.amp_enabled(): # AMP's recommended way of doing clipping # https://nvidia.github.io/apex/advanced.html#gradient-clipping master_params = amp.master_params(self.optimizer) - clip_grad_norm_(parameters=master_params, - max_norm=self.gradient_clipping(), - mpu=self.mpu) + clip_grad_norm_(parameters=master_params, max_norm=self.gradient_clipping(), mpu=self.mpu) self.optimizer.step() if hasattr(self.optimizer, '_global_grad_norm'): @@ -2087,7 +1961,7 @@ class DeepSpeedEngine(Module): # the behaviour that we want if self.bfloat16_enabled(): # TODO: Temporary until bf16_optimizer and zero_optimizer are integrated - if self.zero_optimization(): + if self.zero_optimization() and hasattr(self.optimizer, "zero_grad"): self.optimizer.zero_grad() else: pass @@ -2132,8 +2006,7 @@ class DeepSpeedEngine(Module): # Check early because self.global_steps is incremented at some point here. # TODO: Delay self.global_steps increment until very end of this function. 
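# Hedged end-to-end sketch tying the backward()/step() changes above together. The
# engine scales the loss by 1/gradient_accumulation_steps inside backward() (when
# scale_wrt_gas is enabled) and only takes an optimizer step on accumulation
# boundaries; the tiny model and config below are illustrative only and satisfy
# train_batch_size == micro_batch_per_gpu * gradient_accumulation_steps * dp_world_size.
import torch
import deepspeed

model = torch.nn.Linear(10, 1)
ds_config = {
    "train_batch_size": 8,                  # 2 (micro) * 4 (gas) * 1 (data-parallel rank)
    "train_micro_batch_size_per_gpu": 2,
    "gradient_accumulation_steps": 4,
    "optimizer": {"type": "Adam", "params": {"lr": 1e-3}},
}

engine, _, _, _ = deepspeed.initialize(model=model,
                                       model_parameters=model.parameters(),
                                       config=ds_config)
for _ in range(8):
    x = torch.randn(2, 10).to(engine.device)
    loss = engine(x).pow(2).mean()   # forward pass through the wrapped module
    engine.backward(loss)            # loss scaled w.r.t. gradient accumulation internally
    engine.step()                    # optimizer/LR step only at accumulation boundaries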
flops_profiler_active = self.flops_profiler_enabled( - ) and self.global_steps == self.flops_profiler_profile_step( - ) and self.global_rank == 0 + ) and self.global_steps == self.flops_profiler_profile_step() and self.global_rank == 0 self._start_timers(self.engine_timers.step_timers) @@ -2148,20 +2021,16 @@ class DeepSpeedEngine(Module): if self.is_gradient_accumulation_boundary(): self.gas_boundary_ctr += 1 - if (self.eigenvalue_enabled() and - (self.gas_boundary_ctr % self.eigenvalue_gas_boundary_resolution() == 0) + if (self.eigenvalue_enabled() and (self.gas_boundary_ctr % self.eigenvalue_gas_boundary_resolution() == 0) and self.quantizer.any_precision_switch()): log_dist(f"computing eigenvalue...", ranks=[0]) - self.block_eigenvalue = self.eigenvalue.compute_eigenvalue( - self.module, - self.device, - self.optimizer.cur_scale) + self.block_eigenvalue = self.eigenvalue.compute_eigenvalue(self.module, self.device, + self.optimizer.cur_scale) if self.progressive_layer_drop: self.progressive_layer_drop.update_state(self.global_steps) - if (self.eigenvalue_enabled() and not self.gas_boundary_ctr % - self.eigenvalue_gas_boundary_resolution() + if (self.eigenvalue_enabled() and not self.gas_boundary_ctr % self.eigenvalue_gas_boundary_resolution() and self.quantizer.any_precision_switch()): self._take_model_step(lr_kwargs, self.block_eigenvalue) else: @@ -2169,8 +2038,7 @@ class DeepSpeedEngine(Module): report_progress = self.global_rank == 0 if self.global_rank else True - self.tput_timer.stop(global_step=self.is_gradient_accumulation_boundary(), - report_speed=report_progress) + self.tput_timer.stop(global_step=self.is_gradient_accumulation_boundary(), report_speed=report_progress) self._stop_timers(self.engine_timers.step_timers) @@ -2178,9 +2046,7 @@ class DeepSpeedEngine(Module): if self.monitor.enabled: if self.is_gradient_accumulation_boundary(): if self.global_rank == 0: - self.summary_events = [(f"Train/Samples/lr", - self.get_lr()[0], - self.global_samples)] + self.summary_events = [(f"Train/Samples/lr", self.get_lr()[0], self.global_samples)] if self.fp16_enabled() and hasattr(self.optimizer, "cur_scale"): self.summary_events.append(( @@ -2189,8 +2055,8 @@ class DeepSpeedEngine(Module): self.global_samples, )) - if (self.eigenvalue_enabled() and not self.gas_boundary_ctr % - self.eigenvalue_gas_boundary_resolution()): + if (self.eigenvalue_enabled() + and not self.gas_boundary_ctr % self.eigenvalue_gas_boundary_resolution()): ev_values = self.block_eigenvalue.values() for i in range(len(ev_values)): self.summary_events.append(( @@ -2214,14 +2080,12 @@ class DeepSpeedEngine(Module): ) self.flops_profiler.end_profile() - if self.autotuning_enabled() and self.global_steps == ( - self.autotuning_end_profile_step() + 1): + if self.autotuning_enabled() and self.global_steps == (self.autotuning_end_profile_step() + 1): self._autotuning_exit() if self.wall_clock_breakdown(): # Log micro timing and reset - self.timers.log(names=self.engine_timers.micro_timers, - memory_breakdown=self.memory_breakdown()) + self.timers.log(names=self.engine_timers.micro_timers, memory_breakdown=self.memory_breakdown()) if self.wall_clock_breakdown() or self.flops_profiler_enabled(): # Log global timing and reset @@ -2255,13 +2119,10 @@ class DeepSpeedEngine(Module): FORWARD_GLOBAL_TIMER, BACKWARD_GLOBAL_TIMER, STEP_GLOBAL_TIMER, - ], - reset=False) - titer = msg[FORWARD_GLOBAL_TIMER] + msg[BACKWARD_GLOBAL_TIMER] + msg[ - STEP_GLOBAL_TIMER] + ], reset=False) + titer = msg[FORWARD_GLOBAL_TIMER] + 
msg[BACKWARD_GLOBAL_TIMER] + msg[STEP_GLOBAL_TIMER] msg["latency"] = titer - msg["FLOPS_per_gpu"] = self.flops * 1_000_000 * self.gradient_accumulation_steps( - ) / titer + msg["FLOPS_per_gpu"] = self.flops * 1_000_000 * self.gradient_accumulation_steps() / titer msg["throughput"] = self.train_batch_size() * 1_000_000 / \ msg["latency"] print_json_dist(msg, [0], path=self.autotuning_metric_path()) @@ -2335,8 +2196,7 @@ class DeepSpeedEngine(Module): def _report_progress(self, step): lr = self.get_lr() mom = self.get_mom() - log_dist(f"step={step}, skipped={self.skipped_steps}, lr={lr}, mom={mom}", - ranks=[0]) + log_dist(f"step={step}, skipped={self.skipped_steps}, lr={lr}, mom={mom}", ranks=[0]) def allreduce_bucket(self, bucket, dp_group): tensor = self.flatten(bucket) @@ -2352,10 +2212,8 @@ class DeepSpeedEngine(Module): dist.all_reduce(tensor_to_allreduce, group=dp_group) if self.gradient_average: - if self.gradient_predivide_factor() != dist.get_world_size( - group=dp_group): - tensor_to_allreduce.mul_(self.gradient_predivide_factor() / - dist.get_world_size(group=dp_group)) + if self.gradient_predivide_factor() != dist.get_world_size(group=dp_group): + tensor_to_allreduce.mul_(self.gradient_predivide_factor() / dist.get_world_size(group=dp_group)) else: tensor_to_allreduce.mul_(1. / dist.get_world_size(group=dp_group)) dist.all_reduce(tensor_to_allreduce, group=dp_group) @@ -2397,9 +2255,7 @@ class DeepSpeedEngine(Module): # rank is reducing the same size. In some cases it may make # sense in the future to support the ability to average not # w.r.t. world size but with a different value. - param.grad = torch.zeros(param.size(), - dtype=param.dtype, - device=param.device) + param.grad = torch.zeros(param.size(), dtype=param.dtype, device=param.device) grad_data = param.grad.data if param_name in self.sparse_tensor_module_names or grad_data.is_sparse: @@ -2426,9 +2282,7 @@ class DeepSpeedEngine(Module): if bucket_type == SparseTensor.type(): self.sparse_allreduce_no_retain(bucket, dp_group=dp_group) else: - self.allreduce_no_retain(bucket, - dp_group=dp_group, - numel_per_bucket=elements_per_buffer) + self.allreduce_no_retain(bucket, dp_group=dp_group, numel_per_bucket=elements_per_buffer) def _reduce_expert_gradients(self, expert_grads, elements_per_buffer): for ep_name, expert_grads_group in expert_grads.items(): @@ -2436,15 +2290,12 @@ class DeepSpeedEngine(Module): for i, bucket_tuple in enumerate(expert_split_buckets): bucket_type, bucket = bucket_tuple if bucket_type == SparseTensor.type(): - self.sparse_allreduce_no_retain( - bucket, - groups._get_expert_data_parallel_group(ep_name)) + self.sparse_allreduce_no_retain(bucket, groups._get_expert_data_parallel_group(ep_name)) else: # Separate between diff groups - self.allreduce_no_retain( - bucket, - dp_group=groups._get_expert_data_parallel_group(ep_name), - numel_per_bucket=elements_per_buffer) + self.allreduce_no_retain(bucket, + dp_group=groups._get_expert_data_parallel_group(ep_name), + numel_per_bucket=elements_per_buffer) def buffered_allreduce_fallback(self, grads=None, elements_per_buffer=500000000): if grads is None: @@ -2487,8 +2338,7 @@ class DeepSpeedEngine(Module): if self.postscale_gradients(): if self.gradient_average: - values.mul_(self.gradient_predivide_factor() / - dist.get_world_size(group=dp_group)) + values.mul_(self.gradient_predivide_factor() / dist.get_world_size(group=dp_group)) else: values.mul_(1. 
/ dist.get_world_size(group=dp_group)) @@ -2509,36 +2359,25 @@ class DeepSpeedEngine(Module): if value.dim() == 1: if fill_size > 0: value = torch.cat([value, value.new_empty(fill_size)]) - tensor_list = [ - value.new_empty(max_size) - for _ in range(dist.get_world_size(group=dp_group)) - ] + tensor_list = [value.new_empty(max_size) for _ in range(dist.get_world_size(group=dp_group))] else: if fill_size > 0: value = torch.cat([value, value.new_empty(fill_size, value.size()[1])]) tensor_list = [ value.new_empty(max_size, - value.size()[1]) - for _ in range(dist.get_world_size(group=dp_group)) + value.size()[1]) for _ in range(dist.get_world_size(group=dp_group)) ] dist.all_gather(tensor_list, value, group=dp_group) tensors = [] for dev_idx, t in enumerate(tensor_list): size = all_sizes[dev_idx][0] - tensors.append( - t.index_select(0, - torch.arange(size, - dtype=torch.long, - device=self.device))) + tensors.append(t.index_select(0, torch.arange(size, dtype=torch.long, device=self.device))) return tensors def all_gather_scalar(self, value, dp_group): - tensor_list = [ - value.new_zeros(value.size()) - for _ in range(dist.get_world_size(group=dp_group)) - ] + tensor_list = [value.new_zeros(value.size()) for _ in range(dist.get_world_size(group=dp_group))] dist.all_gather(tensor_list, value, group=dp_group) return tensor_list @@ -2558,20 +2397,19 @@ class DeepSpeedEngine(Module): num_experts=1, checkpoint_engine=TorchCheckpointEngine()): if old_moe_load: - expp_rank = groups._get_expert_data_parallel_rank( - groups._get_max_expert_size_name()) + expp_rank = groups._get_expert_data_parallel_rank(groups._get_max_expert_size_name()) - num_local_experts = max( - num_experts) // groups._get_expert_parallel_world_size( - groups._get_max_expert_size_name()) + num_local_experts = max(num_experts) // groups._get_expert_parallel_world_size( + groups._get_max_expert_size_name()) for local_expert_id in range(num_local_experts): global_expert_id = expp_rank * num_local_experts + local_expert_id - expert_state_dict = checkpoint_engine.load(DeepSpeedEngine._get_expert_ckpt_name( - checkpoint_path, - -1, # -1 means ignore layer_id - global_expert_id, - tag, - mpu), + expert_state_dict = checkpoint_engine.load( + DeepSpeedEngine._get_expert_ckpt_name( + checkpoint_path, + -1, # -1 means ignore layer_id + global_expert_id, + tag, + mpu), map_location=torch.device('cpu')) # Updating global -> local expert ids @@ -2592,41 +2430,45 @@ class DeepSpeedEngine(Module): # loop all local_experts for local_expert_id in range(num_local_experts): global_expert_id = expp_rank * num_local_experts + local_expert_id - expert_state_dict = checkpoint_engine.load( - DeepSpeedEngine._get_expert_ckpt_name( - checkpoint_path, - moe_layer_id, - global_expert_id, - tag, - mpu), - map_location=torch.device('cpu')) + expert_state_dict = checkpoint_engine.load(DeepSpeedEngine._get_expert_ckpt_name( + checkpoint_path, moe_layer_id, global_expert_id, tag, mpu), + map_location=torch.device('cpu')) # print(expert_state_dict.keys()) # Updating global -> local expert ids moe_str_prefix = '.deepspeed_moe.experts.deepspeed_experts.' 
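# Editor's note (hedged): the key rewrite below maps global expert ids to local ones.
# For example, with 8 experts across 4 expert-parallel ranks, num_local_experts == 2
# and expp_rank 1 loads global experts 2 and 3 (expp_rank * num_local_experts +
# local_expert_id), renaming '.deepspeed_moe.experts.deepspeed_experts.2.' and '...3.'
# to local ids 0 and 1 before merging into state_dict.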
for key in list(expert_state_dict.keys()): - local_key = key.replace( - f'{moe_str_prefix}{global_expert_id}', - f'{moe_str_prefix}{local_expert_id}') + local_key = key.replace(f'{moe_str_prefix}{global_expert_id}', + f'{moe_str_prefix}{local_expert_id}') expert_state_dict[local_key] = expert_state_dict.pop(key) state_dict.update(expert_state_dict) moe_layer_id += 1 - def load_module_state_dict(self, state_dict, strict=True, custom_load_fn=None): + def load_module_state_dict(self, checkpoint, strict=True, custom_load_fn=None): + module_state_dict = checkpoint['module'] if custom_load_fn: - custom_load_fn(src=state_dict, dst=self.module) + custom_load_fn(src=module_state_dict, dst=self.module) else: - self.module.load_state_dict(state_dict, # TODO - strict=strict) + self.module.load_state_dict( + module_state_dict, # TODO + strict=strict) + + if checkpoint.get(FROZEN_PARAM_FRAGMENTS, None) is not None: + saved_frozen_params = checkpoint[FROZEN_PARAM_FRAGMENTS] + for param in self.module.parameters(): + if param.requires_grad: + continue + if param not in self.param_names: + raise ValueError(f"failed to find frozen {param} in named params") + name = self.param_names[param] + if hasattr(param, 'ds_id'): + param.ds_tensor.data.copy_(saved_frozen_params[name].data) + else: + param.data.copy_(saved_frozen_params[name].data) def _get_zero_ckpt_prefix(self, dp_rank, bf16_mode): return f'{"bf16_" if bf16_mode else ""}zero_pp_rank_{dp_rank}' - def _get_rank_zero_ckpt_name(self, - checkpoints_path, - tag, - mp_rank, - dp_rank, - bf16_mode): + def _get_rank_zero_ckpt_name(self, checkpoints_path, tag, mp_rank, dp_rank, bf16_mode): file_prefix = self._get_zero_ckpt_prefix(dp_rank, bf16_mode=bf16_mode) zero_ckpt_name = os.path.join( checkpoints_path, @@ -2639,11 +2481,7 @@ class DeepSpeedEngine(Module): mp_rank = 0 if self.mpu is None else self.mpu.get_model_parallel_rank() pp_rank = dist.get_rank(group=self.optimizer.dp_process_group) bf16_mode = self.bfloat16_enabled() - return self._get_rank_zero_ckpt_name(checkpoints_path, - tag, - mp_rank, - pp_rank, - bf16_mode) + return self._get_rank_zero_ckpt_name(checkpoints_path, tag, mp_rank, pp_rank, bf16_mode) def _get_ckpt_name(self, checkpoints_path, tag, mp_placeholder=None): if mp_placeholder is not None: @@ -2653,8 +2491,7 @@ class DeepSpeedEngine(Module): mp_rank_str = f"{mp_rank:02d}" if self.zero_optimization_partition_weights(): - filename = "zero_pp_rank_{}".format( - dist.get_rank(group=self.optimizer.dp_process_group)) + filename = "zero_pp_rank_{}".format(dist.get_rank(group=self.optimizer.dp_process_group)) ckpt_name = os.path.join( checkpoints_path, str(tag), @@ -2670,10 +2507,8 @@ class DeepSpeedEngine(Module): def _get_optimizer_ckpt_name(self, checkpoints_path, tag, expp_rank): mp_rank = 0 if self.mpu is None else self.mpu.get_model_parallel_rank() - ckpt_name = os.path.join( - checkpoints_path, - str(tag), - f'expp_rank_{expp_rank}_mp_rank_{mp_rank:02d}_optim_states.pt') + ckpt_name = os.path.join(checkpoints_path, str(tag), + f'expp_rank_{expp_rank}_mp_rank_{mp_rank:02d}_optim_states.pt') return ckpt_name @staticmethod @@ -2681,24 +2516,17 @@ class DeepSpeedEngine(Module): mp_rank = 0 if mpu is None else mpu.get_model_parallel_rank() if layer_id <= -1: # Used to support old checkpoint loading - ckpt_name = os.path.join( - checkpoints_path, - '' if tag is None else str(tag), - f'expert_{expert_id}_mp_rank_{mp_rank:02d}_model_states.pt') + ckpt_name = os.path.join(checkpoints_path, '' if tag is None else str(tag), + 
f'expert_{expert_id}_mp_rank_{mp_rank:02d}_model_states.pt') else: # Used to support new checkpoint loading - ckpt_name = os.path.join( - checkpoints_path, - '' if tag is None else str(tag), - f'layer_{layer_id}_expert_{expert_id}_mp_rank_{mp_rank:02d}_model_states.pt' - ) + ckpt_name = os.path.join(checkpoints_path, '' if tag is None else str(tag), + f'layer_{layer_id}_expert_{expert_id}_mp_rank_{mp_rank:02d}_model_states.pt') return ckpt_name def _get_all_ckpt_names(self, checkpoints_path, tag): # It is required that (checkpoints_path, tag) are consistent among all ranks. - ckpt_file_pattern = self._get_ckpt_name(checkpoints_path, - tag, - mp_placeholder="*") + ckpt_file_pattern = self._get_ckpt_name(checkpoints_path, tag, mp_placeholder="*") import glob ckpt_files = glob.glob(ckpt_file_pattern) @@ -2738,17 +2566,14 @@ class DeepSpeedEngine(Module): """ if tag is None: - latest_tag = "latest_universal" if self.load_universal_checkpoint( - ) else "latest" + latest_tag = "latest_universal" if self.load_universal_checkpoint() else "latest" latest_path = os.path.join(load_dir, latest_tag) if os.path.isfile(latest_path): with open(latest_path, "r") as fd: tag = fd.read().strip() else: if self.load_universal_checkpoint(): - raise ValueError( - f'Invalid for universal checkpoint: {latest_path} does not exist' - ) + raise ValueError(f'Invalid for universal checkpoint: {latest_path} does not exist') else: logger.warning( f"Unable to find latest file at {latest_path}, if trying to load latest " @@ -2770,10 +2595,7 @@ class DeepSpeedEngine(Module): load_zero_checkpoint = self.zero_optimization() or self.bfloat16_enabled() if load_zero_checkpoint and load_path is not None: - success = self._load_zero_checkpoint( - load_dir, - tag, - load_optimizer_states=load_optimizer_states) + success = self._load_zero_checkpoint(load_dir, tag, load_optimizer_states=load_optimizer_states) if not success: self.optimizer._restore_from_bit16_weights() @@ -2794,16 +2616,12 @@ class DeepSpeedEngine(Module): from deepspeed.runtime.state_dict_factory import SDLoaderFactory ckpt_list = self._get_all_ckpt_names(load_dir, tag) - sd_loader = SDLoaderFactory.get_sd_loader( - ckpt_list, - checkpoint_engine=self.checkpoint_engine) + sd_loader = SDLoaderFactory.get_sd_loader(ckpt_list, checkpoint_engine=self.checkpoint_engine) is_pipe_parallel = isinstance(self.module, PipelineModule) mp_rank = 0 if self.mpu is None else self.mpu.get_model_parallel_rank() - load_path, checkpoint, _ = sd_loader.load( - self.mp_world_size, mp_rank, is_pipe_parallel=is_pipe_parallel - ) + load_path, checkpoint, _ = sd_loader.load(self.mp_world_size, mp_rank, is_pipe_parallel=is_pipe_parallel) if checkpoint is None: return None, None @@ -2826,7 +2644,7 @@ class DeepSpeedEngine(Module): num_experts=self.num_experts, checkpoint_engine=self.checkpoint_engine) if not self.load_universal_checkpoint(): - self.load_module_state_dict(state_dict=checkpoint['module'], + self.load_module_state_dict(checkpoint=checkpoint, strict=load_module_strict, custom_load_fn=custom_load_fn) @@ -2841,38 +2659,29 @@ class DeepSpeedEngine(Module): largest_group_name = groups._get_max_expert_size_name() expp_rank = groups._get_expert_parallel_rank(largest_group_name) optim_load_path = self._get_optimizer_ckpt_name(load_dir, tag, expp_rank) - optim_checkpoint = self.checkpoint_engine.load( - optim_load_path, - map_location=torch.device('cpu')) + optim_checkpoint = self.checkpoint_engine.load(optim_load_path, map_location=torch.device('cpu')) else: optim_checkpoint = checkpoint 
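A minimal standalone sketch of the frozen-parameter round trip that the new `FROZEN_PARAM_FRAGMENTS` handling implements (the toy `torch.nn.Linear` module and the `param_names` mapping below are illustrative assumptions, not part of this patch): on save, fragments of parameters with `requires_grad == False` are stored under their names; on load, `load_module_state_dict` copies them back into the matching frozen parameters.

    import torch
    from collections import OrderedDict

    module = torch.nn.Linear(4, 4)
    module.bias.requires_grad_(False)          # pretend the bias is a frozen parameter
    param_names = {p: n for n, p in module.named_parameters()}

    # save side: collect a CPU copy of every parameter that does not require grad
    frozen_param_fragments = OrderedDict(
        (param_names[p], p.detach().cpu()) for p in module.parameters() if not p.requires_grad)

    # load side: copy the saved fragments back into the matching frozen parameters
    for p in module.parameters():
        if not p.requires_grad:
            p.data.copy_(frozen_param_fragments[param_names[p]].data)

In the actual engine the ZeRO-3 case copies into `param.ds_tensor` rather than `param.data`, as the diff above shows.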
- has_zero_optimizer_state = self.zero_optimization() or self.bfloat16_enabled( - ) + has_zero_optimizer_state = self.zero_optimization() or self.bfloat16_enabled() if load_optimizer_states and self.optimizer is not None and not has_zero_optimizer_state: if self.fp16_enabled(): - self.optimizer.load_state_dict( - optim_checkpoint['optimizer'], - load_optimizer_states=load_optimizer_states) + self.optimizer.load_state_dict(optim_checkpoint['optimizer'], + load_optimizer_states=load_optimizer_states) else: self.optimizer.load_state_dict(optim_checkpoint['optimizer']) if load_lr_scheduler_states and self.lr_scheduler is not None: self.lr_scheduler.load_state_dict(checkpoint['lr_scheduler']) - if self.random_ltd_enabled( - ) and self.random_ltd_scheduler is not None and 'random_ltd' in checkpoint: + if self.random_ltd_enabled() and self.random_ltd_scheduler is not None and 'random_ltd' in checkpoint: self.random_ltd_scheduler.load_state_dict(checkpoint['random_ltd']) if self.training_dataloader is not None and self.curriculum_learning_enabled( ) and 'data_sampler' in checkpoint: - self.training_dataloader.data_sampler.load_state_dict( - checkpoint['data_sampler']) + self.training_dataloader.data_sampler.load_state_dict(checkpoint['data_sampler']) - def get_sparse_tensor_module_names(original_set, - loaded_set, - original_parameters, - loaded_parameters): + def get_sparse_tensor_module_names(original_set, loaded_set, original_parameters, loaded_parameters): result = set() for name in original_set: @@ -2882,8 +2691,7 @@ class DeepSpeedEngine(Module): for name in loaded_set: if name in original_parameters: - result.add( - name) # parameter exists in both configs and it was sparse + result.add(name) # parameter exists in both configs and it was sparse return result @@ -2898,26 +2706,16 @@ class DeepSpeedEngine(Module): self.sparse_tensor_module_names = sparse_tensor_module_names else: self.sparse_tensor_module_names = get_sparse_tensor_module_names( - self.sparse_tensor_module_names, - sparse_tensor_module_names, - dict(self.module.named_parameters()), - checkpoint["module"]) + self.sparse_tensor_module_names, sparse_tensor_module_names, + dict(self.module.named_parameters()), checkpoint["module"]) self.global_steps = checkpoint['global_steps'] - self.global_samples = checkpoint.get( - 'global_samples', - self.global_steps * self.train_batch_size()) + self.global_samples = checkpoint.get('global_samples', self.global_steps * self.train_batch_size()) self.skipped_steps = checkpoint['skipped_steps'] self.loaded_checkpoint_mp_world_size = checkpoint['mp_world_size'] deepspeed_states = [ - 'module', - 'sparse_tensor_module_names', - 'skipped_steps', - 'global_steps', - 'dp_world_size', - 'mp_world_size', - 'data_sampler', - 'random_ltd' + 'module', 'sparse_tensor_module_names', 'skipped_steps', 'global_steps', 'dp_world_size', + 'mp_world_size', 'data_sampler', 'random_ltd' ] client_state = {} @@ -2926,11 +2724,7 @@ class DeepSpeedEngine(Module): if load_optimizer_states: deepspeed_states.append('optimizer') - client_state = { - key: value - for key, - value in checkpoint.items() if not key in deepspeed_states - } + client_state = {key: value for key, value in checkpoint.items() if not key in deepspeed_states} if not load_optimizer_states and not load_module_only: client_state['optimizer'] = optim_checkpoint['optimizer'] @@ -2953,28 +2747,18 @@ class DeepSpeedEngine(Module): if zero_sd_list is None: return False - self.optimizer.load_state_dict( - state_dict_list=zero_sd_list, - 
load_optimizer_states=load_optimizer_states, - load_from_fp32_weights=self.zero_load_from_fp32_weights(), - checkpoint_folder=checkpoint_folder) + self.optimizer.load_state_dict(state_dict_list=zero_sd_list, + load_optimizer_states=load_optimizer_states, + load_from_fp32_weights=self.zero_load_from_fp32_weights(), + checkpoint_folder=checkpoint_folder) if self.load_universal_checkpoint(): - logger.info( - f'loaded universal zero checkpoints from {checkpoint_folder} for rank {self.global_rank}' - ) + logger.info(f'loaded universal zero checkpoints from {checkpoint_folder} for rank {self.global_rank}') else: - logger.info( - f"loading {len(zero_sd_list)} zero partition checkpoints for rank {self.global_rank}" - ) + logger.info(f"loading {len(zero_sd_list)} zero partition checkpoints for rank {self.global_rank}") return True - def _get_mp_rank_zero_checkpoint_names(self, - load_dir, - tag, - mp_rank, - dp_world_size, - bf16_mode): + def _get_mp_rank_zero_checkpoint_names(self, load_dir, tag, mp_rank, dp_world_size, bf16_mode): zero_ckpt_names = [] for dp_rank in range(dp_world_size): ckpt_name = self._get_rank_zero_ckpt_name(checkpoints_path=load_dir, @@ -2988,18 +2772,16 @@ class DeepSpeedEngine(Module): def _get_all_zero_checkpoint_names(self, load_dir, tag, bf16_mode): mp_rank = 0 if self.mpu is None else self.mpu.get_model_parallel_rank() - zero_ckpt_names = self._get_mp_rank_zero_checkpoint_names( - load_dir=load_dir, - tag=tag, - mp_rank=mp_rank, - dp_world_size=self.loaded_checkpoint_dp_world_size, - bf16_mode=bf16_mode) + zero_ckpt_names = self._get_mp_rank_zero_checkpoint_names(load_dir=load_dir, + tag=tag, + mp_rank=mp_rank, + dp_world_size=self.loaded_checkpoint_dp_world_size, + bf16_mode=bf16_mode) for i, ckpt_name in enumerate(zero_ckpt_names): if not os.path.exists(ckpt_name): # transparently handle the old file pattern for optim_states if "optim_states.pt" in ckpt_name: - ckpt_name_try = ckpt_name.replace("_optim_states.pt", - "optim_states.pt") + ckpt_name_try = ckpt_name.replace("_optim_states.pt", "optim_states.pt") if os.path.exists(ckpt_name_try): zero_ckpt_names[i] = ckpt_name_try continue @@ -3013,8 +2795,7 @@ class DeepSpeedEngine(Module): if ckpt_name is None: _state = {OPTIMIZER_STATE_DICT: None} # Fully load state for current rank - elif self.zero_elastic_checkpoint() or dist.get_rank( - group=self.optimizer.dp_process_group) == i: + elif self.zero_elastic_checkpoint() or dist.get_rank(group=self.optimizer.dp_process_group) == i: _state = self.checkpoint_engine.load( ckpt_name, map_location='cpu', @@ -3024,25 +2805,18 @@ class DeepSpeedEngine(Module): zero_sd_list.append(_state) zero_optimizer_sd = [sd[OPTIMIZER_STATE_DICT] for sd in zero_sd_list] - logger.info( - f"successfully read {len(zero_optimizer_sd)} ZeRO state_dicts for rank {self.global_rank}" - ) + logger.info(f"successfully read {len(zero_optimizer_sd)} ZeRO state_dicts for rank {self.global_rank}") return zero_optimizer_sd def _get_all_zero_checkpoints(self, load_dir, tag): for bf16_mode in [self.bfloat16_enabled(), not self.bfloat16_enabled()]: - zero_ckpt_names = self._get_all_zero_checkpoint_names( - load_dir, - tag, - bf16_mode) + zero_ckpt_names = self._get_all_zero_checkpoint_names(load_dir, tag, bf16_mode) if zero_ckpt_names is not None: # Warn if loading checkpoint of different bit16 type if bf16_mode is not self.bfloat16_enabled(): checkpoint_bit16 = BFLOAT16 if bf16_mode else FP16 engine_bit16 = BFLOAT16 if self.bfloat16_enabled() else FP16 - logger.warn( - f'Loading {checkpoint_bit16} zero 
checkpoints into {engine_bit16} training engine' - ) + logger.warn(f'Loading {checkpoint_bit16} zero checkpoints into {engine_bit16} training engine') return self._get_all_zero_checkpoint_state_dicts(zero_ckpt_names) return None @@ -3056,10 +2830,9 @@ class DeepSpeedEngine(Module): dist.all_reduce(max_bhash, op=dist.ReduceOp.MAX) dist.all_reduce(min_bhash, op=dist.ReduceOp.MIN) valid = all(min_bhash == bhash) and all(max_bhash == bhash) - msg = ( - f"[rank={dist.get_rank()}] The checkpoint tag name '{tag}' is not consistent across " - "all ranks. Including rank unique information in checkpoint tag could cause issues when " - "restoring with different world sizes.") + msg = (f"[rank={dist.get_rank()}] The checkpoint tag name '{tag}' is not consistent across " + "all ranks. Including rank unique information in checkpoint tag could cause issues when " + "restoring with different world sizes.") if self.checkpoint_tag_validation_fail(): assert valid, msg elif not valid: @@ -3090,7 +2863,7 @@ class DeepSpeedEngine(Module): # There seems to be issue creating them in parallel # Ensure save_dir directory exists - os.makedirs(save_dir, exist_ok=True) + self.checkpoint_engine.makedirs(save_dir, exist_ok=True) dist.barrier() if tag is None: @@ -3191,15 +2964,9 @@ class DeepSpeedEngine(Module): # let save the moe parameters for global_expert_id, expert_state_dict in experts_state_dict.items(): # save the moe parameters - moe_save_path = self._get_expert_ckpt_name( - save_dir, - moe_layer_id, - global_expert_id, - tag, - self.mpu) + moe_save_path = self._get_expert_ckpt_name(save_dir, moe_layer_id, global_expert_id, tag, self.mpu) if self.random_ltd_enabled(): - expert_state_dict = remove_random_ltd_state_dict( - expert_state_dict) + expert_state_dict = remove_random_ltd_state_dict(expert_state_dict) self.checkpoint_engine.save(expert_state_dict, moe_save_path) moe_layer_id += 1 @@ -3217,9 +2984,7 @@ class DeepSpeedEngine(Module): # Save optimizer states. They are different across each exp parallel rank. 
optimizer_state = { - 'optimizer': - self.optimizer.state_dict() - if self.optimizer and not self.zero_optimization() else None + 'optimizer': self.optimizer.state_dict() if self.optimizer and not self.zero_optimization() else None } # TODO: why use BufferedWriter not the path file_path = self._get_optimizer_ckpt_name(save_dir, tag, expp_rank) @@ -3234,15 +2999,12 @@ class DeepSpeedEngine(Module): 'module': model_state_dict, 'lr_scheduler': - self.lr_scheduler.state_dict() - if self.lr_scheduler is not None else None, + self.lr_scheduler.state_dict() if self.lr_scheduler is not None else None, 'data_sampler': self.training_dataloader.data_sampler.state_dict() if - (self.training_dataloader is not None - and self.curriculum_learning_enabled()) else None, + (self.training_dataloader is not None and self.curriculum_learning_enabled()) else None, 'random_ltd': - self.random_ltd_scheduler.state_dict() - if self.random_ltd_enabled() else None, + self.random_ltd_scheduler.state_dict() if self.random_ltd_enabled() else None, 'sparse_tensor_module_names': self.sparse_tensor_module_names, 'skipped_steps': @@ -3264,11 +3026,11 @@ class DeepSpeedEngine(Module): self._curr_save_path = None def _create_checkpoint_file(self, save_dir, tag, zero_checkpoint): - name_function = (self._get_zero_ckpt_name - if zero_checkpoint else self._get_ckpt_name) + name_function = (self._get_zero_ckpt_name if zero_checkpoint else self._get_ckpt_name) try: checkpoint_name = name_function(save_dir, tag) - ensure_directory_exists(checkpoint_name) + path = os.path.dirname(checkpoint_name) + self.checkpoint_engine.makedirs(path, exist_ok=True) except: logger.error(f"Failed saving model checkpoint to {save_dir} with tag {tag}") return False @@ -3292,6 +3054,8 @@ class DeepSpeedEngine(Module): zero_optimizer_state = self.zero_optimization() or self.bfloat16_enabled() + save_frozen_param = self.zero_optimization_partition_gradients() + # A hack to save the checkpointing directory. Pipeline parallelism overrides # module_state_dict() and uses this path to save the model. module_state_dict() # then instead just returns None. 
The module_state_dict() implementation in @@ -3302,17 +3066,17 @@ class DeepSpeedEngine(Module): state = dict(module=module, buffer_names=self._get_buffer_names(), - optimizer=self.optimizer.state_dict() - if self.optimizer and not zero_optimizer_state else None, - param_shapes=self._get_zero_param_shapes() - if self.optimizer and zero_optimizer_state else None, - lr_scheduler=self.lr_scheduler.state_dict() - if self.lr_scheduler is not None else None, + optimizer=self.optimizer.state_dict() if self.optimizer and not zero_optimizer_state else None, + param_shapes=self._get_zero_param_shapes() if self.optimizer and zero_optimizer_state else None, + frozen_param_shapes=self._get_zero_frozen_param_attributes(self._get_param_shape_func) + if save_frozen_param else None, + shared_params=self._get_shared_params() if self.optimizer and zero_optimizer_state else None, + frozen_param_fragments=self._get_zero_frozen_param_attributes(self._get_param_fragment_func) + if save_frozen_param else None, + lr_scheduler=self.lr_scheduler.state_dict() if self.lr_scheduler is not None else None, data_sampler=self.training_dataloader.data_sampler.state_dict() if - (self.training_dataloader is not None - and self.curriculum_learning_enabled()) else None, - random_ltd=self.random_ltd_scheduler.state_dict() - if self.random_ltd_enabled() else None, + (self.training_dataloader is not None and self.curriculum_learning_enabled()) else None, + random_ltd=self.random_ltd_scheduler.state_dict() if self.random_ltd_enabled() else None, sparse_tensor_module_names=self.sparse_tensor_module_names, skipped_steps=self.skipped_steps, global_steps=self.global_steps, @@ -3348,6 +3112,25 @@ class DeepSpeedEngine(Module): return buffer_names + def _get_param_shape_func(self, param): + return param.ds_shape if hasattr(param, 'ds_id') else param.shape + + def _get_param_fragment_func(self, param): + return param.ds_tensor.detach().cpu() if hasattr(param, 'ds_id') else param.detach().cpu() + + def _get_zero_frozen_param_attributes(self, attr_func): + frozen_param_fragments = OrderedDict() + + for param in self.module.parameters(): + if param.requires_grad: + continue + if param not in self.param_names: + raise ValueError(f"failed to find frozen {param} in named params") + name = self.param_names[param] + frozen_param_fragments[name] = attr_func(param) + + return frozen_param_fragments + def _get_zero_param_shapes(self): """Returns a dict of name to shape mapping, only for the flattened fp32 weights saved by the optimizer. the names are exactly as in state_dict. The order is absolutely important, since @@ -3390,6 +3173,40 @@ class DeepSpeedEngine(Module): return param_group_shapes + def _get_shared_params(self): + """ + Returns a dict of shared params, which can later be used to reconstruct the original state dict, + e.g. in `zero_to_fp32`. Each dict entry is a pair of param names, where the key is the name + of the variable that isn't stored and the value is the actual param holding data. 
+ """ + shared_ds_ids = {} + shared_params_by_full_name = {} + + def get_layer_state_dict(module, prefix=""): + # handle params + for name, param in module.named_parameters(recurse=False): + if param is None or not hasattr(param, "ds_id"): + continue + key = prefix + name + # can't rely on param.data_ptr() as it will be reused as weights gets + # gathered and reduced, but param.ds_id is unique across all zero weights + # (and shared params will have the same param.ds_id) + if param.ds_id in shared_ds_ids: + # shared weights + #print(f"`{key}` is shared with `{shared_ds_ids[param.ds_id]}`") + shared_params_by_full_name[key] = shared_ds_ids[param.ds_id] + else: + shared_ds_ids[param.ds_id] = key + + for name, child in module.named_children(): + if child is not None: + get_layer_state_dict(child, prefix + name + ".") + + if dist.get_rank() == 0: + get_layer_state_dict(self.module, prefix="") + + return shared_params_by_full_name + def _copy_recovery_script(self, save_path): base_dir = os.path.dirname(os.path.dirname(__file__)) script = "zero_to_fp32.py" @@ -3402,9 +3219,7 @@ class DeepSpeedEngine(Module): def _save_zero_checkpoint(self, save_path, tag): zero_checkpoint_name = self._get_zero_ckpt_name(save_path, tag) - zero_sd = dict(optimizer_state_dict=self.optimizer.state_dict(), - ds_config=self.config, - ds_version=version) + zero_sd = dict(optimizer_state_dict=self.optimizer.state_dict(), ds_config=self.config, ds_version=version) self.checkpoint_engine.save(zero_sd, zero_checkpoint_name) if self.global_rank == 0: @@ -3434,9 +3249,7 @@ class DeepSpeedEngine(Module): # gather one layer at a time to be memory-efficient # must use modifier_rank=0 to release GPU memory after each layer gathered #see_memory_usage("before GatheredParameters", force=True) - with deepspeed.zero.GatheredParameters(list( - module.parameters(recurse=False)), - modifier_rank=0): + with deepspeed.zero.GatheredParameters(list(module.parameters(recurse=False)), modifier_rank=0): if dist.get_rank() == 0: # handle params for name, param in module.named_parameters(recurse=False): @@ -3457,8 +3270,7 @@ class DeepSpeedEngine(Module): # now buffers - not sure if need to take care of potentially shared weights here for name, buf in module.named_buffers(recurse=False): - if (buf is not None - and name not in module._non_persistent_buffers_set): + if (buf is not None and name not in module._non_persistent_buffers_set): state_dict[prefix + name] = buf.detach().cpu() #see_memory_usage("after GatheredParameters", force=True) @@ -3511,15 +3323,29 @@ class DeepSpeedEngine(Module): else: # the model will be bogus if not consolidated so don't confuse the user by saving it logger.info( - f"Did not save the model {path} because `stage3_gather_16bit_weights_on_model_save` is False" - ) + f"Did not save the model {path} because `stage3_gather_16bit_weights_on_model_save` is False") return False else: state_dict = self.module.state_dict() + tag = f"global_step{self.global_steps}" + tag = str(tag) + self.checkpoint_engine.create(tag) + if dist.get_rank() == 0: - os.makedirs(save_dir, exist_ok=True) - logger.info(f"Saving model weights to {path}") + self.checkpoint_engine.makedirs(save_dir, exist_ok=True) + logger.info(f"Saving model weights to {path}, tag: {tag}") self.checkpoint_engine.save(state_dict, path) + self.checkpoint_engine.commit(tag) + return True + + def empty_partition_cache(self): + """ + Release GPU memory consumed by offloaded model parameters. 
+ """ + if hasattr(self.optimizer, 'empty_partition_cache'): + self.optimizer.empty_partition_cache() + gc.collect() + get_accelerator().empty_cache() diff --git a/deepspeed/runtime/fp16/__init__.py b/deepspeed/runtime/fp16/__init__.py index fcb45ab2b68516814a4bfbffebf2e01cbfefd527..6c5067f71c8faf166bc78e88f9b62e8627dda7c7 100644 --- a/deepspeed/runtime/fp16/__init__.py +++ b/deepspeed/runtime/fp16/__init__.py @@ -1 +1,5 @@ +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team '''Copyright The Microsoft DeepSpeed Team''' diff --git a/deepspeed/runtime/fp16/fused_optimizer.py b/deepspeed/runtime/fp16/fused_optimizer.py index 4f4b5cfa7f2dbf5c088e4f91f5285bf9acd1062e..7fb9c9daf5c921ad6c1e498d6e92e37a7532acf0 100755 --- a/deepspeed/runtime/fp16/fused_optimizer.py +++ b/deepspeed/runtime/fp16/fused_optimizer.py @@ -1,9 +1,11 @@ -''' -Copyright 2019 The Microsoft DeepSpeed Team +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 +# DeepSpeed Team +""" Copyright NVIDIA/apex This file is adapted from FP16_Optimizer in NVIDIA/apex -''' +""" import torch from torch._utils import _flatten_dense_tensors, _unflatten_dense_tensors @@ -23,6 +25,7 @@ class FP16_Optimizer(DeepSpeedOptimizer): For usage example please see, TODO: DeepSpeed V2 Tutorial """ + def __init__(self, init_optimizer, deepspeed=None, @@ -58,20 +61,15 @@ class FP16_Optimizer(DeepSpeedOptimizer): # push this group to list before modify self.fp16_groups.append(param_group['params']) # init fp16 weight buffer, flattened - self.fp16_groups_flat.append( - _flatten_dense_tensors([p.clone().detach() - for p in self.fp16_groups[i]])) + self.fp16_groups_flat.append(_flatten_dense_tensors([p.clone().detach() for p in self.fp16_groups[i]])) # set model fp16 weight to slices of flattened buffer - updated_params = _unflatten_dense_tensors(self.fp16_groups_flat[i], - self.fp16_groups[i]) + updated_params = _unflatten_dense_tensors(self.fp16_groups_flat[i], self.fp16_groups[i]) for p, q in zip(self.fp16_groups[i], updated_params): p.data = q.data # init master weight, flattened - self.fp32_groups_flat.append( - self.fp16_groups_flat[i].clone().float().detach()) + self.fp32_groups_flat.append(self.fp16_groups_flat[i].clone().float().detach()) # modify optimizer of have flat master weight - self.fp32_groups_flat[ - i].requires_grad = True # keep this in case internal optimizer uses it + self.fp32_groups_flat[i].requires_grad = True # keep this in case internal optimizer uses it param_group['params'] = [self.fp32_groups_flat[i]] # we may have a way of fusing dynamic scale. 
Do not support for now @@ -113,16 +111,13 @@ class FP16_Optimizer(DeepSpeedOptimizer): self.mpu = mpu self.overflow = False - self.overflow_checker = CheckOverflow(self.fp16_groups, - mpu=self.mpu, - deepspeed=deepspeed) + self.overflow_checker = CheckOverflow(self.fp16_groups, mpu=self.mpu, deepspeed=deepspeed) self.initialize_optimizer_states() def initialize_optimizer_states(self): for i, group in enumerate(self.fp16_groups): - self.fp32_groups_flat[i].grad = torch.zeros( - self.fp32_groups_flat[i].size(), - device=self.fp32_groups_flat[i].device) + self.fp32_groups_flat[i].grad = torch.zeros(self.fp32_groups_flat[i].size(), + device=self.fp32_groups_flat[i].device) self.optimizer.step() @@ -156,10 +151,7 @@ class FP16_Optimizer(DeepSpeedOptimizer): for i, group in enumerate(self.fp16_groups): grads_groups_flat.append( _flatten_dense_tensors([ - torch.zeros(p.size(), - dtype=p.dtype, - device=p.device) if p.grad is None else p.grad - for p in group + torch.zeros(p.size(), dtype=p.dtype, device=p.device) if p.grad is None else p.grad for p in group ])) norm_groups.append(get_weight_norm(grads_groups_flat[i], mpu=self.mpu)) @@ -169,17 +161,13 @@ class FP16_Optimizer(DeepSpeedOptimizer): if self.overflow: if self.verbose: - logger.info( - "[deepspeed] fp16 dynamic loss scale overflow! Skipping step. Attempted loss " - "scale: {}, reducing to {}".format(prev_scale, - self.cur_scale)) + logger.info("[deepspeed] fp16 dynamic loss scale overflow! Skipping step. Attempted loss " + "scale: {}, reducing to {}".format(prev_scale, self.cur_scale)) return self.overflow scaled_grad_norm = get_global_norm(norm_list=norm_groups) - combined_scale = self.unscale_and_clip_grads(grads_groups_flat, - scaled_grad_norm, - apply_scale=False) + combined_scale = self.unscale_and_clip_grads(grads_groups_flat, scaled_grad_norm, apply_scale=False) # Stash unscaled gradient norm self._global_grad_norm = scaled_grad_norm / self.cur_scale @@ -191,8 +179,7 @@ class FP16_Optimizer(DeepSpeedOptimizer): grad_norms=norm_groups) # TODO: we probably don't need this? 
just to be safe for i in range(len(norm_groups)): - updated_params = _unflatten_dense_tensors(self.fp16_groups_flat[i], - self.fp16_groups[i]) + updated_params = _unflatten_dense_tensors(self.fp16_groups_flat[i], self.fp16_groups[i]) for p, q in zip(self.fp16_groups[i], updated_params): p.data = q.data return self.overflow @@ -222,9 +209,7 @@ class FP16_Optimizer(DeepSpeedOptimizer): def override_loss_scale(self, loss_scale): if loss_scale != self.external_loss_scale: - logger.info( - f'[deepspeed] setting loss scale from {self.external_loss_scale} -> {loss_scale}' - ) + logger.info(f'[deepspeed] setting loss scale from {self.external_loss_scale} -> {loss_scale}') self.custom_loss_scaler = True self.external_loss_scale = loss_scale @@ -273,10 +258,8 @@ class FP16_Optimizer(DeepSpeedOptimizer): grads_groups_flat.append( _flatten_dense_tensors([ - torch.zeros(p.size(), - dtype=data_type, - device=p.device) - if p.grad is None else p.grad.to(data_type) for p in group + torch.zeros(p.size(), dtype=data_type, device=p.device) if p.grad is None else p.grad.to(data_type) + for p in group ])) for p in group: @@ -313,8 +296,7 @@ class FP16_Optimizer(DeepSpeedOptimizer): self.start_timers([UPDATE_FP16]) for i in range(len(self.fp16_groups)): - updated_params = _unflatten_dense_tensors(self.fp32_groups_flat[i], - self.fp16_groups[i]) + updated_params = _unflatten_dense_tensors(self.fp32_groups_flat[i], self.fp16_groups[i]) for p, q in zip(self.fp16_groups[i], updated_params): p.data.copy_(q.data) @@ -334,9 +316,7 @@ class FP16_Optimizer(DeepSpeedOptimizer): else: pg = groups._get_data_parallel_group() scaled_norm = all_groups_norm * 1.0 / float(dist.get_world_size(group=pg)) - scaled_norm_tensor = torch.tensor(scaled_norm, - device=self.fp32_groups_flat[0].device, - dtype=torch.float) + scaled_norm_tensor = torch.tensor(scaled_norm, device=self.fp32_groups_flat[0].device, dtype=torch.float) dist.all_reduce(scaled_norm_tensor, group=pg) all_groups_norm = scaled_norm_tensor.item() #print(f"old = {all_groups_norm_old} and new = {all_groups_norm} at rank: {deepspeed.comm.get_rank()}") @@ -376,25 +356,19 @@ class FP16_Optimizer(DeepSpeedOptimizer): if self.dynamic_loss_scale: prev_scale = self.cur_scale if skip: - self.cur_scale = max(self.cur_scale / self.scale_factor, - self.min_loss_scale) + self.cur_scale = max(self.cur_scale / self.scale_factor, self.min_loss_scale) self.last_overflow_iter = self.cur_iter if self.verbose: logger.info(f"\nGrad overflow on iteration {self.cur_iter}") - logger.info( - f"Reducing dynamic loss scale from {prev_scale} to {self.cur_scale}" - ) + logger.info(f"Reducing dynamic loss scale from {prev_scale} to {self.cur_scale}") else: # Ensure self.scale_window updates since last overflow stable_interval = (self.cur_iter - self.last_overflow_iter) - 1 if (stable_interval > 0) and (stable_interval % self.scale_window == 0): self.cur_scale *= self.scale_factor if self.verbose: - logger.info( - f"No Grad overflow for {self.scale_window} iterations") - logger.info( - f"Increasing dynamic loss scale from {prev_scale} to {self.cur_scale}" - ) + logger.info(f"No Grad overflow for {self.scale_window} iterations") + logger.info(f"Increasing dynamic loss scale from {prev_scale} to {self.cur_scale}") else: if skip: logger.info("Grad overflow on iteration: %s", self.cur_iter) diff --git a/deepspeed/runtime/fp16/loss_scaler.py b/deepspeed/runtime/fp16/loss_scaler.py index 58ab2ae96fe1398887f39b81b599b0418f9a5769..e12ee92fdf98aee0d6dd9df30bcee6719d3e921b 100755 --- 
a/deepspeed/runtime/fp16/loss_scaler.py +++ b/deepspeed/runtime/fp16/loss_scaler.py @@ -1,22 +1,29 @@ -# Copyright 2019 The Microsoft DeepSpeed Team -# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -#Taken and modified for DeepSpeed from: -# https://github.com/NVIDIA/Megatron-LM/blob/master/fp16/loss_scaler.py -#Commit: 93ab4bea59dc5cbf97c079d313741866af4deac9 +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team +""" +Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +Taken and modified for DeepSpeed from: + https://github.com/NVIDIA/Megatron-LM/blob/master/fp16/loss_scaler.py +Commit: 93ab4bea59dc5cbf97c079d313741866af4deac9 +""" import torch +from deepspeed import comm as dist +from deepspeed.utils import logger INITIAL_LOSS_SCALE = 'init_scale' SCALE_WINDOW = 'scale_window' @@ -35,6 +42,7 @@ class LossScalerBase: """LossScalarBase Base class for a loss scaler """ + def __init__(self, cur_scale): self.cur_scale = cur_scale self.dynamic = False @@ -52,6 +60,7 @@ class LossScalerBase: def backward(self, loss, retain_graph=False): scaled_loss = loss * self.loss_scale scaled_loss.backward(retain_graph=retain_graph) + # print(f'LossScalerBackward: {scaled_loss=}') class LossScaler(LossScalerBase): @@ -65,6 +74,7 @@ class LossScaler(LossScalerBase): Args: scale (float, optional, default=1.0): The loss scale. """ + def __init__(self, scale=1): super(LossScaler, self).__init__(scale) @@ -102,6 +112,7 @@ class DynamicLossScaler(LossScalerBase): scale_factor (float, optional, default=2.0): Factor used when adjusting the loss scale. If an overflow is encountered, the loss scale is readjusted to loss scale/``scale_factor``. If ``scale_window`` consecutive iterations take place without an overflow, the loss scale is readjusted to loss_scale*``scale_factor``. scale_window (int, optional, default=1000): Number of consecutive iterations without an overflow to wait before increasing the loss scale. 
""" + def __init__(self, init_scale=2**32, scale_factor=2., @@ -109,7 +120,8 @@ class DynamicLossScaler(LossScalerBase): min_scale=1, delayed_shift=1, consecutive_hysteresis=False, - raise_error_at_min_scale=True): + raise_error_at_min_scale=True, + dtype=torch.half): super(DynamicLossScaler, self).__init__(init_scale) self.cur_iter = 0 self.last_overflow_iter = -1 @@ -121,6 +133,7 @@ class DynamicLossScaler(LossScalerBase): self.consecutive_hysteresis = consecutive_hysteresis self.raise_error_at_min_scale = raise_error_at_min_scale self.dynamic = True + self.dtype = dtype # `params` is a list / generator of torch.Variable def has_overflow_serial(self, params): @@ -158,10 +171,21 @@ class DynamicLossScaler(LossScalerBase): if self.delayed_shift == 1 or self.cur_hysteresis == 1: if (self.cur_scale == self.min_scale) and self.raise_error_at_min_scale: raise Exception( - "Current loss scale already at minimum - cannot decrease scale anymore. Exiting run." - ) - self.cur_scale = max(self.cur_scale / self.scale_factor, self.min_scale) + "Current loss scale already at minimum - cannot decrease scale anymore. Exiting run.") + else: + next_scale = max(self.cur_scale / self.scale_factor, self.min_scale) + if dist.get_rank() == 0: + overflow_msg = f"[deepspeed] OVERFLOW! Rank {dist.get_rank()} Skipping step." + if self.dtype == torch.half: + overflow_msg += f" Attempted loss scale: {int(self.cur_scale)}, reducing to {int(next_scale)}" + logger.info(overflow_msg) + self.cur_scale = next_scale else: + if dist.get_rank() == 0: + overflow_msg = f"[deepspeed] OVERFLOW! Rank {dist.get_rank()} Skipping step." + if self.dtype == torch.half: + overflow_msg += f" Attempted loss scale: {int(self.cur_scale)}, but hysteresis is {self.cur_hysteresis}. Reducing hysteresis to {self.cur_hysteresis-1}" + logger.info(overflow_msg) self.cur_hysteresis -= 1 self.last_overflow_iter = self.cur_iter else: @@ -179,8 +203,8 @@ class DynamicLossScaler(LossScalerBase): def CreateLossScaler(dtype, static_loss_scale, dynamic_scaling, dynamic_loss_args): if dtype == torch.half and dynamic_scaling: if dynamic_loss_args is None: - return DynamicLossScaler() - return DynamicLossScaler(**dynamic_loss_args) + return DynamicLossScaler(dtype=dtype) + return DynamicLossScaler(dtype=dtype, **dynamic_loss_args) loss_scale_value = static_loss_scale if dtype == torch.half else 1.0 return LossScaler(scale=loss_scale_value) diff --git a/deepspeed/runtime/fp16/onebit/__init__.py b/deepspeed/runtime/fp16/onebit/__init__.py index 289769b423ced34cb6753df6bfc1dc6485e3c01d..ac6f7e9784ce60f6c1b4a9134b73b4e415337641 100644 --- a/deepspeed/runtime/fp16/onebit/__init__.py +++ b/deepspeed/runtime/fp16/onebit/__init__.py @@ -1,4 +1,7 @@ -'''Copyright The Microsoft DeepSpeed Team''' +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team from .adam import OnebitAdam from .lamb import OnebitLamb diff --git a/deepspeed/runtime/fp16/onebit/adam.py b/deepspeed/runtime/fp16/onebit/adam.py index 5eb22fb64d73c3a716ac427f470e18e593b6f79f..3854e2d2cd6672f048a1829f6b6e004d5fb993ff 100644 --- a/deepspeed/runtime/fp16/onebit/adam.py +++ b/deepspeed/runtime/fp16/onebit/adam.py @@ -1,6 +1,8 @@ -''' -Copyright 2020 The Microsoft DeepSpeed Team -''' +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + import types import torch import numpy as np @@ -39,14 +41,14 @@ class OnebitAdam(torch.optim.Optimizer): .. 
_On the Convergence of Adam and Beyond: https://openreview.net/forum?id=ryQu7f-RZ """ + def __init__(self, params, deepspeed=None, lr=1e-3, freeze_step=100000, bias_correction=True, - betas=(0.9, - 0.999), + betas=(0.9, 0.999), eps=1e-8, eps_inside_sqrt=False, weight_decay=0., @@ -89,11 +91,12 @@ class OnebitAdam(torch.optim.Optimizer): if self.comm_backend_name == 'nccl': TORCH_MAJOR = int(torch.__version__.split('.')[0]) TORCH_MINOR = int(torch.__version__.split('.')[1]) - assert TORCH_MAJOR >= 1 and TORCH_MINOR >= 8, "Please use torch 1.8 or greater to enable NCCL backend in 1-bit Adam. Alternatively, please specify 'mpi' as the 'comm_backend_name' in config file to proceed with the MPI backend" + assert ( + (TORCH_MAJOR == 1 and TORCH_MINOR >= 8) or TORCH_MAJOR >= 2 + ), "Please use torch 1.8 or greater to enable NCCL backend in 1-bit Adam. Alternatively, please specify 'mpi' as the 'comm_backend_name' in config file to proceed with the MPI backend" assert dist.is_initialized() == True, "Please initialize the torch distributed backend." from deepspeed.runtime.comm.nccl import NcclBackend - self.using_pipeline = hasattr(self.deepspeed, - 'pipeline_enable_backward_allreduce') + self.using_pipeline = hasattr(self.deepspeed, 'pipeline_enable_backward_allreduce') self.comm_backend_handle = NcclBackend(self.deepspeed.mpu) elif self.comm_backend_name == 'mpi': @@ -164,22 +167,17 @@ class OnebitAdam(torch.optim.Optimizer): # Exponential moving average of squared gradient values state['exp_avg_sq'] = torch.zeros_like(p.data) - if not self.initialize or (self.adam_freeze_key - and 'worker_error' not in state.keys()): + if not self.initialize or (self.adam_freeze_key and 'worker_error' not in state.keys()): state['tensor_size'] = torch.numel(p.data) state['corrected_tensor_size'] = state['tensor_size'] if state['tensor_size'] % (self.size * self.divider) != 0: - state['corrected_tensor_size'] += ((self.size * self.divider) - - (state['tensor_size'] % - (self.size * self.divider))) - state['server_chunk_size'] = state[ - 'corrected_tensor_size'] // self.size + state['corrected_tensor_size'] += ((self.size * self.divider) - (state['tensor_size'] % + (self.size * self.divider))) + state['server_chunk_size'] = state['corrected_tensor_size'] // self.size get_accelerator().empty_cache() - state['worker_error'] = torch.zeros(state['corrected_tensor_size'], - device=p.device) - state['server_error'] = torch.zeros(state['server_chunk_size'], - device=p.device) + state['worker_error'] = torch.zeros(state['corrected_tensor_size'], device=p.device) + state['server_error'] = torch.zeros(state['server_chunk_size'], device=p.device) get_accelerator().empty_cache() self.adam_freeze_key = True if not self.initialize and dist.get_rank() == 0: @@ -211,11 +209,9 @@ class OnebitAdam(torch.optim.Optimizer): if self.size > 1: exp_avg.set_( - self.comm_backend_handle.compressed_allreduce( - exp_avg, - state['worker_error'], - state['server_error'], - self.deepspeed.local_rank)) + self.comm_backend_handle.compressed_allreduce(exp_avg, state['worker_error'], + state['server_error'], + self.deepspeed.local_rank)) # Because 1-bit compression cannot represent exact zero, it is required to # provide a momentum mask for those params that have constant exact zeros in their # momentums, otherwise the compression error would keep accumulating. @@ -225,8 +221,7 @@ class OnebitAdam(torch.optim.Optimizer): # (See example in DeepSpeedExamples/bing_bert/deepspeed_train.py.) 
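For illustration only (the embedding and the choice of masked row are hypothetical, not taken from this patch): a momentum mask is supplied as an `'exp_avg_mask'` entry of a param group, and, as the lines that follow show, the optimizer simply multiplies it elementwise into `exp_avg`.

    import torch

    embedding = torch.nn.Embedding(10, 4)      # hypothetical parameter with rows that never receive gradient
    mask = torch.ones_like(embedding.weight)
    mask[0].zero_()                            # keep the momentum of row 0 at exact zero
    param_group = {'params': [embedding.weight], 'exp_avg_mask': mask}
    # `param_group` would be one entry of the `params` list handed to the 1-bit optimizer.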
if 'exp_avg_mask' in group: if exp_avg.device != group['exp_avg_mask'].device: - group['exp_avg_mask'] = group['exp_avg_mask'].to( - device=exp_avg.device) + group['exp_avg_mask'] = group['exp_avg_mask'].to(device=exp_avg.device) exp_avg.mul_(group['exp_avg_mask']) if self.initialize: @@ -272,8 +267,7 @@ class OnebitAdam(torch.optim.Optimizer): for i, group in enumerate(self.param_groups): if 'exp_avg_mask' in group: state_dict['param_groups'][i]['exp_avg_mask'] = group['exp_avg_mask'] - elif 'exp_avg_mask' not in group and 'exp_avg_mask' in state_dict[ - 'param_groups'][i]: + elif 'exp_avg_mask' not in group and 'exp_avg_mask' in state_dict['param_groups'][i]: state_dict['param_groups'][i].pop('exp_avg_mask') super().load_state_dict(state_dict) if self.state[self.param_groups[0]['params'][0]]['step'] < self.freeze_step: @@ -287,9 +281,7 @@ class OnebitAdam(torch.optim.Optimizer): self.deepspeed.enable_backward_allreduce = True else: if dist.get_rank() == 0: - print( - "Checkpoint loaded and OnebitAdam compression stage starts/continues." - ) + print("Checkpoint loaded and OnebitAdam compression stage starts/continues.") if self.adam_freeze_key is False: self.adam_freeze_key = True if self.using_pipeline: diff --git a/deepspeed/runtime/fp16/onebit/lamb.py b/deepspeed/runtime/fp16/onebit/lamb.py index 87c24695e23daffef851fbf7c30d435b167e083f..e8a45480701ffdee96d80602dbe7212f62016eac 100644 --- a/deepspeed/runtime/fp16/onebit/lamb.py +++ b/deepspeed/runtime/fp16/onebit/lamb.py @@ -1,6 +1,8 @@ -''' -Copyright 2021 The Microsoft DeepSpeed Team -''' +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + import types import torch import numpy as np @@ -54,14 +56,14 @@ class OnebitLamb(torch.optim.Optimizer): .. _On the Convergence of Adam and Beyond: https://openreview.net/forum?id=ryQu7f-RZ """ + def __init__(self, params, deepspeed=None, lr=1e-3, freeze_step=100000, bias_correction=True, - betas=(0.9, - 0.999), + betas=(0.9, 0.999), eps=1e-8, eps_inside_sqrt=False, weight_decay=0., @@ -111,11 +113,12 @@ class OnebitLamb(torch.optim.Optimizer): if self.comm_backend_name == 'nccl': TORCH_MAJOR = int(torch.__version__.split('.')[0]) TORCH_MINOR = int(torch.__version__.split('.')[1]) - assert TORCH_MAJOR >= 1 and TORCH_MINOR >= 8, "Please use torch 1.8 or greater to enable NCCL backend in 1-bit Adam. Alternatively, please specify 'mpi' as the 'comm_backend_name' in config file to proceed with the MPI backend" + assert ( + (TORCH_MAJOR == 1 and TORCH_MINOR >= 8) or TORCH_MAJOR >= 2 + ), "Please use torch 1.8 or greater to enable NCCL backend in 1-bit Adam. Alternatively, please specify 'mpi' as the 'comm_backend_name' in config file to proceed with the MPI backend" assert dist.is_initialized() == True, "Please initialize the torch distributed backend." 
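The reworked assertion fixes a real edge case: parsing `torch.__version__` for a 2.x release yields `TORCH_MAJOR == 2` and `TORCH_MINOR == 0`, so the old conjunction `TORCH_MAJOR >= 1 and TORCH_MINOR >= 8` rejected a supported torch even though the requirement is simply torch 1.8 or newer. A minimal sketch of the difference:

    TORCH_MAJOR, TORCH_MINOR = 2, 0                                            # e.g. torch 2.0
    old_check = TORCH_MAJOR >= 1 and TORCH_MINOR >= 8                          # False: would wrongly raise
    new_check = (TORCH_MAJOR == 1 and TORCH_MINOR >= 8) or TORCH_MAJOR >= 2    # True: accepted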
from deepspeed.runtime.comm.nccl import NcclBackend - self.using_pipeline = hasattr(self.deepspeed, - 'pipeline_enable_backward_allreduce') + self.using_pipeline = hasattr(self.deepspeed, 'pipeline_enable_backward_allreduce') self.comm_backend_handle = NcclBackend(self.deepspeed.mpu) elif self.comm_backend_name == 'mpi': @@ -165,24 +168,20 @@ class OnebitLamb(torch.optim.Optimizer): if self.lamb_freeze_key: exp_avg_last_step = [] for group in self.param_groups: - exp_avg_last_step.append( - [self.state[p]['exp_avg'].detach().clone() for p in group['params']]) + exp_avg_last_step.append([self.state[p]['exp_avg'].detach().clone() for p in group['params']]) if 'scaling_coeff' not in self.state[self.param_groups[0]['params'][0]]: # Compute the scaling_coeff for each momentum at the end of warmup stage. # This is used to reduce compression error during compression stage. momentum_scales = [] for group in self.param_groups: momentum_scales.append([ - (torch.norm(self.state[p]['exp_avg']) / - np.sqrt(torch.numel(self.state[p]['exp_avg']))).item() + (torch.norm(self.state[p]['exp_avg']) / np.sqrt(torch.numel(self.state[p]['exp_avg']))).item() for p in group['params'] ]) - united_scale = sum([sum(x) for x in momentum_scales]) / sum( - [len(x) for x in momentum_scales]) + united_scale = sum([sum(x) for x in momentum_scales]) / sum([len(x) for x in momentum_scales]) for i, group in enumerate(self.param_groups): for j, p in enumerate(group['params']): - self.state[p][ - 'scaling_coeff'] = united_scale / momentum_scales[i][j] + self.state[p]['scaling_coeff'] = united_scale / momentum_scales[i][j] for group, grads_this_group in zip(self.param_groups, grads_group): if grads_this_group is None: @@ -201,8 +200,7 @@ class OnebitLamb(torch.optim.Optimizer): state = self.state[p] # State initialization - if len(state) == 0 or (len(state) == 1 - and 'scaling_coeff' in state.keys()): + if len(state) == 0 or (len(state) == 1 and 'scaling_coeff' in state.keys()): state['step'] = 0 state['lamb_coeff_freeze'] = 0.0 state['last_factor'] = 1.0 @@ -215,7 +213,8 @@ class OnebitLamb(torch.optim.Optimizer): if not self.initialize: self.lamb_freeze_key = True - exp_avg, exp_avg_sq, exp_avg_sq_fresh = state['exp_avg'], state['exp_avg_sq'], state['exp_avg_sq_fresh'] + exp_avg, exp_avg_sq, exp_avg_sq_fresh = state['exp_avg'], state['exp_avg_sq'], state[ + 'exp_avg_sq_fresh'] beta1, beta2 = group['betas'] max_coeff = group['max_coeff'] min_coeff = group['min_coeff'] @@ -243,8 +242,8 @@ class OnebitLamb(torch.optim.Optimizer): if lamb_coeff < min_coeff: lamb_coeff = min_coeff if lamb_coeff != 1.0: - state['lamb_coeff_freeze'] = self.coeff_beta * state[ - 'lamb_coeff_freeze'] + (1 - self.coeff_beta) * lamb_coeff + state['lamb_coeff_freeze'] = self.coeff_beta * state['lamb_coeff_freeze'] + ( + 1 - self.coeff_beta) * lamb_coeff self.lamb_coeffs.append(lamb_coeff) with torch.no_grad(): p.add_(-group['lr'] * lamb_coeff * update) @@ -266,20 +265,15 @@ class OnebitLamb(torch.optim.Optimizer): tensor_size += torch.numel(p.data) corrected_tensor_size = tensor_size if tensor_size % (self.size * self.divider) != 0: - difference = ((self.size * self.divider) - (tensor_size % - (self.size * self.divider))) + difference = ((self.size * self.divider) - (tensor_size % (self.size * self.divider))) corrected_tensor_size += difference - self.dummy_exp_avg[0] = torch.zeros( - difference, - device=momentum_groups[0].data.device) + self.dummy_exp_avg[0] = torch.zeros(difference, device=momentum_groups[0].data.device) 
momentum_groups.append(self.dummy_exp_avg[0]) self.corrected_tensor_sizes.append(corrected_tensor_size) self.server_chunk_sizes.append(corrected_tensor_size // self.size) - self.exp_avg_flat.append( - _flatten_dense_tensors([p.detach().clone() for p in momentum_groups])) - updated_params = _unflatten_dense_tensors(self.exp_avg_flat[0], - momentum_groups) + self.exp_avg_flat.append(_flatten_dense_tensors([p.detach().clone() for p in momentum_groups])) + updated_params = _unflatten_dense_tensors(self.exp_avg_flat[0], momentum_groups) for p, q in zip(momentum_groups, updated_params): p.data = q.data @@ -287,11 +281,8 @@ class OnebitLamb(torch.optim.Optimizer): get_accelerator().empty_cache() for i in range(len(self.exp_avg_flat)): self.worker_errors.append( - torch.zeros(self.corrected_tensor_sizes[i], - device=self.exp_avg_flat[i].device)) - self.server_errors.append( - torch.zeros(self.server_chunk_sizes[i], - device=self.exp_avg_flat[i].device)) + torch.zeros(self.corrected_tensor_sizes[i], device=self.exp_avg_flat[i].device)) + self.server_errors.append(torch.zeros(self.server_chunk_sizes[i], device=self.exp_avg_flat[i].device)) get_accelerator().empty_cache() if self.lamb_freeze_key: @@ -300,31 +291,23 @@ class OnebitLamb(torch.optim.Optimizer): if not self.initialize: get_accelerator().empty_cache() self.worker_errors.append( - torch.zeros(self.corrected_tensor_sizes[i], - device=self.exp_avg_flat[i].device)) + torch.zeros(self.corrected_tensor_sizes[i], device=self.exp_avg_flat[i].device)) self.server_errors.append( - torch.zeros(self.server_chunk_sizes[i], - device=self.exp_avg_flat[i].device)) + torch.zeros(self.server_chunk_sizes[i], device=self.exp_avg_flat[i].device)) get_accelerator().empty_cache() if dist.get_rank() == 0: print("Cupy Buffers Initialized Successfully.") - self.comm_backend_handle.compressed_allreduce( - self.exp_avg_flat[i], - self.worker_errors[0], - self.server_errors[0], - self.deepspeed.local_rank) + self.comm_backend_handle.compressed_allreduce(self.exp_avg_flat[i], self.worker_errors[0], + self.server_errors[0], self.deepspeed.local_rank) if dist.get_rank() == 0: print('Pop out errors', flush=True) del self.worker_errors[:] del self.server_errors[:] else: - self.comm_backend_handle.compressed_allreduce( - self.exp_avg_flat[i], - self.worker_errors[i], - self.server_errors[i], - self.deepspeed.local_rank) + self.comm_backend_handle.compressed_allreduce(self.exp_avg_flat[i], self.worker_errors[i], + self.server_errors[i], self.deepspeed.local_rank) if self.lamb_freeze_key and self.initialize: for i, group in enumerate(self.param_groups): @@ -332,7 +315,8 @@ class OnebitLamb(torch.optim.Optimizer): for j, p in enumerate(group['params']): state = self.state[p] - exp_avg, exp_avg_sq, exp_avg_sq_fresh = state['exp_avg'], state['exp_avg_sq'], state['exp_avg_sq_fresh'] + exp_avg, exp_avg_sq, exp_avg_sq_fresh = state['exp_avg'], state['exp_avg_sq'], state[ + 'exp_avg_sq_fresh'] beta1, beta2 = group['betas'] exp_avg.div_(self.state[p]['scaling_coeff']) # Because 1-bit compression cannot represent exact zero, it is required to @@ -345,15 +329,11 @@ class OnebitLamb(torch.optim.Optimizer): # to add this exp_avg_mask for BERT pre-training.) 
if 'exp_avg_mask' in group: if exp_avg.device != group['exp_avg_mask'].device: - group['exp_avg_mask'] = group['exp_avg_mask'].to( - device=exp_avg.device) + group['exp_avg_mask'] = group['exp_avg_mask'].to(device=exp_avg.device) exp_avg.mul_(group['exp_avg_mask']) - grad_reconstruct = ((exp_avg - exp_avg_last_step[i][j] * beta1) / - (1 - beta1)) - exp_avg_sq_fresh.mul_(beta2).addcmul_(1 - beta2, - grad_reconstruct, - grad_reconstruct) + grad_reconstruct = ((exp_avg - exp_avg_last_step[i][j] * beta1) / (1 - beta1)) + exp_avg_sq_fresh.mul_(beta2).addcmul_(1 - beta2, grad_reconstruct, grad_reconstruct) denom = exp_avg_sq.sqrt() + group['eps'] update_prelim = exp_avg / denom @@ -367,9 +347,7 @@ class OnebitLamb(torch.optim.Optimizer): denom_real = exp_avg_sq_fresh.sqrt() + group['eps'] factor = (denom / denom_real).max().item() if group['weight_decay'] > 0.0: - update_ratio = min(1.0, - (update_prelim.pow(2).sum().sqrt() / - update_norm).item()) + update_ratio = min(1.0, (update_prelim.pow(2).sum().sqrt() / update_norm).item()) factor = factor * update_ratio + (1.0 - update_ratio) if factor > self.factor_max: factor = self.factor_max @@ -416,8 +394,7 @@ class OnebitLamb(torch.optim.Optimizer): for i, group in enumerate(self.param_groups): if 'exp_avg_mask' in group: state_dict['param_groups'][i]['exp_avg_mask'] = group['exp_avg_mask'] - elif 'exp_avg_mask' not in group and 'exp_avg_mask' in state_dict[ - 'param_groups'][i]: + elif 'exp_avg_mask' not in group and 'exp_avg_mask' in state_dict['param_groups'][i]: state_dict['param_groups'][i].pop('exp_avg_mask') super().load_state_dict(state_dict) # need to reset the fused momentum since loading states will break the linking @@ -442,9 +419,7 @@ class OnebitLamb(torch.optim.Optimizer): self.state[p].pop('scaling_coeff') else: if dist.get_rank() == 0: - print( - "Checkpoint loaded and OnebitLamb compression stage starts/continues." - ) + print("Checkpoint loaded and OnebitLamb compression stage starts/continues.") if self.lamb_freeze_key is False: self.lamb_freeze_key = True if self.using_pipeline: diff --git a/deepspeed/runtime/fp16/onebit/zoadam.py b/deepspeed/runtime/fp16/onebit/zoadam.py index f86ae86f36cb5d7863c440a51ddafeb80cbafdd7..fb2d2a061e381c4d8e26f9d177fbc60f05efb9a1 100644 --- a/deepspeed/runtime/fp16/onebit/zoadam.py +++ b/deepspeed/runtime/fp16/onebit/zoadam.py @@ -1,6 +1,8 @@ -''' -Copyright 2020 The Microsoft DeepSpeed Team -''' +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + import types import torch import numpy as np @@ -49,13 +51,13 @@ class ZeroOneAdam(torch.optim.Optimizer): .. _On the Convergence of Adam and Beyond: https://openreview.net/forum?id=ryQu7f-RZ """ + def __init__(self, params, deepspeed=None, lr=1e-3, bias_correction=True, - betas=(0.9, - 0.999), + betas=(0.9, 0.999), eps=1e-8, eps_inside_sqrt=False, weight_decay=0., @@ -102,11 +104,12 @@ class ZeroOneAdam(torch.optim.Optimizer): if self.comm_backend_name == 'nccl': TORCH_MAJOR = int(torch.__version__.split('.')[0]) TORCH_MINOR = int(torch.__version__.split('.')[1]) - assert TORCH_MAJOR >= 1 and TORCH_MINOR >= 8, "Please use torch 1.8 or greater to enable NCCL backend in 0/1 Adam. Alternatively, please specify 'mpi' as the 'comm_backend_name' in config file to proceed with the MPI backend" + assert ( + (TORCH_MAJOR == 1 and TORCH_MINOR >= 8) or TORCH_MAJOR >= 2 + ), "Please use torch 1.8 or greater to enable NCCL backend in 0/1 Adam. 
Alternatively, please specify 'mpi' as the 'comm_backend_name' in config file to proceed with the MPI backend" assert dist.is_initialized() == True, "Please initialize the torch distributed backend." from deepspeed.runtime.comm.nccl import NcclBackend - self.using_pipeline = hasattr(self.deepspeed, - 'pipeline_enable_backward_allreduce') + self.using_pipeline = hasattr(self.deepspeed, 'pipeline_enable_backward_allreduce') self.comm_backend_handle = NcclBackend(self.deepspeed.mpu) elif self.comm_backend_name == 'mpi': @@ -181,16 +184,12 @@ class ZeroOneAdam(torch.optim.Optimizer): state['corrected_tensor_size'] = state['tensor_size'] if state['tensor_size'] % (self.size * self.divider) != 0: - state['corrected_tensor_size'] += ((self.size * self.divider) - - (state['tensor_size'] % - (self.size * self.divider))) - state['server_chunk_size'] = state[ - 'corrected_tensor_size'] // self.size + state['corrected_tensor_size'] += ((self.size * self.divider) - (state['tensor_size'] % + (self.size * self.divider))) + state['server_chunk_size'] = state['corrected_tensor_size'] // self.size get_accelerator().empty_cache() - state['worker_error'] = torch.zeros(state['corrected_tensor_size'], - device=p.device) - state['server_error'] = torch.zeros(state['server_chunk_size'], - device=p.device) + state['worker_error'] = torch.zeros(state['corrected_tensor_size'], device=p.device) + state['server_error'] = torch.zeros(state['server_chunk_size'], device=p.device) # Accumulation of momentum, i.e., the u variable in the 0/1 Adam paper state['momentum_accumulator'] = torch.zeros_like(p.data) get_accelerator().empty_cache() @@ -213,16 +212,10 @@ class ZeroOneAdam(torch.optim.Optimizer): if self.size > 1: with torch.no_grad(): grad_onebit = self.comm_backend_handle.compressed_allreduce( - grad, - state['worker_error'], - state['server_error'], - self.deepspeed.local_rank) + grad, state['worker_error'], state['server_error'], self.deepspeed.local_rank) if 'exp_avg_mask' in group: - if grad_onebit.device != group[ - 'exp_avg_mask'].device: - group['exp_avg_mask'] = group[ - 'exp_avg_mask'].to( - device=grad_onebit.device) + if grad_onebit.device != group['exp_avg_mask'].device: + group['exp_avg_mask'] = group['exp_avg_mask'].to(device=grad_onebit.device) grad_onebit.mul_(group['exp_avg_mask']) exp_avg.mul_(beta1).add_(1 - beta1, grad_onebit) else: @@ -233,15 +226,12 @@ class ZeroOneAdam(torch.optim.Optimizer): if not self.initialize: if self.size > 1: comm_buffer.set_( - self.comm_backend_handle.compressed_allreduce( - comm_buffer, - state['worker_error'], - state['server_error'], - self.deepspeed.local_rank)) + self.comm_backend_handle.compressed_allreduce(comm_buffer, state['worker_error'], + state['server_error'], + self.deepspeed.local_rank)) if 'exp_avg_mask' in group: if comm_buffer.device != group['exp_avg_mask'].device: - group['exp_avg_mask'] = group['exp_avg_mask'].to( - device=comm_buffer.device) + group['exp_avg_mask'] = group['exp_avg_mask'].to(device=comm_buffer.device) comm_buffer.mul_(group['exp_avg_mask']) if self.initialize: @@ -252,22 +242,18 @@ class ZeroOneAdam(torch.optim.Optimizer): p.data.add_(-group['lr'] * update) if self.freeze_key is True: comm_buffer.add_(-group['lr'] * update) - if state['step'] % state[ - 'local_step_interval'] == 0 and self.freeze_key: + if state['step'] % state['local_step_interval'] == 0 and self.freeze_key: with torch.no_grad(): p.data.add_(-1 * comm_buffer) comm_buffer.mul_(exp_avg_sq.sqrt() + group['eps']) if self.size > 1: comm_buffer.copy_( - 
self.comm_backend_handle.compressed_allreduce( - comm_buffer, - state['worker_error'], - state['server_error'], - self.deepspeed.local_rank)) + self.comm_backend_handle.compressed_allreduce(comm_buffer, state['worker_error'], + state['server_error'], + self.deepspeed.local_rank)) if 'exp_avg_mask' in group: if comm_buffer.device != group['exp_avg_mask'].device: - group['exp_avg_mask'] = group['exp_avg_mask'].to( - device=comm_buffer.device) + group['exp_avg_mask'] = group['exp_avg_mask'].to(device=comm_buffer.device) comm_buffer.mul_(group['exp_avg_mask']) exp_avg.zero_().add_(comm_buffer / state['lrs'], alpha=-1) p.data.add_(comm_buffer / (exp_avg_sq.sqrt() + group['eps'])) @@ -298,9 +284,8 @@ class ZeroOneAdam(torch.optim.Optimizer): state['local_step_counter'] += 1 if state['local_step_counter'] == self.local_step_scaler: state['local_step_counter'] = 0 - state['local_step_interval'] = min( - self.local_step_clipper, - state['local_step_interval'] * 2) + state['local_step_interval'] = min(self.local_step_clipper, + state['local_step_interval'] * 2) if not self.initialize: print('Pop out errors', flush=True) @@ -343,14 +328,13 @@ class ZeroOneAdam(torch.optim.Optimizer): for i, group in enumerate(self.param_groups): if 'exp_avg_mask' in group: state_dict['param_groups'][i]['exp_avg_mask'] = group['exp_avg_mask'] - elif 'exp_avg_mask' not in group and 'exp_avg_mask' in state_dict[ - 'param_groups'][i]: + elif 'exp_avg_mask' not in group and 'exp_avg_mask' in state_dict['param_groups'][i]: state_dict['param_groups'][i].pop('exp_avg_mask') super().load_state_dict(state_dict) if self.state[self.param_groups[0]['params'][0]]['step'] < self.var_freeze_step: self.var_freeze_key = False - if (self.state[self.param_groups[0]['params'][0]]['step'] + 1 - ) % self.state[self.param_groups[0]['params'][0]]['var_interval'] == 0: + if (self.state[self.param_groups[0]['params'][0]]['step'] + + 1) % self.state[self.param_groups[0]['params'][0]]['var_interval'] == 0: if self.using_pipeline: self.deepspeed.pipeline_enable_backward_allreduce = True else: diff --git a/deepspeed/runtime/fp16/unfused_optimizer.py b/deepspeed/runtime/fp16/unfused_optimizer.py index e0249f15a6fb884fc59acaffa678e8f2ec96d134..1c57e20487717847d0bea1241199921fe9df77cb 100755 --- a/deepspeed/runtime/fp16/unfused_optimizer.py +++ b/deepspeed/runtime/fp16/unfused_optimizer.py @@ -1,9 +1,11 @@ -''' -Copyright 2019 The Microsoft DeepSpeed Team +# Copyright (c) Microsoft Corporation. 
+# SPDX-License-Identifier: Apache-2.0 +# DeepSpeed Team +""" Copyright NVIDIA/apex This file is adapted from FP16_Optimizer in NVIDIA/apex -''' +""" from deepspeed.moe.utils import split_params_grads_into_shared_and_expert_params import torch @@ -24,6 +26,7 @@ class FP16_UnfusedOptimizer(DeepSpeedOptimizer): For usage example please see, TODO: DeepSpeed V2 Tutorial """ + def __init__(self, init_optimizer, deepspeed=None, @@ -105,9 +108,7 @@ class FP16_UnfusedOptimizer(DeepSpeedOptimizer): self.mpu = mpu self.overflow = False - self.overflow_checker = CheckOverflow(self.fp16_groups, - mpu=self.mpu, - deepspeed=deepspeed) + self.overflow_checker = CheckOverflow(self.fp16_groups, mpu=self.mpu, deepspeed=deepspeed) self.initialize_optimizer_states() @@ -137,45 +138,33 @@ class FP16_UnfusedOptimizer(DeepSpeedOptimizer): expert_norm_groups = [] for i, group in enumerate(self.fp16_groups): grads = [ - torch.zeros(p.size(), - dtype=p.dtype, - device=p.device) if p.grad is None else p.grad for p in group + torch.zeros(p.size(), dtype=p.dtype, device=p.device) if p.grad is None else p.grad for p in group ] grads_groups.append(grads) grads_groups_flat.append(_flatten_dense_tensors(grads)) grads_for_norm, expert_grads_for_norm = split_params_grads_into_shared_and_expert_params(group) norm_group_value = 0.0 if len(grads_for_norm) > 0: - norm_group_value = get_weight_norm( - _flatten_dense_tensors(grads_for_norm), - mpu=self.mpu) + norm_group_value = get_weight_norm(_flatten_dense_tensors(grads_for_norm), mpu=self.mpu) norm_groups.append(norm_group_value) expert_norm_group_value = 0.0 if len(expert_grads_for_norm) > 0: - expert_norm_group_value = get_weight_norm( - _flatten_dense_tensors(expert_grads_for_norm), - mpu=self.mpu) + expert_norm_group_value = get_weight_norm(_flatten_dense_tensors(expert_grads_for_norm), mpu=self.mpu) expert_norm_groups.append(expert_norm_group_value) - self.overflow = self.overflow_checker.check_using_norm(norm_groups + - expert_norm_groups) + self.overflow = self.overflow_checker.check_using_norm(norm_groups + expert_norm_groups) prev_scale = self.cur_scale self._update_scale(self.overflow) if self.overflow: if self.verbose: - logger.info( - "[deepspeed] fp16 dynamic loss scale overflow! Skipping step. Attempted loss " - "scale: {}, reducing to {}".format(prev_scale, - self.cur_scale)) + logger.info("[deepspeed] fp16 dynamic loss scale overflow! Skipping step. 
Attempted loss " + "scale: {}, reducing to {}".format(prev_scale, self.cur_scale)) return self.overflow self._global_grad_norm = get_global_norm(norm_list=norm_groups) - combined_scale = self.unscale_and_clip_grads(self._global_grad_norm, - apply_scale=False) - self.optimizer.step(grads=grads_groups, - output_params=self.fp16_groups, - scale=combined_scale) + combined_scale = self.unscale_and_clip_grads(self._global_grad_norm, apply_scale=False) + self.optimizer.step(grads=grads_groups, output_params=self.fp16_groups, scale=combined_scale) for fp32_group, fp16_group in zip(self.fp32_groups, self.fp16_groups): for idx, (fp32_param, fp16_param) in enumerate(zip(fp32_group, fp16_group)): @@ -199,9 +188,7 @@ class FP16_UnfusedOptimizer(DeepSpeedOptimizer): def override_loss_scale(self, loss_scale): if loss_scale != self.external_loss_scale: - logger.info( - f'[deepspeed] setting loss scale from {self.external_loss_scale} -> {loss_scale}' - ) + logger.info(f'[deepspeed] setting loss scale from {self.external_loss_scale} -> {loss_scale}') self.custom_loss_scaler = True self.external_loss_scale = loss_scale @@ -219,10 +206,8 @@ class FP16_UnfusedOptimizer(DeepSpeedOptimizer): self._update_scale(self.overflow) if self.overflow: if self.verbose: - logger.info( - "[deepspeed] fp16 dynamic loss scale overflow! Skipping step. Attempted loss " - "scale: {}, reducing to {}".format(prev_scale, - self.cur_scale)) + logger.info("[deepspeed] fp16 dynamic loss scale overflow! Skipping step. Attempted loss " + "scale: {}, reducing to {}".format(prev_scale, self.cur_scale)) return self.overflow norm_groups = [] @@ -236,9 +221,7 @@ class FP16_UnfusedOptimizer(DeepSpeedOptimizer): # copying gradients to fp32 to wor k with fp32 parameters for fp32_param, fp16_param in zip(self.fp32_groups[i], self.fp16_groups[i]): if fp16_param.grad is None: - fp32_param.grad = torch.zeros(fp16_param.size(), - dtype=fp32_param.dtype, - device=fp32_param.device) + fp32_param.grad = torch.zeros(fp16_param.size(), dtype=fp32_param.dtype, device=fp32_param.device) else: fp32_param.grad = fp16_param.grad.to(fp32_param.dtype) @@ -294,25 +277,19 @@ class FP16_UnfusedOptimizer(DeepSpeedOptimizer): if self.dynamic_loss_scale: prev_scale = self.cur_scale if skip: - self.cur_scale = max(self.cur_scale / self.scale_factor, - self.min_loss_scale) + self.cur_scale = max(self.cur_scale / self.scale_factor, self.min_loss_scale) self.last_overflow_iter = self.cur_iter if self.verbose: logger.info("Grad overflow on iteration: %s", self.cur_iter) - logger.info( - f"Reducing dynamic loss scale from {prev_scale} to {self.cur_scale}" - ) + logger.info(f"Reducing dynamic loss scale from {prev_scale} to {self.cur_scale}") else: # Ensure self.scale_window updates since last overflow stable_interval = (self.cur_iter - self.last_overflow_iter) - 1 if (stable_interval > 0) and (stable_interval % self.scale_window == 0): self.cur_scale *= self.scale_factor if self.verbose: - logger.info( - f"No Grad overflow for {self.scale_window} iterations") - logger.info( - f"Increasing dynamic loss scale from {prev_scale} to {self.cur_scale}" - ) + logger.info(f"No Grad overflow for {self.scale_window} iterations") + logger.info(f"Increasing dynamic loss scale from {prev_scale} to {self.cur_scale}") else: if skip: logger.info("Grad overflow on iteration %s", self.cur_iter) diff --git a/deepspeed/runtime/hybrid_engine.py b/deepspeed/runtime/hybrid_engine.py new file mode 100644 index 0000000000000000000000000000000000000000..b15447fad412a2568307243039638104cd5aa115 --- 
/dev/null +++ b/deepspeed/runtime/hybrid_engine.py @@ -0,0 +1,426 @@ +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +import torch +from deepspeed.inference.config import DeepSpeedInferenceConfig +from deepspeed.module_inject.replace_policy import replace_policies +from deepspeed.module_inject.utils import policy_to_ds_container +from .engine import DeepSpeedEngine +from .utils import TLinear, get_inactive_params +from deepspeed.runtime.zero import GatheredParameters +import time +import gc +import math +from deepspeed import comm as dist +from deepspeed.accelerator import get_accelerator +from torch import nn +from deepspeed.utils import logger + +from deepspeed.ops.op_builder import InferenceBuilder + +from deepspeed.module_inject.layers import LinearLayer, Normalize, EmbeddingLayer, OPTEmbedding +try: + import transformers + OPTLearnedPositionalEmbedding = transformers.models.opt.modeling_opt.OPTLearnedPositionalEmbedding +except: + OPTLearnedPositionalEmbedding = None +inference_cuda_module = None + + +class DeepSpeedHybridEngine(DeepSpeedEngine): + r"""DeepSpeed engine for training and inference.""" + inference_mp_group = None + + def __init__(self, args, model, **kwargs): + + super().__init__(args, model, **kwargs) + + # synch seed between all GPUs + _rng_state = get_accelerator().get_rng_state().to(get_accelerator().current_device_name()) + dist.broadcast(_rng_state, 0) + get_accelerator().set_rng_state(_rng_state.cpu()) + + self.Z3_enabled = (self._config.zero_config.stage == 3) + self.gather_all_layers = self._config.hybrid_engine.pin_parameters + + # inference containers / fwds + self._inference_containers = [] + self._orig_modules = [] + self._orig_fwds = [] + self.create_inference_module() + + # Performance stats + self._t_start = None + self._total_latency = 0 + self._iters = 0 + self._training_start_time = None + self._generate_latency = 0 + self._training_latency = 0 + self._total_batch_size = None + self._gather_latency = 0 + + global inference_cuda_module + if inference_cuda_module is None: + builder = InferenceBuilder() + inference_cuda_module = builder.load() + + self.is_lora_fused = False + + def convert_to_linear_transposed(self, model): + + def _replace_linear_layer(r_module, parent_type=None, prev_type=None): + for name, child in r_module.named_children(): + if child.__class__ in [torch.nn.Linear] and \ + (parent_type is torch.nn.ModuleList or prev_type is torch.nn.ModuleList): + setattr(r_module, name, TLinear(child, name)) + else: + _replace_linear_layer(child, type(r_module), prev_type=parent_type) + return r_module + + _replace_linear_layer(model) + + def new_inference_container(self, orig_layer, policy_cls, layer_id): + policy = policy_cls(orig_layer, inference=True) + _container = policy_to_ds_container( + policy=policy, + config=DeepSpeedInferenceConfig(set_empty_params=True, + max_out_tokens=self._config.hybrid_engine.max_out_tokens, + min_out_tokens=self._config.hybrid_engine.max_out_tokens, + transposed_mode=True), + model_config=self.module.config if hasattr(self.module, 'config') else None, + layer_id=layer_id, + child=orig_layer) + _container.set_dtype(self._config.fp16_enabled) + + if self.mpu is not None: + if hasattr(self.mpu, 'get_model_parallel_world_size'): + _container.set_tensor_parallel_config(self.mpu.get_model_parallel_world_size(), + self.mpu.get_model_parallel_group()) + else: + _container.set_tensor_parallel_config(self.mpu.get_tensor_model_parallel_world_size(), + 
self.mpu.get_tensor_model_parallel_group()) + else: + _container.set_tensor_parallel_config(self._config.hybrid_engine.inference_tp_size, self.mp_group) + _container.initialize_tensors(enable_training=True) + _container.create_ds_model_config() + _container.create_module() + _container.set_params_wo_copy(Z3_enabled=self.Z3_enabled) + return _container + + def populate_all_inference_policies(self): + self.inference_policies = {} + for plcy in replace_policies: + _ = plcy(None) + if isinstance(plcy._orig_layer_class, list): + for orig_layer_class in plcy._orig_layer_class: + self.inference_policies.update({orig_layer_class: (self.new_inference_container, plcy)}) + elif plcy._orig_layer_class is not None: + self.inference_policies.update({plcy._orig_layer_class: (self.new_inference_container, plcy)}) + self.inference_policies.update({ + nn.Linear: (LinearLayer, ), + nn.Embedding: (EmbeddingLayer, ), + nn.LayerNorm: (Normalize, ), + OPTLearnedPositionalEmbedding: (OPTEmbedding, ) + }) + + def _fuse_lora(self, params, lora_params): + maybe_has_lora_params = [p for p in params if len(p.shape) > 1] + for lora_param, weight in zip(lora_params, maybe_has_lora_params): + if len(lora_param) == 3: + lora_right_weight, \ + lora_left_weight, \ + lora_scaling = lora_param + weight.data += lora_scaling * torch.matmul(lora_left_weight.t(), lora_right_weight.t()) + + def fuse_lora_weight(self): + for layer_id in range(len(self.layer_params)): + self._fuse_lora(self.layer_params[layer_id], self.lora_params[layer_id]) + + def _unfuse_lora(self, params, lora_params): + maybe_has_lora_params = [p for p in params if len(p.shape) > 1] + for lora_param, weight in zip(lora_params, maybe_has_lora_params): + if len(lora_param) == 3: + lora_right_weight, \ + lora_left_weight, \ + lora_scaling = lora_param + weight.data -= lora_scaling * torch.matmul(lora_left_weight.t(), lora_right_weight.t()) + + def unfuse_lora_weight(self): + for layer_id in range(len(self.layer_params)): + self._unfuse_lora(self.layer_params[layer_id], self.lora_params[layer_id]) + + def unfuse_lora_weight_non_pinned(self): + for layer_id in range(len(self.layer_params)): + non_active_params = get_inactive_params(self.layer_params[layer_id]) + non_active_lora_params = get_inactive_params(self.layer_lora_params[layer_id]) + non_active_params.extend(non_active_lora_params) + + with GatheredParameters(non_active_params): + self._unfuse_lora(self.layer_params[layer_id], self.lora_params[layer_id]) + + def retake_inference_cache(self): + if self._config.hybrid_engine.release_inference_cache: + retake_success = inference_cuda_module.retake_workspace() + + if not retake_success: + logger.warning("Unable to acquire workspace on first attempt, emtpying cache and retrying.") + gc.collect() + get_accelerator().empty_cache() + retake_success = inference_cuda_module.retake_workspace() + + if not retake_success: + raise RuntimeError("Unable to retake inference workspace.") + + def generate(self, *inputs, **kwargs): + if self._total_batch_size is None: + bsz = inputs[0].shape[0] if len(inputs) > 0 else \ + kwargs['input_ids'].shape[0] + self._total_batch_size = bsz * dist.get_world_size() + + self._t0 = time.time() + + if self.Z3_enabled and self.gather_all_layers: + if self._config.hybrid_engine.inference_tp_size > 1: + non_tp_params = [] + for other_layer in self._other_layers: + non_tp_params.extend(list(other_layer.parameters())) + + partition_size = self._config.hybrid_engine.tp_gather_partition_size + + layer_groups = math.ceil(len(self.layer_params) / 
partition_size) + for lg in range(layer_groups): + non_active_params = [] + non_active_lora_params = [] + for layer_id in range(lg * partition_size, min(len(self.layer_params), (lg + 1) * partition_size), + 1): + non_tp_params.extend(self.layer_params[layer_id][:4]) + non_active_params.extend(get_inactive_params(self.layer_params[layer_id])) + non_active_params.extend(get_inactive_params(self.layer_lora_params[layer_id])) + with GatheredParameters(non_active_params): + for layer_id in range(lg * partition_size, + min(len(self.layer_params), (lg + 1) * partition_size), 1): + if len(self.all_lora_params) > 0: + self._fuse_lora(self.layer_params[layer_id], self.lora_params[layer_id]) + + if self.mpu is not None: + self._inference_containers[layer_id].apply_tensor_parallelism( + mp_group=self.mp_group, tp_size=self._config.hybrid_engine.inference_tp_size) + + # TODO(cmikeh2) Evaluate if this can be deferred when release_inference_cache + # is enabled. + gc.collect() + get_accelerator().empty_cache() + + self._gather_latency = time.time() - self._t0 + + input_shape = inputs[0].shape if len(inputs) > 0 else \ + kwargs['input_ids'].shape + output = torch.zeros( + (input_shape[0] * self._config.hybrid_engine.inference_tp_size, ) + input_shape[1:], + dtype=inputs[0].dtype if len(inputs) > 0 else kwargs['input_ids'].dtype, + device=inputs[0].device if len(inputs) > 0 else kwargs['input_ids'].device) + input_cont = inputs[0].contiguous() if len(inputs) > 0 else kwargs['input_ids'].contiguous() + dist.all_gather_into_tensor(output, input_cont, group=self.mp_group) + + if len(inputs) > 0: + inputs = (output, ) + else: + kwargs['input_ids'] = output + + self.retake_inference_cache() + + non_active_params = get_inactive_params(non_tp_params) + with GatheredParameters(non_active_params): + generate_ret_vals = self._generate(*inputs, **kwargs) + + for layer_id in range(len(self.layer_params)): + self._inference_containers[layer_id].release_memory() + + rank = dist.get_rank(group=self.mp_group) + generate_ret_vals = generate_ret_vals[input_shape[0] * rank:input_shape[0] * (rank + 1)] + + else: + non_active_layers = get_inactive_params(self.all_layers_params) + non_active_lora_params = get_inactive_params(self.all_lora_params) + non_active_layers.extend(non_active_lora_params) + with GatheredParameters(non_active_layers): + self._gather_latency = time.time() - self._t0 + + if len(self.all_lora_params) > 0: + self.fuse_lora_weight() + + self.retake_inference_cache() + generate_ret_vals = self._generate(*inputs, **kwargs) + + if len(self.all_lora_params) > 0: + self.unfuse_lora_weight() + else: + if len(self.all_lora_params) > 0 and (not self.Z3_enabled): + self.fuse_lora_weight() + + self.retake_inference_cache() + generate_ret_vals = self._generate(*inputs, **kwargs) + + if len(self.all_lora_params) > 0: + if (not self.Z3_enabled): + self.unfuse_lora_weight() + else: + self.unfuse_lora_weight_non_pinned() + self.is_lora_fused = False + + if self._config.hybrid_engine.release_inference_cache: + inference_cuda_module.release_workspace() + gc.collect() + get_accelerator().empty_cache() + + self._generate_latency = time.time() - self._t0 - self._gather_latency + + return generate_ret_vals + + def create_inference_containers(self, module, layer_id=0): + for name, child in module.named_children(): + if child.__class__ in self.inference_policies: + if self.inference_policies[child.__class__][0] == self.new_inference_container: + self._inference_containers.append(self.inference_policies[child.__class__][0]( + child, 
self.inference_policies[child.__class__][-1], layer_id)) + self._orig_modules.append(child) + self._orig_fwds.append(child.forward) + + self.layer_params.append(self._inference_containers[layer_id].get_all_params()) + + self.lora_params.append(self._inference_containers[layer_id].get_lora_params()) + self.layer_lora_params.append([]) + for lora_param in self.lora_params[layer_id]: + self.layer_lora_params[layer_id].extend(lora_param[:-1]) + self.all_lora_params.extend(lora_param[:-1]) + + layer_id += 1 + else: + self._other_layers.append(self.inference_policies[child.__class__][0]( + weight=child.weight, bias=child.bias if hasattr(child, 'bias') else None)) + self._orig_modules_others.append(child) + self._orig_fwds_others.append(child.forward) + else: + self.create_inference_containers(child, layer_id=layer_id) + + def create_inference_module(self): + self.layer_params = [] + self.layer_lora_params = [] + self.lora_params = [] + self.all_lora_params = [] + + self._other_layers = [] + self._orig_modules_others = [] + self._orig_fwds_others = [] + + if self._config.hybrid_engine.inference_tp_size > 1: + if self.mpu is not None: + global_rank = dist.get_rank() + world_size = dist.get_world_size() + mp_group_id = global_rank // self._config.hybrid_engine.inference_tp_size + num_mp_groups = world_size // self._config.hybrid_engine.inference_tp_size + for mp_group_id in range(num_mp_groups): + ranks = list( + range(mp_group_id * self._config.hybrid_engine.inference_tp_size, \ + (mp_group_id + 1) * self._config.hybrid_engine.inference_tp_size, \ + 1) + ) + mp_group = dist.new_group(ranks) + if global_rank in ranks: + self.mp_group = mp_group + else: + self.mp_group = self.mpu.get_model_parallel_group() if hasattr(self.mpu, 'get_model_parallel_group') else \ + self.mpu.get_tensor_model_parallel_group() + else: + self.mp_group = None + self.populate_all_inference_policies() + self.all_layers_params = list(self.module.parameters()) + self.create_inference_containers(self.module) + + if len(self._inference_containers) > 0: + self._generate = self.module.generate + self.module.generate = self.generate + + self._t0 = time.time() + + def _zero3_forward(self, layer_id): + + def run_forward(*inputs, **kwargs): + non_active_params = get_inactive_params(self.layer_params[layer_id]) + non_active_lora_params = get_inactive_params(self.layer_lora_params[layer_id]) + non_active_params.extend(non_active_lora_params) + + with GatheredParameters(non_active_params): + if len(self.all_lora_params) > 0: + # Use the is_lora_fused flag to prevent multiple fusion in Z3 with non-pinned memory + if not self.is_lora_fused: + self._fuse_lora(self.layer_params[layer_id], self.lora_params[layer_id]) + # Set the is_lora_fused to true when reaching the last layer + if layer_id == len(self.layer_params) - 1: + self.is_lora_fused = True + return self._inference_containers[layer_id].module.forward(*inputs, **kwargs) + + return run_forward + + def eval(self): + if self._t_start is not None: + latency = time.time() - self._t_start + self._total_latency = self._total_latency + latency + self._iters = self._iters + 1 + if not dist.is_initialized() or dist.get_rank() == 0: + others = latency - (self._generate_latency + self._training_latency) + print(f'|E2E latency={(latency):.2f}s ' + \ + f'|Gather latency={self._gather_latency:.2f}s ({(self._gather_latency / latency * 100):.2f}%) ' + f'|Generate time={(self._generate_latency):.2f}s ({(self._generate_latency / latency * 100):.2f}%) ' + \ + f'|Training 
time={(self._training_latency):.2f}s ({(self._training_latency / latency * 100):.2f}%) ' + \ + f'|Others={others:.2f} ({(others / latency * 100):.2f}%)' + f'|CurSamplesPerSec={(1 / latency * self._total_batch_size):.2f} ' + \ + f'|AvgSamplesPerSec={(1 / (self._total_latency / self._iters) * self._total_batch_size):.2f}') + self._t_start = time.time() + self._training_latency = 0 + super().eval() + if len(self._inference_containers) > 0: + for i, (orig_module, inference_container) in enumerate(zip(self._orig_modules, + self._inference_containers)): + if self.Z3_enabled and not self.gather_all_layers: + orig_module.forward = self._zero3_forward(i) + else: + orig_module.forward = inference_container.module.forward + + inference_container.align_merged_qkv() + + if not self.Z3_enabled or self.gather_all_layers: + for orig_module, inference_layer in zip(self._orig_modules_others, self._other_layers): + orig_module.forward = inference_layer.forward + if self.Z3_enabled: + gc.collect() + get_accelerator().empty_cache() + if self._t_start is None: + self._t_start = time.time() + + def train(self, mode=True): + if mode and len(self._orig_modules) > 0: + for inference_container, orig_module, orig_fwd in zip(self._inference_containers, self._orig_modules, + self._orig_fwds): + inference_container.partition_merged_qkv() + orig_module.forward = orig_fwd + for orig_module, orig_fwd in zip(self._orig_modules_others, self._orig_fwds_others): + orig_module.forward = orig_fwd + super().train(mode) + if mode: + self._training_start_time = time.time() + + def step(self, lr_kwargs=None): + super().step(lr_kwargs=lr_kwargs) + + if len(self._inference_containers) > 0: + if(self._inference_containers[0].module.attention.attn_qkvw is not None and \ + self._inference_containers[0].q_k_v is not None): + for inference_container in self._inference_containers: + inference_container.reset_qkv() + + if self._training_start_time is not None: + self._training_latency += (time.time() - self._training_start_time) + self._training_start_time = time.time() diff --git a/deepspeed/runtime/lr_schedules.py b/deepspeed/runtime/lr_schedules.py index faf5e6fee910514eefe33a5668ee0f2aa65b28e2..d2bd93d8ee31563f32e84615758e5b7a3f827c91 100755 --- a/deepspeed/runtime/lr_schedules.py +++ b/deepspeed/runtime/lr_schedules.py @@ -1,11 +1,12 @@ -""" -Copyright 2019 The Microsoft DeepSpeed Team +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 +# DeepSpeed Team +""" Implementation of learning rate schedules. 
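Editor's note on the new `DeepSpeedHybridEngine` above: it is driven entirely by the `hybrid_engine` section of the DeepSpeed config (`pin_parameters`, `max_out_tokens`, `inference_tp_size`, `release_inference_cache`, and `tp_gather_partition_size` all appear in the engine code). A hedged sketch of that section as a Python config dict; the `enabled` key and all values are illustrative assumptions, not taken from this patch:

```python
# Hedged sketch: hybrid_engine knobs referenced by DeepSpeedHybridEngine,
# expressed as a config dict (would typically be passed to deepspeed.initialize).
ds_config = {
    "train_batch_size": 32,
    "fp16": {"enabled": True},
    "hybrid_engine": {
        "enabled": True,                   # assumed switch, not shown in this hunk
        "max_out_tokens": 512,             # sizes the inference workspace; min_out_tokens is set to the same value above
        "inference_tp_size": 1,            # >1 gathers layers and runs tensor-parallel generation
        "release_inference_cache": False,  # if True, the workspace is retaken before each generate()
        "pin_parameters": True,            # corresponds to self.gather_all_layers in the engine
        "tp_gather_partition_size": 8,     # layers gathered per group when inference_tp_size > 1
    },
}
```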
Taken and modified from PyTorch v1.0.1 source https://github.com/pytorch/pytorch/blob/v1.1.0/torch/optim/lr_scheduler.py - """ import argparse @@ -53,28 +54,15 @@ TOTAL_NUM_STEPS = 'total_num_steps' def add_tuning_arguments(parser): - group = parser.add_argument_group('Convergence Tuning', - 'Convergence tuning configurations') + group = parser.add_argument_group('Convergence Tuning', 'Convergence tuning configurations') # LR scheduler - group.add_argument('--lr_schedule', - type=str, - default=None, - help='LR schedule for training.') + group.add_argument('--lr_schedule', type=str, default=None, help='LR schedule for training.') # Learning rate range test - group.add_argument("--lr_range_test_min_lr", - type=float, - default=0.001, - help='Starting lr value.') - group.add_argument("--lr_range_test_step_rate", - type=float, - default=1.0, - help='scaling rate for LR range test.') - group.add_argument("--lr_range_test_step_size", - type=int, - default=1000, - help='training steps per LR change.') + group.add_argument("--lr_range_test_min_lr", type=float, default=0.001, help='Starting lr value.') + group.add_argument("--lr_range_test_step_rate", type=float, default=1.0, help='scaling rate for LR range test.') + group.add_argument("--lr_range_test_step_size", type=int, default=1000, help='training steps per LR change.') group.add_argument("--lr_range_test_staircase", type=bool, default=False, @@ -89,66 +77,34 @@ def add_tuning_arguments(parser): type=int, default=-1, help='first stair count for 1Cycle schedule.') - group.add_argument( - "--cycle_second_step_size", - type=int, - default=-1, - help='size of second step of 1Cycle schedule (default first_step_size).') + group.add_argument("--cycle_second_step_size", + type=int, + default=-1, + help='size of second step of 1Cycle schedule (default first_step_size).') group.add_argument("--cycle_second_stair_count", type=int, default=-1, help='second stair count for 1Cycle schedule.') - group.add_argument( - "--decay_step_size", - type=int, - default=1000, - help='size of intervals for applying post cycle decay (training steps).') + group.add_argument("--decay_step_size", + type=int, + default=1000, + help='size of intervals for applying post cycle decay (training steps).') # 1Cycle LR - group.add_argument("--cycle_min_lr", - type=float, - default=0.01, - help='1Cycle LR lower bound.') - group.add_argument("--cycle_max_lr", - type=float, - default=0.1, - help='1Cycle LR upper bound.') - group.add_argument("--decay_lr_rate", - type=float, - default=0.0, - help='post cycle LR decay rate.') + group.add_argument("--cycle_min_lr", type=float, default=0.01, help='1Cycle LR lower bound.') + group.add_argument("--cycle_max_lr", type=float, default=0.1, help='1Cycle LR upper bound.') + group.add_argument("--decay_lr_rate", type=float, default=0.0, help='post cycle LR decay rate.') # 1Cycle Momentum - group.add_argument('--cycle_momentum', - default=False, - action='store_true', - help='Enable 1Cycle momentum schedule.') - group.add_argument("--cycle_min_mom", - type=float, - default=0.8, - help='1Cycle momentum lower bound.') - group.add_argument("--cycle_max_mom", - type=float, - default=0.9, - help='1Cycle momentum upper bound.') - group.add_argument("--decay_mom_rate", - type=float, - default=0.0, - help='post cycle momentum decay rate.') + group.add_argument('--cycle_momentum', default=False, action='store_true', help='Enable 1Cycle momentum schedule.') + group.add_argument("--cycle_min_mom", type=float, default=0.8, help='1Cycle momentum lower bound.') 
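Editor's note: `add_tuning_arguments` only registers plain argparse flags, so the reformatted one-line `add_argument` calls in this hunk behave identically to the old multi-line versions. A small usage sketch, using only flags visible above (values are illustrative):

```python
# Usage sketch for the convergence-tuning flags registered by add_tuning_arguments.
import argparse

from deepspeed.runtime.lr_schedules import add_tuning_arguments

parser = argparse.ArgumentParser()
add_tuning_arguments(parser)
args = parser.parse_args([
    "--lr_schedule", "OneCycle",
    "--cycle_min_lr", "0.001",
    "--cycle_max_lr", "0.01",
    "--cycle_momentum",
])
print(args.lr_schedule, args.cycle_min_lr, args.cycle_max_lr, args.cycle_momentum)
# OneCycle 0.001 0.01 True
```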
+ group.add_argument("--cycle_max_mom", type=float, default=0.9, help='1Cycle momentum upper bound.') + group.add_argument("--decay_mom_rate", type=float, default=0.0, help='post cycle momentum decay rate.') # Warmup LR - group.add_argument('--warmup_min_lr', - type=float, - default=0, - help='WarmupLR minimum/initial LR value') - group.add_argument('--warmup_max_lr', - type=float, - default=0.001, - help='WarmupLR maximum LR value.') - group.add_argument('--warmup_num_steps', - type=int, - default=1000, - help='WarmupLR step count for LR warmup.') + group.add_argument('--warmup_min_lr', type=float, default=0, help='WarmupLR minimum/initial LR value') + group.add_argument('--warmup_max_lr', type=float, default=0.001, help='WarmupLR maximum LR value.') + group.add_argument('--warmup_num_steps', type=int, default=1000, help='WarmupLR step count for LR warmup.') group.add_argument('--warmup_type', type=str, default=WARMUP_LOG_RATE, @@ -168,16 +124,13 @@ def override_lr_range_test_params(args, params): if hasattr(args, LR_RANGE_TEST_MIN_LR) and args.lr_range_test_min_lr is not None: params[LR_RANGE_TEST_MIN_LR] = args.lr_range_test_min_lr - if hasattr(args, - LR_RANGE_TEST_STEP_RATE) and args.lr_range_test_step_rate is not None: + if hasattr(args, LR_RANGE_TEST_STEP_RATE) and args.lr_range_test_step_rate is not None: params[LR_RANGE_TEST_STEP_RATE] = args.lr_range_test_step_rate - if hasattr(args, - LR_RANGE_TEST_STEP_SIZE) and args.lr_range_test_step_size is not None: + if hasattr(args, LR_RANGE_TEST_STEP_SIZE) and args.lr_range_test_step_size is not None: params[LR_RANGE_TEST_STEP_SIZE] = args.lr_range_test_step_size - if hasattr(args, - LR_RANGE_TEST_STAIRCASE) and args.lr_range_test_staircase is not None: + if hasattr(args, LR_RANGE_TEST_STAIRCASE) and args.lr_range_test_staircase is not None: params[LR_RANGE_TEST_STAIRCASE] = args.lr_range_test_staircase @@ -185,15 +138,13 @@ def override_1cycle_params(args, params): if hasattr(args, CYCLE_FIRST_STEP_SIZE) and args.cycle_first_step_size is not None: params[CYCLE_FIRST_STEP_SIZE] = args.cycle_first_step_size - if hasattr(args, - CYCLE_FIRST_STAIR_COUNT) and args.cycle_first_stair_count is not None: + if hasattr(args, CYCLE_FIRST_STAIR_COUNT) and args.cycle_first_stair_count is not None: params[CYCLE_FIRST_STAIR_COUNT] = args.cycle_first_stair_count if hasattr(args, CYCLE_SECOND_STEP_SIZE) and args.cycle_second_step_size is not None: params[CYCLE_SECOND_STEP_SIZE] = args.cycle_second_step_size - if hasattr(args, - CYCLE_SECOND_STAIR_COUNT) and args.cycle_second_stair_count is not None: + if hasattr(args, CYCLE_SECOND_STAIR_COUNT) and args.cycle_second_stair_count is not None: params[CYCLE_SECOND_STAIR_COUNT] = args.cycle_second_stair_count if hasattr(args, DECAY_STEP_SIZE) and args.decay_step_size is not None: @@ -301,8 +252,7 @@ def get_torch_optimizer(optimizer): if hasattr(optimizer, 'optimizer') and isinstance(optimizer.optimizer, Optimizer): return optimizer.optimizer - raise TypeError('{} is not a subclass of torch.optim.Optimizer'.format( - type(optimizer).__name__)) + raise TypeError('{} is not a subclass of torch.optim.Optimizer'.format(type(optimizer).__name__)) class LRRangeTest(object): @@ -343,6 +293,7 @@ class LRRangeTest(object): _A disciplined approach to neural network hyper-parameters: Part 1 -- learning rate, batch size, momentum, and weight decay: https://arxiv.org/abs/1803.09820 """ + def __init__(self, optimizer: Optimizer, lr_range_test_min_lr: float = 1e-3, @@ -353,13 +304,10 @@ class LRRangeTest(object): 
self.optimizer = get_torch_optimizer(optimizer) - if isinstance(lr_range_test_min_lr, - list) or isinstance(lr_range_test_min_lr, - tuple): + if isinstance(lr_range_test_min_lr, list) or isinstance(lr_range_test_min_lr, tuple): if len(lr_range_test_min_lr) != len(self.optimizer.param_groups): - raise ValueError("expected {} lr_range_test_min_lr, got {}".format( - len(self.optimizer.param_groups), - len(lr_range_test_min_lr))) + raise ValueError("expected {} lr_range_test_min_lr, got {}".format(len(self.optimizer.param_groups), + len(lr_range_test_min_lr))) self.min_lr = list(lr_range_test_min_lr) else: self.min_lr = [lr_range_test_min_lr] * len(self.optimizer.param_groups) @@ -384,9 +332,7 @@ class LRRangeTest(object): def get_lr(self): lr_increase = self._get_increase() - return [ - lr_range_test_min_lr * lr_increase for lr_range_test_min_lr in self.min_lr - ] + return [lr_range_test_min_lr * lr_increase for lr_range_test_min_lr in self.min_lr] def get_last_lr(self): """ Return last computed learning rate by current scheduler. @@ -480,6 +426,7 @@ class OneCycle(object): .. _A disciplined approach to neural network hyper-parameters: Part 1 -- learning rate, batch size, momentum, and weight decay: https://arxiv.org/abs/1803.09820 """ + def __init__(self, optimizer, cycle_min_lr, @@ -499,26 +446,16 @@ class OneCycle(object): self.optimizer = get_torch_optimizer(optimizer) # Initialize cycle shape - self._initialize_cycle(cycle_first_step_size, - cycle_second_step_size, - cycle_first_stair_count, - cycle_second_stair_count, - decay_step_size) + self._initialize_cycle(cycle_first_step_size, cycle_second_step_size, cycle_first_stair_count, + cycle_second_stair_count, decay_step_size) # Initialize cycle lr - self._initialize_lr(self.optimizer, - cycle_min_lr, - cycle_max_lr, - decay_lr_rate, - last_batch_iteration) + self._initialize_lr(self.optimizer, cycle_min_lr, cycle_max_lr, decay_lr_rate, last_batch_iteration) # Initialize cyclic momentum self.cycle_momentum = cycle_momentum if cycle_momentum: - self._initialize_momentum(self.optimizer, - cycle_min_mom, - cycle_max_mom, - decay_mom_rate, + self._initialize_momentum(self.optimizer, cycle_min_mom, cycle_max_mom, decay_mom_rate, last_batch_iteration) # Initialize batch iteration tracker @@ -526,16 +463,11 @@ class OneCycle(object): # Configure cycle shape - def _initialize_cycle(self, - cycle_first_step_size, - cycle_second_step_size, - cycle_first_stair_count, - cycle_second_stair_count, - decay_step_size): + def _initialize_cycle(self, cycle_first_step_size, cycle_second_step_size, cycle_first_stair_count, + cycle_second_stair_count, decay_step_size): cycle_first_step_size = float(cycle_first_step_size) cycle_second_step_size = float( - cycle_second_step_size - ) if cycle_second_step_size is not None else cycle_first_step_size + cycle_second_step_size) if cycle_second_step_size is not None else cycle_first_step_size self.total_size = cycle_first_step_size + cycle_second_step_size self.step_ratio = cycle_first_step_size / self.total_size @@ -551,12 +483,7 @@ class OneCycle(object): self.skip_mom_decay = False # Configure lr schedule - def _initialize_lr(self, - optimizer, - cycle_min_lr, - cycle_max_lr, - decay_lr_rate, - last_batch_iteration): + def _initialize_lr(self, optimizer, cycle_min_lr, cycle_max_lr, decay_lr_rate, last_batch_iteration): self.min_lrs = [cycle_min_lr] * len(optimizer.param_groups) if last_batch_iteration == -1: for lr, group in zip(self.min_lrs, optimizer.param_groups): @@ -569,12 +496,7 @@ class OneCycle(object): 
self.skip_lr_decay = True # Configure momentum schedule - def _initialize_momentum(self, - optimizer, - cycle_min_mom, - cycle_max_mom, - decay_mom_rate, - last_batch_iteration): + def _initialize_momentum(self, optimizer, cycle_min_mom, cycle_max_mom, decay_mom_rate, last_batch_iteration): if 'betas' not in optimizer.defaults: optimizer_name = type(optimizer).__name__ logger.warn( @@ -722,6 +644,7 @@ class WarmupLR(object): >>> scheduler.step() """ + def __init__(self, optimizer: Optimizer, warmup_min_lr: float = 0.0, @@ -738,9 +661,8 @@ class WarmupLR(object): self.warmup_num_steps = max(2, warmup_num_steps) # Currently only support linear and log function if warmup_type not in {WARMUP_LOG_RATE, WARMUP_LINEAR_RATE}: - logger.warning( - f"Using unknown warmup_type: {warmup_type}. The increasing function " - f"is set to default (log)") + logger.warning(f"Using unknown warmup_type: {warmup_type}. The increasing function " + f"is set to default (log)") warmup_type = WARMUP_LOG_RATE self.warmup_type = warmup_type self.inverse_log_warm_up = 1.0 / math.log(self.warmup_num_steps) @@ -748,15 +670,10 @@ class WarmupLR(object): def get_lr(self): if self.last_batch_iteration < 0: - logger.warning( - "Attempting to get learning rate from scheduler before it has started") + logger.warning("Attempting to get learning rate from scheduler before it has started") return [0.0] gamma = self._get_gamma() - return [ - min_lr + (delta_lr * gamma) for min_lr, - delta_lr in zip(self.min_lrs, - self.delta_lrs) - ] + return [min_lr + (delta_lr * gamma) for min_lr, delta_lr in zip(self.min_lrs, self.delta_lrs)] def get_last_lr(self): """ Return last computed learning rate by current scheduler. @@ -789,10 +706,8 @@ class WarmupLR(object): def _format_param(self, optimizer, param_value, param_name): if isinstance(param_value, list) or isinstance(param_value, tuple): if len(param_value) != len(optimizer.param_groups): - raise ValueError("expected {} value for {}, got {}".format( - len(optimizer.param_groups), - param_name, - FileNotFoundError(param_value))) + raise ValueError("expected {} value for {}, got {}".format(len(optimizer.param_groups), param_name, + FileNotFoundError(param_value))) return list(param_value) return [param_value] * len(optimizer.param_groups) @@ -819,6 +734,7 @@ class WarmupDecayLR(WarmupLR): >>> scheduler.step() """ + def __init__(self, optimizer: Optimizer, total_num_steps: int, @@ -829,17 +745,11 @@ class WarmupDecayLR(WarmupLR): last_batch_iteration: int = -1): self.total_num_steps = total_num_steps - super(WarmupDecayLR, - self).__init__(optimizer, - warmup_min_lr, - warmup_max_lr, - warmup_num_steps, - warmup_type, - last_batch_iteration) + super(WarmupDecayLR, self).__init__(optimizer, warmup_min_lr, warmup_max_lr, warmup_num_steps, warmup_type, + last_batch_iteration) if self.total_num_steps < self.warmup_num_steps: logger.warning('total_num_steps {} is less than warmup_num_steps {}'.format( - total_num_steps, - warmup_num_steps)) + total_num_steps, warmup_num_steps)) def _get_gamma(self): if self.last_batch_iteration < self.warmup_num_steps: @@ -850,5 +760,4 @@ class WarmupDecayLR(WarmupLR): return max( 0.0, float(self.total_num_steps - self.last_batch_iteration) / - float(max(1.0, - self.total_num_steps - self.warmup_num_steps))) + float(max(1.0, self.total_num_steps - self.warmup_num_steps))) diff --git a/deepspeed/runtime/pipe/__init__.py b/deepspeed/runtime/pipe/__init__.py index dcffaa6101b44ece10ad6e2d5a4941b95eb7ae5d..cf58eb1801af7cc02f3cc2057a5c843cd078b575 100644 --- 
a/deepspeed/runtime/pipe/__init__.py +++ b/deepspeed/runtime/pipe/__init__.py @@ -1,4 +1,7 @@ -'''Copyright The Microsoft DeepSpeed Team''' +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team from .module import PipelineModule, LayerSpec, TiedLayerSpec from .topology import ProcessTopology diff --git a/deepspeed/runtime/pipe/engine.py b/deepspeed/runtime/pipe/engine.py index a080559b1a2aa065bc5916e32659b4e4a8242929..367932a95d28d5e89d624661eeeb3b55ac5a14d0 100644 --- a/deepspeed/runtime/pipe/engine.py +++ b/deepspeed/runtime/pipe/engine.py @@ -1,4 +1,7 @@ -# Copyright 2019 The Microsoft DeepSpeed Team +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team from types import MethodType @@ -12,6 +15,8 @@ from deepspeed.accelerator import get_accelerator from ..engine import DeepSpeedEngine, MEMORY_OPT_ALLREDUCE_SIZE from ..utils import PartitionedTensor from ..dataloader import RepeatingLoader +from ..zero.config import ZeroStageEnum +from ..activation_checkpointing import checkpointing as ds_checkpointing from .module import PipelineModule, PipelineError from . import p2p @@ -41,18 +46,8 @@ class PipelineEngine(DeepSpeedEngine): is provided. """ ID_TO_DTYPE = [ - torch.float32, - torch.float64, - torch.complex64, - torch.complex128, - torch.float16, - torch.bfloat16, - torch.uint8, - torch.int8, - torch.int16, - torch.int32, - torch.int64, - torch.bool + torch.float32, torch.float64, torch.complex64, torch.complex128, torch.float16, torch.bfloat16, torch.uint8, + torch.int8, torch.int16, torch.int32, torch.int64, torch.bool ] DTYPE_TO_ID = {dtype: id_ for id_, dtype in enumerate(ID_TO_DTYPE)} @@ -134,8 +129,7 @@ class PipelineEngine(DeepSpeedEngine): if self.global_rank != min(d['ranks']): tied_params += sum(p.numel() for p in d['module'].parameters()) unique_params -= tied_params - params_tensor = torch.LongTensor(data=[num_params, - unique_params]).to(self.device) + params_tensor = torch.LongTensor(data=[num_params, unique_params]).to(self.device) dist.all_reduce(params_tensor, group=self.grid.get_model_parallel_group()) params_tensor = params_tensor.tolist() total_params = params_tensor[0] @@ -156,10 +150,10 @@ class PipelineEngine(DeepSpeedEngine): # Pipeline buffers self.num_pipe_buffers = 0 self.pipe_buffers = { - 'inputs' : [], # batch input and received activations - 'labels' : [], # labels from batch input - 'outputs' : [], # activations - 'output_tensors' : [], # tensor object to preserve backward graph + 'inputs': [], # batch input and received activations + 'labels': [], # labels from batch input + 'outputs': [], # activations + 'output_tensors': [], # tensor object to preserve backward graph } self.pipe_recv_buf = None self.grad_layer = None @@ -178,8 +172,7 @@ class PipelineEngine(DeepSpeedEngine): self.dp_group_loss = torch.tensor(0.0, requires_grad=False).to(self.device) if self._config.pipeline['activation_checkpoint_interval'] > 0: - self.module.activation_checkpoint_interval = self._config.pipeline[ - 'activation_checkpoint_interval'] + self.module.activation_checkpoint_interval = self._config.pipeline['activation_checkpoint_interval'] self.module.checkpoint_parallel_write_pipeline = self._config.checkpoint_parallel_write_pipeline @@ -220,11 +213,10 @@ class PipelineEngine(DeepSpeedEngine): self.has_attention_mask = value def _build_data_iter(self, dataset): - sampler = torch.utils.data.distributed.DistributedSampler( - dataset, - num_replicas=self.dp_world_size, - 
rank=self.mpu.get_data_parallel_rank(), - shuffle=False) + sampler = torch.utils.data.distributed.DistributedSampler(dataset, + num_replicas=self.dp_world_size, + rank=self.mpu.get_data_parallel_rank(), + shuffle=False) # Build a loader and make it repeating. pipe_dataloader = self.deepspeed_io(dataset, data_sampler=sampler) pipe_dataloader = RepeatingLoader(pipe_dataloader) @@ -251,11 +243,10 @@ class PipelineEngine(DeepSpeedEngine): self._force_grad_boundary = True if self.pipeline_enable_backward_allreduce: if self.bfloat16_enabled(): - if self.zero_optimization_stage() == 0: + if self.zero_optimization_stage() < ZeroStageEnum().gradients: self._bf16_reduce_grads() else: - assert self.zero_optimization_stage() == 1, "only bf16 + z1 are supported" - raise NotImplementedError() + raise NotImplementedError("PP+BF16 only work for ZeRO Stage 1") else: self.allreduce_gradients(bucket_size=MEMORY_OPT_ALLREDUCE_SIZE) self._force_grad_boundary = False @@ -317,8 +308,7 @@ class PipelineEngine(DeepSpeedEngine): The arithmetic mean of the losses computed this batch. """ if not torch._C.is_grad_enabled(): - raise RuntimeError( - f'train_batch() requires gradients enabled. Use eval_batch() instead.') + raise RuntimeError(f'train_batch() requires gradients enabled. Use eval_batch() instead.') # Curriculum learning could change activation shape if self.curriculum_enabled_legacy(): @@ -360,28 +350,17 @@ class PipelineEngine(DeepSpeedEngine): # Monitoring if self.global_rank == 0 and self.monitor.enabled: - self.summary_events = [(f'Train/Samples/train_loss', - self.agg_train_loss.mean().item(), + self.summary_events = [(f'Train/Samples/train_loss', self.agg_train_loss.mean().item(), self.global_samples)] self.monitor.write_events(self.summary_events) - if self.wall_clock_breakdown( - ) and self.global_steps % self.steps_per_print() == 0: - self.timers.log([ - 'pipe_send_output', - 'pipe_send_grad', - 'pipe_recv_input', - 'pipe_recv_grad' - ]) + if self.wall_clock_breakdown() and self.global_steps % self.steps_per_print() == 0: + self.timers.log(['pipe_send_output', 'pipe_send_grad', 'pipe_recv_input', 'pipe_recv_grad']) # TODO: should return precisely what loss returned and allow others to be queried? return self.agg_train_loss - def eval_batch(self, - data_iter, - return_logits=False, - compute_loss=True, - reduce_output='avg'): + def eval_batch(self, data_iter, return_logits=False, compute_loss=True, reduce_output='avg'): """Evaluate the pipeline on a batch of data from ``data_iter``. The engine will evaluate ``self.train_batch_size()`` total samples collectively across all workers. 
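Editor's note: since `train_batch()` and `eval_batch()` both pull from an iterator and aggregate the loss across the pipeline, a typical evaluation loop is just repeated `eval_batch` calls. A hedged sketch; `engine`, `eval_loader`, and `num_eval_batches` are placeholders, not defined in this patch:

```python
# Hedged usage sketch: `engine` is a PipelineEngine from deepspeed.initialize(),
# `eval_loader` / `num_eval_batches` are placeholders. Each eval_batch() call
# consumes engine.train_batch_size() samples across all pipeline workers.
eval_iter = iter(eval_loader)
for _ in range(num_eval_batches):
    loss = engine.eval_batch(eval_iter, compute_loss=True, reduce_output='avg')
# With reduce_output='avg' the returned tensor is the per-batch mean loss,
# broadcast from the last stage to every rank in the pipe group.
print(f"final eval loss: {loss.mean().item():.4f}")
```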
@@ -448,9 +427,7 @@ class PipelineEngine(DeepSpeedEngine): eval_output = self._bcast_pipe_scalar(eval_output) if self.global_rank == 0 and self.monitor.enabled: - self.summary_events = [(f'Train/Samples/eval_loss', - eval_output.mean().item(), - self.global_samples)] + self.summary_events = [(f'Train/Samples/eval_loss', eval_output.mean().item(), self.global_samples)] self.monitor.write_events(self.summary_events) # Restore the training iterator @@ -510,8 +487,7 @@ class PipelineEngine(DeepSpeedEngine): reduced /= self.dp_world_size else: for idx in range(len(reduced)): - dist.all_reduce(reduced[idx], - group=self.mpu.get_data_parallel_group()) + dist.all_reduce(reduced[idx], group=self.mpu.get_data_parallel_group()) reduced[idx] /= self.dp_world_size return reduced @@ -525,13 +501,11 @@ class PipelineEngine(DeepSpeedEngine): assert src_rank in self.grid.pp_group if self.global_rank == src_rank: - result = data.clone().detach() + result = data.clone().detach().type(dtype).to(self.device) else: result = torch.Tensor([0.]).type(dtype).to(self.device) - dist.broadcast(tensor=result, - src=src_rank, - group=self.mpu.get_pipe_parallel_group()) + dist.broadcast(tensor=result, src=src_rank, group=self.mpu.get_pipe_parallel_group()) return result @@ -550,18 +524,14 @@ class PipelineEngine(DeepSpeedEngine): assert self.global_rank in self.grid.pp_group losses = torch.Tensor([self.dp_group_loss, agg_loss]).to(self.device) - dist.broadcast(tensor=losses, - src=self.global_rank, - group=self.mpu.get_pipe_parallel_group()) + dist.broadcast(tensor=losses, src=self.global_rank, group=self.mpu.get_pipe_parallel_group()) else: # Get loss from last stage src_rank = self.grid.stage_to_global(self.num_stages - 1) assert src_rank in self.grid.pp_group losses = torch.Tensor([0., 0.]).to(self.device) - dist.broadcast(tensor=losses, - src=src_rank, - group=self.grid.get_pipe_parallel_group()) + dist.broadcast(tensor=losses, src=src_rank, group=self.grid.get_pipe_parallel_group()) self.dp_group_loss = losses[0].clone().detach() agg_loss = losses[1].clone().detach() @@ -638,10 +608,9 @@ class PipelineEngine(DeepSpeedEngine): # collect the partitioned input from the previous stage if self.is_pipe_partitioned and not self.is_first_stage(): - part_input = PartitionedTensor.from_meta( - meta=inputs[0], - local_part=inputs[1], - group=self.grid.get_slice_parallel_group()) + part_input = PartitionedTensor.from_meta(meta=inputs[0], + local_part=inputs[1], + group=self.grid.get_slice_parallel_group()) inputs = (part_input.full(), *inputs[2:]) inputs[0].requires_grad = True @@ -657,23 +626,24 @@ class PipelineEngine(DeepSpeedEngine): outputs = super().forward(inputs) + # Reset activation checkpointing buffers. 
+ # Need to call this between evaluation iterations + if not self.module.training: + ds_checkpointing.reset() + # Partition the outputs if we are not the last stage if self.is_pipe_partitioned and not self.is_last_stage(): if isinstance(outputs, tuple): first_output = outputs[0] # TODO: Improve pipe partitioning to pass multiple tensors that require grads - assert all([ - torch.is_tensor(elt) and elt.requires_grad is False - for elt in outputs[1:] - ]) + assert all([torch.is_tensor(elt) and elt.requires_grad is False for elt in outputs[1:]]) outputs_tail = outputs[1:] elif torch.is_tensor(outputs): first_output = outputs outputs_tail = [] else: raise ValueError("expecting a tensor or a tuple of tensors") - part = PartitionedTensor(tensor=first_output, - group=self.grid.get_slice_parallel_group()) + part = PartitionedTensor(tensor=first_output, group=self.grid.get_slice_parallel_group()) # Clear the large output data, but save the computation graph first_output.data = torch.zeros(1) self.pipe_buffers['output_tensors'][buffer_id] = first_output @@ -732,10 +702,9 @@ class PipelineEngine(DeepSpeedEngine): # careful to also restore the computational graph of the tensors we partitioned. if self.is_pipe_partitioned: if self.is_grad_partitioned: - part_output = PartitionedTensor.from_meta( - meta=outputs[0], - local_part=outputs[1], - group=self.grid.get_slice_parallel_group()) + part_output = PartitionedTensor.from_meta(meta=outputs[0], + local_part=outputs[1], + group=self.grid.get_slice_parallel_group()) self.pipe_buffers['output_tensors'][buffer_id].data = part_output.full() outputs = (self.pipe_buffers['output_tensors'][buffer_id], *outputs[2:]) else: @@ -746,10 +715,9 @@ class PipelineEngine(DeepSpeedEngine): grad_tensors = self.grad_layer if self.is_grad_partitioned: #print(f'RANK={self.global_rank} BEFORE-BWD restoring grad={self.grad_layer[0].size()} {self.grad_layer[1].size()}') - part_grad = PartitionedTensor.from_meta( - meta=self.grad_layer[0], - local_part=self.grad_layer[1], - group=self.grid.get_slice_parallel_group()) + part_grad = PartitionedTensor.from_meta(meta=self.grad_layer[0], + local_part=self.grad_layer[1], + group=self.grid.get_slice_parallel_group()) grad_tensors = (part_grad.full(), *grad_tensors[2:]) part_grad = None #print(f'RANK={self.global_rank} BEFORE-BWD restored grad={self.grad_layer[0].size()} {self.grad_layer[1].size()}') @@ -795,7 +763,7 @@ class PipelineEngine(DeepSpeedEngine): loaded = batch[0].clone().to(self.device).detach() loaded.requires_grad = loaded.is_floating_point() else: - assert isinstance(batch[0], tuple) + assert isinstance(batch[0], (tuple, list)) # Assume list or tuple loaded = [] for x in batch[0]: @@ -865,8 +833,7 @@ class PipelineEngine(DeepSpeedEngine): assert isinstance(tensor, torch.Tensor) send_shape = torch.LongTensor(data=tensor.size()).to(self.device) send_ndims = torch.LongTensor(data=[len(tensor.size())]).to(self.device) - send_dtype = torch.LongTensor(data=[self.DTYPE_TO_ID[tensor.dtype]]).to( - self.device) + send_dtype = torch.LongTensor(data=[self.DTYPE_TO_ID[tensor.dtype]]).to(self.device) p2p.send(send_dtype, recv_stage) p2p.send(send_ndims, recv_stage) p2p.send(send_shape, recv_stage) @@ -990,17 +957,14 @@ class PipelineEngine(DeepSpeedEngine): if isinstance(inputs, tuple): first_input = inputs[0] assert all([torch.is_tensor(elt) for elt in inputs[1:]]) - inputs_grad_tail = [ - elt.grad for elt in inputs[1:] if elt.grad is not None - ] + inputs_grad_tail = [elt.grad for elt in inputs[1:] if elt.grad is not None] elif 
torch.is_tensor(inputs): first_input = inputs inputs_grad_tail = [] else: raise ValueError("expecting a tensor or a tuple of tensors") assert torch.is_tensor(first_input) - part = PartitionedTensor(tensor=first_input.grad, - group=self.grid.get_slice_parallel_group()) + part = PartitionedTensor(tensor=first_input.grad, group=self.grid.get_slice_parallel_group()) inputs = (part.to_meta(), part.data(), *inputs_grad_tail) @@ -1060,9 +1024,7 @@ class PipelineEngine(DeepSpeedEngine): # XXX hardcode meta type if self.is_pipe_partitioned and idx == 0 and buffer.dtype != torch.long: if self.meta_buffer is None: - self.meta_buffer = torch.zeros(buffer.size(), - dtype=torch.long, - device=self.device) + self.meta_buffer = torch.zeros(buffer.size(), dtype=torch.long, device=self.device) buffer = self.meta_buffer p2p.recv(buffer, self.prev_stage) @@ -1091,10 +1053,9 @@ class PipelineEngine(DeepSpeedEngine): # XXX these shapes are hardcoded for Megatron # Restore partitioned output if it was partitioned and we are sending full gradients if self.is_pipe_partitioned and not self.is_grad_partitioned: - part_output = PartitionedTensor.from_meta( - meta=outputs[0], - local_part=outputs[1], - group=self.grid.get_slice_parallel_group()) + part_output = PartitionedTensor.from_meta(meta=outputs[0], + local_part=outputs[1], + group=self.grid.get_slice_parallel_group()) outputs[0].data = part_output.full() outputs = (outputs[0], *outputs[2:]) # save for backward @@ -1104,9 +1065,7 @@ class PipelineEngine(DeepSpeedEngine): if self.grad_layer is None: if isinstance(outputs, torch.Tensor): s = list(outputs.size()) - self.grad_layer = self._allocate_buffer(s, - dtype=outputs.dtype, - num_buffers=1)[0] + self.grad_layer = self._allocate_buffer(s, dtype=outputs.dtype, num_buffers=1)[0] else: # XXX This is a HACK # When we exchange activations/gradients, the two pipe stages @@ -1123,17 +1082,12 @@ class PipelineEngine(DeepSpeedEngine): # branches on is_grad_partitioned so we don't filter out the # metadata tensor. 
if self.is_grad_partitioned: - sizes_and_dtypes = [ - (list(t.size()), - t.dtype) for t in outputs[:2] - ] + [(list(t.size()), - t.dtype) for t in outputs[2:] if t.is_floating_point()] + sizes_and_dtypes = [(list(t.size()), t.dtype) + for t in outputs[:2]] + [(list(t.size()), t.dtype) + for t in outputs[2:] if t.is_floating_point()] else: - sizes_and_dtypes = [(list(t.size()), - t.dtype) for t in outputs - if t.is_floating_point()] - self.grad_layer = self._allocate_buffers(sizes_and_dtypes, - num_buffers=1)[0] + sizes_and_dtypes = [(list(t.size()), t.dtype) for t in outputs if t.is_floating_point()] + self.grad_layer = self._allocate_buffers(sizes_and_dtypes, num_buffers=1)[0] if isinstance(self.grad_layer, torch.Tensor): p2p.recv(self.grad_layer, self.next_stage) @@ -1142,9 +1096,7 @@ class PipelineEngine(DeepSpeedEngine): for idx, buffer in enumerate(self.grad_layer): # XXX GPT-2 hack if self.is_grad_partitioned and idx == 0 and buffer.dtype != torch.long: - buffer.data = torch.zeros(buffer.size(), - dtype=torch.long, - device=self.device) + buffer.data = torch.zeros(buffer.size(), dtype=torch.long, device=self.device) p2p.recv(buffer, self.next_stage) if self.wall_clock_breakdown(): @@ -1163,13 +1115,10 @@ class PipelineEngine(DeepSpeedEngine): self.mem_status('AFTER STEP') if self.global_rank == 0 and self.monitor.enabled: - self.summary_events = [(f'Train/Samples/lr', - self.get_lr()[0], - self.global_samples)] + self.summary_events = [(f'Train/Samples/lr', self.get_lr()[0], self.global_samples)] if self.fp16_enabled() and hasattr(self.optimizer, 'cur_scale'): - self.summary_events.append((f'Train/Samples/loss_scale', - self.optimizer.cur_scale, - self.global_samples)) + self.summary_events.append( + (f'Train/Samples/loss_scale', self.optimizer.cur_scale, self.global_samples)) self.monitor.write_events(self.summary_events) if self.wall_clock_breakdown(): @@ -1177,22 +1126,11 @@ class PipelineEngine(DeepSpeedEngine): self.timers('step').stop() if self.global_steps % self.steps_per_print() == 0: self.timers.log([ - 'batch_input', - 'forward_microstep', - 'backward_microstep', - 'backward_inner_microstep', - 'backward_allreduce_microstep', - 'backward_tied_allreduce_microstep', - 'step_microstep' + 'batch_input', 'forward_microstep', 'backward_microstep', 'backward_inner_microstep', + 'backward_allreduce_microstep', 'backward_tied_allreduce_microstep', 'step_microstep' ]) if self.global_steps % self.steps_per_print() == 0: - self.timers.log([ - 'forward', - 'backward', - 'backward_inner', - 'backward_allreduce', - 'step' - ]) + self.timers.log(['forward', 'backward', 'backward_inner', 'backward_allreduce', 'step']) def _zero_grads(self, inputs): if isinstance(inputs, torch.Tensor): @@ -1236,10 +1174,7 @@ class PipelineEngine(DeepSpeedEngine): for count in range(num_buffers): buffer = [] for shape, dtype in shapes_and_dtypes: - buffer.append( - self._allocate_zeros(shape, - dtype=dtype, - requires_grad=requires_grad)) + buffer.append(self._allocate_zeros(shape, dtype=dtype, requires_grad=requires_grad)) buffers.append(buffer) return buffers @@ -1298,11 +1233,9 @@ class PipelineEngine(DeepSpeedEngine): max_cached /= 1024**3 print( - f'RANK={rank} STAGE={self.stage_id} STEP={self.global_steps} MEMSTATS', - msg, + f'RANK={rank} STAGE={self.stage_id} STEP={self.global_steps} MEMSTATS', msg, f'current alloc={new_alloced:0.4f}GB (delta={delta_alloced:0.4f}GB max={max_alloced:0.4f}GB) ' - f'current cache={new_cached:0.4f}GB (delta={delta_cached:0.4f}GB max={max_cached:0.4f}GB)' - ) + f'current 
cache={new_cached:0.4f}GB (delta={delta_cached:0.4f}GB max={max_cached:0.4f}GB)') def module_state_dict(self): """Override hack to save a pipe model and return the directory path of the save. @@ -1318,11 +1251,10 @@ class PipelineEngine(DeepSpeedEngine): assert self._curr_ckpt_path is not None, \ "PipelineEngine expects module_state_dict() to be called from save_checkpoint()" - self.module.save_state_dict(self._curr_ckpt_path, - checkpoint_engine=self.checkpoint_engine) + self.module.save_state_dict(self._curr_ckpt_path, checkpoint_engine=self.checkpoint_engine) return None - def load_module_state_dict(self, state_dict, strict=True, custom_load_fn=None): + def load_module_state_dict(self, checkpoint, strict=True, custom_load_fn=None): """Override hack to instead use a directory path. This is important because pipeline models checkpoint by layer instead of rank. @@ -1334,6 +1266,7 @@ class PipelineEngine(DeepSpeedEngine): strict (bool, optional): Strict state loading. Defaults to True. """ assert custom_load_fn is None, "custom_load_fn not supported w. pipeline parallelism" + state_dict = checkpoint['module'] if (state_dict is not None) and (not isinstance(state_dict, str)): super().load_module_state_dict(state_dict, strict) return @@ -1367,9 +1300,7 @@ class PipelineEngine(DeepSpeedEngine): # For each instruction in the step for cmd in step_cmds: if type(cmd) not in self._INSTRUCTION_MAP: - raise RuntimeError( - f'{self.__class__.__name__} does not understand instruction {repr(cmd)}' - ) + raise RuntimeError(f'{self.__class__.__name__} does not understand instruction {repr(cmd)}') # Equivalent to: self._exec_forward_pass(buffer_id=0) self._exec_instr = MethodType(self._INSTRUCTION_MAP[type(cmd)], self) diff --git a/deepspeed/runtime/pipe/module.py b/deepspeed/runtime/pipe/module.py index acf066bb710c68eb157cb197ff99b7ecc845ef33..7bf9c7d973b197b1ca5de19fb3ab21e07a32b197 100644 --- a/deepspeed/runtime/pipe/module.py +++ b/deepspeed/runtime/pipe/module.py @@ -1,4 +1,7 @@ -'''Copyright The Microsoft DeepSpeed Team''' +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team import os import glob @@ -45,6 +48,7 @@ class LayerSpec: LayerSpec(torch.nn.Linear, self.hidden_hidden, self.out_dim)] ] """ + def __init__(self, typename, *module_args, **module_kwargs): self.typename = typename self.module_args = module_args @@ -59,9 +63,7 @@ class LayerSpec: self.global_rank = -1 def __repr__(self): - return ds_utils.call_to_str(self.typename.__name__, - self.module_args, - self.module_kwargs) + return ds_utils.call_to_str(self.typename.__name__, self.module_args, self.module_kwargs) def build(self, log=False): """Build the stored specification.""" @@ -72,13 +74,8 @@ class LayerSpec: class TiedLayerSpec(LayerSpec): - def __init__(self, - key, - typename, - *module_args, - forward_fn=None, - tied_weight_attr='weight', - **module_kwargs): + + def __init__(self, key, typename, *module_args, forward_fn=None, tied_weight_attr='weight', **module_kwargs): super().__init__(typename, *module_args, **module_kwargs) self.key = key self.forward_fn = forward_fn @@ -120,6 +117,7 @@ class PipelineModule(nn.Module): activation_checkpoint_func (callable, optional): The function to use for activation checkpointing. Defaults to ``deepspeed.checkpointing.checkpoint``. checkpointable_layers(list, optional): Checkpointable layers may not be checkpointed. Defaults to None which does not additional filtering. 
""" + def __init__(self, layers, num_stages=None, @@ -154,9 +152,7 @@ class PipelineModule(nn.Module): seed_str = self.seed_fn.__name__ except AttributeError: seed_str = None - print( - f'SEED_LAYERS={self.seed_layers} BASE_SEED={self.base_seed} SEED_FN={seed_str}' - ) + print(f'SEED_LAYERS={self.seed_layers} BASE_SEED={self.base_seed} SEED_FN={seed_str}') # Setup world info self.world_group = dist.new_group(ranks=range(dist.get_world_size())) @@ -173,15 +169,13 @@ class PipelineModule(nn.Module): if topology is None: if self.world_size % self.num_stages != 0: raise RuntimeError( - f'num_stages ({self.num_stages}) must divide distributed world size ({self.world_size})' - ) + f'num_stages ({self.num_stages}) must divide distributed world size ({self.world_size})') dp = self.world_size // num_stages topology = PipeDataParallelTopology(num_pp=num_stages, num_dp=dp) self._topo = topology # Construct communicators for pipeline topology - self._grid = PipelineParallelGrid(process_group=self.world_group, - topology=self._topo) + self._grid = PipelineParallelGrid(process_group=self.world_group, topology=self._topo) self.stage_id = self._topo.get_coord(self.global_rank).pipe @@ -245,9 +239,7 @@ class PipelineModule(nn.Module): self.forward_funcs.append(self.tied_modules[layer.key]) else: # User specified fn with args (module, input) - self.forward_funcs.append( - partial(layer.forward_fn, - self.tied_modules[layer.key])) + self.forward_funcs.append(partial(layer.forward_fn, self.tied_modules[layer.key])) # LayerSpec objects contain an nn.Module that should be allocated now. elif isinstance(layer, LayerSpec): @@ -304,8 +296,7 @@ class PipelineModule(nn.Module): idxs.append(idx) if len(idxs) == 0: - raise RuntimeError( - f"Partitioning '{layername}' found no valid layers to partition.") + raise RuntimeError(f"Partitioning '{layername}' found no valid layers to partition.") return idxs def forward(self, forward_input): @@ -327,8 +318,7 @@ class PipelineModule(nn.Module): for idx, layer in enumerate(self.forward_funcs[start:end]): self.curr_layer = idx + self._local_start if self.seed_layers: - new_seed = (self.base_seed * - local_micro_offset) + self.curr_layer + new_seed = (self.base_seed * local_micro_offset) + self.curr_layer if self.seed_fn: self.seed_fn(new_seed) else: @@ -346,8 +336,7 @@ class PipelineModule(nn.Module): num_layers = len(self.forward_funcs) x = forward_input for start_idx in range(0, num_layers, self.activation_checkpoint_interval): - end_idx = min(start_idx + self.activation_checkpoint_interval, - num_layers) + end_idx = min(start_idx + self.activation_checkpoint_interval, num_layers) funcs = self.forward_funcs[start_idx:end_idx] # Since we either pass tensors or tuples of tensors without unpacking, we @@ -356,10 +345,7 @@ class PipelineModule(nn.Module): x = (x, ) if self._is_checkpointable(funcs): - x = self.activation_checkpoint_func( - exec_range_func(start_idx, - end_idx), - *x) + x = self.activation_checkpoint_func(exec_range_func(start_idx, end_idx), *x) else: x = exec_range_func(start_idx, end_idx)(*x) return x @@ -376,19 +362,16 @@ class PipelineModule(nn.Module): # Each stage gets a simple uniform number of layers. 
if method == 'uniform': num_layers = len(self._layer_specs) - self.parts = ds_utils.partition_uniform(num_items=num_layers, - num_parts=num_stages) + self.parts = ds_utils.partition_uniform(num_items=num_layers, num_parts=num_stages) elif method == 'parameters': param_counts = self._count_layer_params() - self.parts = ds_utils.partition_balanced(weights=param_counts, - num_parts=num_stages) + self.parts = ds_utils.partition_balanced(weights=param_counts, num_parts=num_stages) elif method.startswith('type:'): layertype = method.split(':')[1] binary_weights = [0] * len(self._layer_specs) for idx in self._find_layer_type(layertype): binary_weights[idx] = 1 - self.parts = ds_utils.partition_balanced(weights=binary_weights, - num_parts=num_stages) + self.parts = ds_utils.partition_balanced(weights=binary_weights, num_parts=num_stages) elif method == 'profile': raise NotImplementedError(f'Partitioning method {method} not implemented.') else: @@ -436,8 +419,7 @@ class PipelineModule(nn.Module): def _synchronize_tied_weights(self): for key, comm in self.tied_comms.items(): dist.broadcast( - getattr(comm['module'], - comm['weight_attr']), + getattr(comm['module'], comm['weight_attr']), src=min(comm['ranks']), group=comm['group'], ) @@ -467,14 +449,9 @@ class PipelineModule(nn.Module): tied_ranks = [] for s in sorted(tied_stages): if self._grid.get_slice_parallel_world_size() > 1: - tied_ranks.append( - self._grid.stage_to_global(stage_id=s, - data=dp, - model=mp)) + tied_ranks.append(self._grid.stage_to_global(stage_id=s, data=dp, model=mp)) else: - tied_ranks.append( - self._grid.stage_to_global(stage_id=s, - data=dp)) + tied_ranks.append(self._grid.stage_to_global(stage_id=s, data=dp)) group = dist.new_group(ranks=tied_ranks) # Record this tied module if we own a local copy of it. @@ -587,7 +564,7 @@ class PipelineModule(nn.Module): start, end = 0, num_layers layer_list = self.forward_funcs[start:end] - os.makedirs(save_dir, exist_ok=True) + checkpoint_engine.makedirs(save_dir, exist_ok=True) for idx, layer in enumerate(layer_list): model_ckpt_path = self.ckpt_layer_path(save_dir, start + idx) if not hasattr(layer, 'state_dict'): @@ -599,10 +576,7 @@ class PipelineModule(nn.Module): # It is expected that the garbage collector will reclaim the cloned tensor storage to avoid memory bloat. # See https://pytorch.org/docs/stable/notes/serialization.html#preserve-storage-sharing orig_state_dict = layer.state_dict() - final_state_dict = type(orig_state_dict)( - {k: v.clone() - for k, - v in orig_state_dict.items()}) + final_state_dict = type(orig_state_dict)({k: v.clone() for k, v in orig_state_dict.items()}) checkpoint_engine.save(final_state_dict, model_ckpt_path) def load_state_dir(self, load_dir, checkpoint_engine, strict=True): @@ -616,10 +590,9 @@ class PipelineModule(nn.Module): mp_rank = self._grid.get_slice_parallel_rank() mp_world_size = self._grid.get_slice_parallel_world_size() - sd_loader = SDLoaderFactory.get_sd_loader( - model_ckpt_list, - version=2.0, - checkpoint_engine=checkpoint_engine) + sd_loader = SDLoaderFactory.get_sd_loader(model_ckpt_list, + version=2.0, + checkpoint_engine=checkpoint_engine) load_path, checkpoint, _ = sd_loader.load(mp_world_size, mp_rank, module_key=None, is_pipe_parallel=True) layer.load_state_dict(checkpoint) @@ -636,8 +609,7 @@ class PipelineModule(nn.Module): # Some layers like torch.nn.Embedding will not receive grads if checkpointed, which breaks things. # I presume it's related to the discrete inputs that cannot require_grad? Need to revisit. 
if self.__class__.__name__ in ('GPTModelPipe', 'GPT2ModelPipe'): - return all('ParallelTransformerLayerPipe' in f.__class__.__name__ - for f in funcs) + return all('ParallelTransformerLayerPipe' in f.__class__.__name__ for f in funcs) if self.checkpointable_layers is not None: return all(f.__class__.__name__ in self.checkpointable_layers for f in funcs) diff --git a/deepspeed/runtime/pipe/p2p.py b/deepspeed/runtime/pipe/p2p.py index 93c61114c558548b6bad259e7ce07f13f952b796..31b9a14bf0f42186229c3ab19cc2bbe635f91e79 100644 --- a/deepspeed/runtime/pipe/p2p.py +++ b/deepspeed/runtime/pipe/p2p.py @@ -1,6 +1,7 @@ -''' -Copyright 2019 The Microsoft DeepSpeed Team -''' +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team import pickle import typing @@ -111,12 +112,10 @@ def send_obj(msg: typing.Any, dest: int): # serialize the message msg = pickle.dumps(msg) # construct a tensor to send - msg = torch.ByteTensor(torch.ByteStorage.from_buffer(msg)).to( - get_accelerator().device_name()) + msg = torch.ByteTensor(torch.ByteStorage.from_buffer(msg)).to(get_accelerator().device_name()) # Send meta and message - length_tensor = torch.tensor([len(msg)], - dtype=torch.long).to(get_accelerator().device_name()) + length_tensor = torch.tensor([len(msg)], dtype=torch.long).to(get_accelerator().device_name()) dist.send(length_tensor, dst=dest) dist.send(msg, dst=dest) @@ -135,8 +134,7 @@ def recv_obj(sender: int) -> typing.Any: dist.recv(length, src=sender) # Receive and deserialize - msg = torch.empty(length.item(), - dtype=torch.uint8).to(get_accelerator().device_name()) + msg = torch.empty(length.item(), dtype=torch.uint8).to(get_accelerator().device_name()) dist.recv(msg, src=sender) msg = pickle.loads(msg.cpu().numpy().tobytes()) diff --git a/deepspeed/runtime/pipe/schedule.py b/deepspeed/runtime/pipe/schedule.py index 19e73da17fb96dd01f5ae25fa62b906d2a805ea2..21bf24dad8edcffd54d05d0c7092ab225e1e39dc 100644 --- a/deepspeed/runtime/pipe/schedule.py +++ b/deepspeed/runtime/pipe/schedule.py @@ -1,4 +1,7 @@ -'''Copyright The Microsoft DeepSpeed Team''' +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team from ..utils import call_to_str @@ -42,6 +45,7 @@ class PipeSchedule(ABC): stages (int): The number of pipeline stages. stage_id (int): The pipe stage that will execute the generated schedule. """ + def __init__(self, micro_batches, stages, stage_id): super().__init__() self.micro_batches = micro_batches @@ -131,6 +135,7 @@ class PipeSchedule(ABC): class InferenceSchedule(PipeSchedule): """A schedule for inferencing batches using pipeline parallelism. """ + def steps(self): """""" prev_micro_batch_id = -1 @@ -188,6 +193,7 @@ class TrainSchedule(PipeSchedule): convergence follows that of a data parallel approach with the same batch size. 
""" + def steps(self): """""" prev_micro_batch_id = -1 @@ -206,19 +212,15 @@ class TrainSchedule(PipeSchedule): # Exchange activations if is_forward: - if self._valid_micro_batch(micro_batch_id) and self._valid_stage( - self.prev_stage): - cmds.append(RecvActivation(curr_buffer)) - if self._valid_micro_batch(prev_micro_batch_id) and self._valid_stage( - self.prev_stage): + if self._valid_micro_batch(prev_micro_batch_id) and self._valid_stage(self.prev_stage): cmds.append(SendGrad(prev_buffer)) + if self._valid_micro_batch(micro_batch_id) and self._valid_stage(self.prev_stage): + cmds.append(RecvActivation(curr_buffer)) else: - if self._valid_micro_batch(prev_micro_batch_id) and self._valid_stage( - self.next_stage): - cmds.append(SendActivation(prev_buffer)) - if self._valid_micro_batch(micro_batch_id) and self._valid_stage( - self.next_stage): + if self._valid_micro_batch(micro_batch_id) and self._valid_stage(self.next_stage): cmds.append(RecvGrad(curr_buffer)) + if self._valid_micro_batch(prev_micro_batch_id) and self._valid_stage(self.next_stage): + cmds.append(SendActivation(prev_buffer)) # First/last stage loads if self.stage_id == 0 or self.stage_id == self.stages - 1: @@ -243,9 +245,14 @@ class TrainSchedule(PipeSchedule): yield cmds def num_pipe_buffers(self): - """As many buffers as the distance from this stage to the last stage. + """Return the number of pipeline buffers required for this stage. + + This is equivalent to the maximum number of in-flight forward passes, + since we need to remember the activations of forward passes in order + to run backpropagation. For synchronous 1F1B, this is equivalent to + the index difference between this stage and the last stage. """ - buffers = min(self.stages - self.stage_id + 1, self.micro_batches) + buffers = min(self.stages - self.stage_id, self.micro_batches) return max(2, buffers) def _step_to_micro_batch(self, step_id): @@ -295,6 +302,7 @@ class DataParallelSchedule(PipeSchedule): """An example schedule that trains using traditional data parallelism with gradient accumulation. """ + def steps(self): """""" for step_id in range(self.micro_batches): @@ -325,6 +333,7 @@ class PipeInstruction: Args: kwargs (optional): keyword arguments to store as members """ + def __init__(self, **kwargs): self.name = self.__class__.__name__ self.kwargs = kwargs @@ -369,6 +378,7 @@ class BufferOpInstruction(PipeInstruction): Args: buffer_id (int): the index of the pipeline buffer() to modify. """ + def __init__(self, buffer_id, **kwargs): super().__init__(buffer_id=buffer_id, **kwargs) diff --git a/deepspeed/runtime/pipe/topology.py b/deepspeed/runtime/pipe/topology.py index 6c0cd96440fbf9a785c556b1ebcb6cc4fa97073c..328c199071009669f41a6c2f531d1e741bb690f4 100644 --- a/deepspeed/runtime/pipe/topology.py +++ b/deepspeed/runtime/pipe/topology.py @@ -1,4 +1,7 @@ -# Copyright 2019 The Microsoft DeepSpeed Team +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team from deepspeed import comm as dist @@ -19,6 +22,7 @@ class ProcessTopology: Some methods return ProcessCoord namedtuples. """ + def __init__(self, axes, dims): """Create a mapping of n-dimensional tensor coordinates to linear indices. @@ -62,12 +66,7 @@ class ProcessTopology: """Return a list of the axis names in the ordering of the topology. 
""" return self.axes - def get_rank_repr(self, - rank, - omit_axes=['data', - 'pipe'], - inner_sep='_', - outer_sep='-'): + def get_rank_repr(self, rank, omit_axes=['data', 'pipe'], inner_sep='_', outer_sep='-'): """Return a string representation of a rank. This method is primarily used for checkpointing model data. @@ -181,6 +180,7 @@ class ProcessTopology: Returns: The list of ranks whose coordinates match filter_kwargs. """ + def _filter_helper(x): for key, val in filter_kwargs.items(): if getattr(x, key) != val: @@ -236,12 +236,14 @@ class PipeDataParallelTopology(ProcessTopology): reductions to use high-bandwidth intra-node links and lower-volume pipeline communications to use low-bandwidth inter-node links. """ + def __init__(self, num_pp, num_dp): super().__init__(axes=['pipe', 'data'], dims=[num_pp, num_dp]) class PipeModelDataParallelTopology(ProcessTopology): """ A topology for hybrid pipeline, model, and data parallelism. """ + def __init__(self, num_pp, num_mp, num_dp): super().__init__(axes=['pipe', 'data', 'model'], dims=[num_pp, num_dp, num_mp]) @@ -268,6 +270,7 @@ class PipelineParallelGrid: data_parallel_id = 0, or similarly [9,5] represents wrapped around stages [4,0] for data_parallel_id = 1. """ + def __init__(self, topology=None, process_group=None): # TODO use process_group if provided self.global_rank = dist.get_rank() diff --git a/deepspeed/runtime/progressive_layer_drop.py b/deepspeed/runtime/progressive_layer_drop.py index 65e1a563096d955161fbde1c8e1b344030354d42..bd3be8f13011a4c9322839ff8a631778e7fb6847 100755 --- a/deepspeed/runtime/progressive_layer_drop.py +++ b/deepspeed/runtime/progressive_layer_drop.py @@ -1,4 +1,7 @@ -'''Copyright The Microsoft DeepSpeed Team''' +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team import numpy as np from deepspeed.utils import log_dist @@ -13,6 +16,7 @@ class ProgressiveLayerDrop(object): The lower the theta value, the faster the training speed. Default value: 0.5. gamma (float): a hyper-parameter that controls how fast the drop ratio increases. Default value: 0.001. """ + def __init__(self, theta=0.5, gamma=0.001): super().__init__() @@ -29,6 +33,7 @@ class ProgressiveLayerDrop(object): return self.current_theta def update_state(self, global_step): + def _prob(x, gamma, p): return (1. - p) * np.exp(-gamma * x) + p diff --git a/deepspeed/runtime/quantize.py b/deepspeed/runtime/quantize.py index 81a7bd56991beed3937f487a54465800ca26b6e2..ad553138b2d7c8095aa1568f56a1f1a0b25e16d3 100755 --- a/deepspeed/runtime/quantize.py +++ b/deepspeed/runtime/quantize.py @@ -1,4 +1,7 @@ -'''Copyright The Microsoft DeepSpeed Team''' +# Copyright (c) Microsoft Corporation. 
+# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team import torch import math @@ -9,6 +12,7 @@ TWO_D_PARAMS = 6 class Quantizer(object): + def __init__(self, q_groups=1, q_mixed_fp16=False, @@ -39,17 +43,12 @@ class Quantizer(object): result = False for index in range(self.layer_num): if self.q_start_bits[index] != self.q_target_bits: - next_step = self.qsteps + ( - TWO_D_PARAMS * (self.layer_num if self.layer_num != 0 else 1)) + next_step = self.qsteps + (TWO_D_PARAMS * (self.layer_num if self.layer_num != 0 else 1)) if next_step >= self.q_period[index]: result = True return result - def quantize(self, - parameter_group, - overflow, - eigenvalue_enabled, - block_eigenvalue={}): + def quantize(self, parameter_group, overflow, eigenvalue_enabled, block_eigenvalue={}): if overflow and not eigenvalue_enabled: return @@ -65,7 +64,8 @@ class Quantizer(object): if block_eigenvalue is None: eigenvalue, layer_id = None, 0 else: - eigenvalue, layer_id = block_eigenvalue[param_id] if param_id in block_eigenvalue else (None, 0) + eigenvalue, layer_id = block_eigenvalue[param_id] if param_id in block_eigenvalue else (None, + 0) if eigenvalue is not None: factor = 1 + math.floor(eigenvalue * 4) p.data = self.compute_quantization(p.data, layer_id, factor) @@ -91,15 +91,11 @@ class Quantizer(object): if self.q_type == 'symmetric': scale = 2 * torch.max(torch.abs(g_min), torch.abs(g_max)) / q_range zero_point = 0. - input_flat = (input_flat / scale + p).round().clamp( - -(q_range >> 1), - (q_range >> 1) - 1) * scale + input_flat = (input_flat / scale + p).round().clamp(-(q_range >> 1), (q_range >> 1) - 1) * scale elif self.q_type == 'asymmetric': scale = (g_max - g_min) / q_range zero_point = (g_min / scale).round() * scale - input_flat = ((input_flat - zero_point) / scale + p).round().clamp( - 0, - (q_range - 1)) * scale + zero_point + input_flat = ((input_flat - zero_point) / scale + p).round().clamp(0, (q_range - 1)) * scale + zero_point output = input_flat.reshape(inputs.shape).contiguous() return output @@ -126,8 +122,7 @@ class Quantizer(object): def mixed_fp16_quantize(self, input, input_q, index): if self.q_mixed_fp16 and self.q_start_bits[index] >= (self.q_target_bits - 1): - input_q = input * self.quantize_real_ratio + ( - 1 - self.quantize_real_ratio) * input_q + input_q = input * self.quantize_real_ratio + (1 - self.quantize_real_ratio) * input_q return input_q return input_q @@ -152,15 +147,12 @@ class Quantizer(object): if self.use_quantizer_kernel: if input.start_bits <= 2: - raise ValueError( - 'Quantization bit is too low, please do it without quantization kernel!' 
- ) - input_q = ds_quantizer( - input.data.clone(), - self.q_groups, - input.start_bits, - asym=False if self.q_type == 'symmetric' else True, - sr=False if self.q_rounding == 'nearest_neighbor' else True) + raise ValueError('Quantization bit is too low, please do it without quantization kernel!') + input_q = ds_quantizer(input.data.clone(), + self.q_groups, + input.start_bits, + asym=False if self.q_type == 'symmetric' else True, + sr=False if self.q_rounding == 'nearest_neighbor' else True) else: if input.start_bits >= 3: input_flat = self.quantize_highbit(input.data, input.start_bits) diff --git a/deepspeed/runtime/sparse_tensor.py b/deepspeed/runtime/sparse_tensor.py index 49dedbe14b7d7e2b9f91d01f3c83206c53a140af..f0bb5c75530eeea5b7ef1f36a29c32f94e901843 100644 --- a/deepspeed/runtime/sparse_tensor.py +++ b/deepspeed/runtime/sparse_tensor.py @@ -1,6 +1,8 @@ -""" -Copyright 2020 The Microsoft DeepSpeed Team +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 +# DeepSpeed Team +""" Implementation of a compressed sparse tensor. Similar in functionality to TensorFlow's IndexedSlices implementation. """ @@ -10,6 +12,7 @@ import torch class SparseTensor(object): """ Compressed Sparse Tensor """ + def __init__(self, dense_tensor=None): self.orig_dense_tensor = dense_tensor self.is_sparse = dense_tensor.is_sparse @@ -29,9 +32,7 @@ class SparseTensor(object): self.dense_size = None def to_coo_tensor(self): - return torch.sparse_coo_tensor(self.indices.unsqueeze(0), - self.values, - self.dense_size) + return torch.sparse_coo_tensor(self.indices.unsqueeze(0), self.values, self.dense_size) @staticmethod def type(): @@ -40,10 +41,7 @@ class SparseTensor(object): def to_dense(self): it = self.indices.unsqueeze(1) full_indices = torch.cat([it for _ in range(self.dense_size[1])], dim=1) - return self.values.new_zeros(self.dense_size).scatter_add_( - 0, - full_indices, - self.values) + return self.values.new_zeros(self.dense_size).scatter_add_(0, full_indices, self.values) def sparse_size(self): index_size = list(self.indices.size()) diff --git a/deepspeed/runtime/state_dict_factory.py b/deepspeed/runtime/state_dict_factory.py index 1f5c97f27bd379eeb970a74ea457c67fbdaf3f3c..c2db85d1ba58f16dbe59b1dcd3d3066ab11ff6d2 100755 --- a/deepspeed/runtime/state_dict_factory.py +++ b/deepspeed/runtime/state_dict_factory.py @@ -1,6 +1,7 @@ -''' -Copyright 2020 The Microsoft DeepSpeed Team -''' +# Copyright (c) Microsoft Corporation. 
+# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team import torch import os @@ -18,6 +19,7 @@ AUTO_MODULE_KEY = 'auto' class SDLoaderFactory: + @staticmethod def get_sd_loader_json(json_file, checkpoint_engine): if isinstance(json_file, str): @@ -33,10 +35,7 @@ class SDLoaderFactory: mp_size = data.get('mp_size', 0) if sd_type.lower() in ['bloom', 'ds_model']: return data - return SDLoaderFactory.get_sd_loader(ckpt_list, - checkpoint_engine, - sd_type, - version) + return SDLoaderFactory.get_sd_loader(ckpt_list, checkpoint_engine, sd_type, version) @staticmethod def get_sd_loader(ckpt_list, checkpoint_engine, sd_type='Megatron', version=None): @@ -47,12 +46,12 @@ class SDLoaderFactory: class SDLoaderBase(ABC): + def __init__(self, ckpt_list, version, checkpoint_engine): self.module_key = None self.ckpt_list = ckpt_list self.version = version - self.checkpoint_engine = TorchCheckpointEngine( - ) if checkpoint_engine is None else checkpoint_engine + self.checkpoint_engine = TorchCheckpointEngine() if checkpoint_engine is None else checkpoint_engine self.check_ckpt_list() def load(self, @@ -99,9 +98,9 @@ class SDLoaderBase(ABC): loc: storage) if quantize: - quantizer = WeightQuantization(mlp_extra_grouping=mlp_extra_grouping, - mp_size=mp_world_size) - sd_module, all_scales = quantizer.sd_quantize_megatron(self.get_module(sd), quantize_bits, quantize_groups) + quantizer = WeightQuantization(mlp_extra_grouping=mlp_extra_grouping, mp_size=mp_world_size) + sd_module, all_scales = quantizer.sd_quantize_megatron(self.get_module(sd), quantize_bits, + quantize_groups) self.set_module(sd, sd_module) else: all_scales = None @@ -118,17 +117,10 @@ class SDLoaderBase(ABC): assert num_ckpt % mp_world_size == 0, 'Invalid checkpoints and world size for sd merge' num_to_merge = num_ckpt // mp_world_size - ckpt_list = [ - self.ckpt_list[i] for i in range(num_to_merge * mp_rank, - num_to_merge * (mp_rank + 1)) - ] + ckpt_list = [self.ckpt_list[i] for i in range(num_to_merge * mp_rank, num_to_merge * (mp_rank + 1))] logger.info(f"mp_rank: {mp_rank}, ckpt_list: {ckpt_list}") - sd_list = [ - self.checkpoint_engine.load(ckpt, - map_location=lambda storage, - loc: storage) for ckpt in ckpt_list - ] + sd_list = [self.checkpoint_engine.load(ckpt, map_location=lambda storage, loc: storage) for ckpt in ckpt_list] return sd_list def get_split_state_dict(self, mp_world_size, mp_rank): @@ -139,18 +131,15 @@ class SDLoaderBase(ABC): ckpt_index = mp_rank // num_to_split ckpt_offset = mp_rank % num_to_split - logger.info( - f"mp_rank: {mp_rank}, ckpt_list: {self.ckpt_list[ckpt_index]}, offset: {ckpt_offset}" - ) + logger.info(f"mp_rank: {mp_rank}, ckpt_list: {self.ckpt_list[ckpt_index]}, offset: {ckpt_offset}") - sd = self.checkpoint_engine.load(self.ckpt_list[ckpt_index], - map_location=lambda storage, - loc: storage) + sd = self.checkpoint_engine.load(self.ckpt_list[ckpt_index], map_location=lambda storage, loc: storage) return sd, num_to_split, ckpt_offset def _choose_module_key(self, sd): - assert not ('module' in sd and 'model' in sd), "checkpoint has both 'model' and 'module' keys, not sure how to proceed" + assert not ('module' in sd + and 'model' in sd), "checkpoint has both 'model' and 'module' keys, not sure how to proceed" assert 'module' in sd or 'model' in sd, "checkpoint contains neither 'model' or 'module' keys, not sure how to proceed" if 'module' in sd: return 'module' @@ -178,32 +167,19 @@ class SDLoaderBase(ABC): #logger.info(f'checkpoint file list: {self.ckpt_list}') assert len(self.ckpt_list) > 0 - 
sd = self.checkpoint_engine.load(self.ckpt_list[0], - map_location=lambda storage, - loc: storage) + sd = self.checkpoint_engine.load(self.ckpt_list[0], map_location=lambda storage, loc: storage) # check checkpoint count is same with saved mp_world_size if 'mp_world_size' in sd.keys(): - assert len(self.ckpt_list) == sd['mp_world_size'], f"checkpoint count {len(self.ckpt_list)} is different from saved mp_world_size {sd['mp_world_size']}" + assert len(self.ckpt_list) == sd[ + 'mp_world_size'], f"checkpoint count {len(self.ckpt_list)} is different from saved mp_world_size {sd['mp_world_size']}" @abstractmethod - def merge_state_dict(self, - mp_world_size, - mp_rank, - quantize, - quantize_bits, - groups, - mlp_extra_grouping): + def merge_state_dict(self, mp_world_size, mp_rank, quantize, quantize_bits, groups, mlp_extra_grouping): pass @abstractmethod - def split_state_dict(self, - mp_world_size, - mp_rank, - quantize, - quantize_bits, - groups, - mlp_extra_grouping): + def split_state_dict(self, mp_world_size, mp_rank, quantize, quantize_bits, groups, mlp_extra_grouping): pass @abstractmethod @@ -212,6 +188,7 @@ class SDLoaderBase(ABC): class MegatronSDLoader(SDLoaderBase): + def __init__(self, ckpt_list, version, checkpoint_engine): super().__init__(ckpt_list, version, checkpoint_engine) """ @@ -340,40 +317,27 @@ class MegatronSDLoader(SDLoaderBase): ckpt_ver = self.get_checkpoint_version(ds_sd) logger.info(f"checkpoint version: {ckpt_ver}") if quantize: - quantizer = WeightQuantization(mlp_extra_grouping=mlp_extra_grouping, - mp_size=mp_world_size) + quantizer = WeightQuantization(mlp_extra_grouping=mlp_extra_grouping, mp_size=mp_world_size) for key in keys: value_list = [sd[key] for sd in client_sd_list] if "attention.dense.weight" in key or "mlp.dense_4h_to_h.weight" in key: if quantize: - value_list = quantizer.Quantize(value_list, - quantize_bits, - groups, - key=key, - merge_dim=1) + value_list = quantizer.Quantize(value_list, quantize_bits, groups, key=key, merge_dim=1) new_client_sd[key] = torch.cat(value_list, axis=1) elif "attention.query_key_value" in key: if quantize and "attention.query_key_value.weight" in key: - value_list = quantizer.Quantize(value_list, - quantize_bits, - groups, - key=key) + value_list = quantizer.Quantize(value_list, quantize_bits, groups, key=key) new_client_sd[key] = torch.cat(value_list, axis=0) else: if quantize: new_client_sd[key] = torch.cat(value_list, axis=0) else: - new_client_sd[key] = self.merge_query_key_value( - value_list, - ckpt_ver) + new_client_sd[key] = self.merge_query_key_value(value_list, ckpt_ver) elif "mlp.dense_h_to_4h.weight" in key or "word_embeddings.weight" in key or "mlp.dense_h_to_4h.bias" in key: if quantize and "mlp.dense_h_to_4h.weight" in key: - value_list = quantizer.Quantize(value_list, - quantize_bits, - groups, - key=key) + value_list = quantizer.Quantize(value_list, quantize_bits, groups, key=key) new_client_sd[key] = torch.cat(value_list, axis=0) else: new_client_sd[key] = value_list[0] @@ -402,8 +366,7 @@ class MegatronSDLoader(SDLoaderBase): logger.info(f"checkpoint version: {ckpt_ver}") if quantize: - quantizer = WeightQuantization(mlp_extra_grouping=mlp_extra_grouping, - mp_size=mp_world_size) + quantizer = WeightQuantization(mlp_extra_grouping=mlp_extra_grouping, mp_size=mp_world_size) for key in client_sd.keys(): value = client_sd[key] @@ -419,11 +382,7 @@ class MegatronSDLoader(SDLoaderBase): if quantize and "attention.query_key_value.weight" in key: q_vals = quantizer.Quantize([value], quantize_bits, 
groups, key) value = q_vals[0] - new_client_sd[key] = self.split_query_key_value( - value, - num_to_split, - ckpt_offset, - ckpt_ver) + new_client_sd[key] = self.split_query_key_value(value, num_to_split, ckpt_offset, ckpt_ver) elif "mlp.dense_h_to_4h.weight" in key or "word_embeddings.weight" in key or "mlp.dense_h_to_4h.bias" in key or "final_linear.weight" in key: assert value.shape[0] % num_to_split == 0 split_size = value.shape[0] // num_to_split @@ -443,16 +402,11 @@ class MegatronSDLoader(SDLoaderBase): def sanity_check(self, ckpt_file_name): keys_to_check = [ - "attention.dense.weight", - "mlp.dense_4h_to_h.weight", - "attention.query_key_value", - "mlp.dense_h_to_4h.weight", - "mlp.dense_h_to_4h.bias" + "attention.dense.weight", "mlp.dense_4h_to_h.weight", "attention.query_key_value", + "mlp.dense_h_to_4h.weight", "mlp.dense_h_to_4h.bias" ] - sd = self.checkpoint_engine.load(ckpt_file_name, - map_location=lambda storage, - loc: storage) + sd = self.checkpoint_engine.load(ckpt_file_name, map_location=lambda storage, loc: storage) # partial_key is a sub-string of one key in the sd def check_key_exist(partial_key, sd): @@ -465,10 +419,9 @@ class MegatronSDLoader(SDLoaderBase): return found for key in keys_to_check: - assert check_key_exist(key, self.get_module(sd)), f'key: {key} is not found in the checkpoint {ckpt_file_name}' + assert check_key_exist(key, + self.get_module(sd)), f'key: {key} is not found in the checkpoint {ckpt_file_name}' def get_checkpoint_version(self, state_dict): # Use 0 if version info doesn't exist - return self.version if self.version is not None else state_dict.get( - 'checkpoint_version', - 0) + return self.version if self.version is not None else state_dict.get('checkpoint_version', 0) diff --git a/deepspeed/runtime/swap_tensor/__init__.py b/deepspeed/runtime/swap_tensor/__init__.py index 8c18558e94238db318152d650364e4bbb8af5099..208299fb8c50f73468d293b6fa5dca71649d62e7 100644 --- a/deepspeed/runtime/swap_tensor/__init__.py +++ b/deepspeed/runtime/swap_tensor/__init__.py @@ -1,4 +1,4 @@ -''' -Copyright 2020 The Microsoft DeepSpeed Team. -Licensed under the MIT license. -''' +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team diff --git a/deepspeed/runtime/swap_tensor/aio_config.py b/deepspeed/runtime/swap_tensor/aio_config.py index 6a7014c1863c8782d82d3c02062c49332d840b88..df4a38380089ad87f6121b836603075e08689c49 100644 --- a/deepspeed/runtime/swap_tensor/aio_config.py +++ b/deepspeed/runtime/swap_tensor/aio_config.py @@ -1,7 +1,7 @@ -''' -Copyright 2020 The Microsoft DeepSpeed Team. -Licensed under the MIT license. -''' +# Copyright (c) Microsoft Corporation. 
+# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team from deepspeed.runtime.config_utils import get_scalar_param from deepspeed.runtime.swap_tensor.constants import * @@ -19,26 +19,11 @@ def get_aio_config(param_dict): if AIO in param_dict.keys() and param_dict[AIO] is not None: aio_dict = param_dict[AIO] return { - AIO_BLOCK_SIZE: - get_scalar_param(aio_dict, - AIO_BLOCK_SIZE, - AIO_BLOCK_SIZE_DEFAULT), - AIO_QUEUE_DEPTH: - get_scalar_param(aio_dict, - AIO_QUEUE_DEPTH, - AIO_QUEUE_DEPTH_DEFAULT), - AIO_THREAD_COUNT: - get_scalar_param(aio_dict, - AIO_THREAD_COUNT, - AIO_THREAD_COUNT_DEFAULT), - AIO_SINGLE_SUBMIT: - get_scalar_param(aio_dict, - AIO_SINGLE_SUBMIT, - AIO_SINGLE_SUBMIT_DEFAULT), - AIO_OVERLAP_EVENTS: - get_scalar_param(aio_dict, - AIO_OVERLAP_EVENTS, - AIO_OVERLAP_EVENTS_DEFAULT) + AIO_BLOCK_SIZE: get_scalar_param(aio_dict, AIO_BLOCK_SIZE, AIO_BLOCK_SIZE_DEFAULT), + AIO_QUEUE_DEPTH: get_scalar_param(aio_dict, AIO_QUEUE_DEPTH, AIO_QUEUE_DEPTH_DEFAULT), + AIO_THREAD_COUNT: get_scalar_param(aio_dict, AIO_THREAD_COUNT, AIO_THREAD_COUNT_DEFAULT), + AIO_SINGLE_SUBMIT: get_scalar_param(aio_dict, AIO_SINGLE_SUBMIT, AIO_SINGLE_SUBMIT_DEFAULT), + AIO_OVERLAP_EVENTS: get_scalar_param(aio_dict, AIO_OVERLAP_EVENTS, AIO_OVERLAP_EVENTS_DEFAULT) } return AIO_DEFAULT_DICT diff --git a/deepspeed/runtime/swap_tensor/async_swapper.py b/deepspeed/runtime/swap_tensor/async_swapper.py index 45614abd794ed01a35fd9ee9e2b1d16dfedd39bc..152b5b70a51561b79570613cc20ffb82eb3f28e6 100644 --- a/deepspeed/runtime/swap_tensor/async_swapper.py +++ b/deepspeed/runtime/swap_tensor/async_swapper.py @@ -1,7 +1,8 @@ -""" -Copyright 2020 The Microsoft DeepSpeed Team. -Licensed under the MIT license. +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 +# DeepSpeed Team +""" Functionality of swapping tensors to/from (NVMe) storage devices. """ import torch @@ -15,6 +16,7 @@ ASYNC_SWAPPER_WAIT_TIMER = 'async_swap_gradient_wait' class AsyncTensorSwapper(object): + def __init__(self, aio_handle, numel_alignment, timers): self.free_buffer_index = [] self.swapping_buffer_index = [] @@ -70,9 +72,7 @@ class AsyncTensorSwapper(object): if dist.get_rank() == 0: element_size = torch.tensor([], dtype=self.dtype).element_size() swapped_GB = (self.num_elements_swapped * element_size) / (1024**3) - logger.debug( - f'{message} num_elems = {self.num_elements_swapped}, {swapped_GB:5.2f} GB' - ) + logger.debug(f'{message} num_elems = {self.num_elements_swapped}, {swapped_GB:5.2f} GB') def _swap_out_tensor(self, tensor, swap_path): assert len(self.all_buffers) > 0 diff --git a/deepspeed/runtime/swap_tensor/constants.py b/deepspeed/runtime/swap_tensor/constants.py index 752ec8dcaacf8a77df1196d1c100ce447127e911..4c9722bc4e4f24a2de542d19faf09e591fbc6f02 100644 --- a/deepspeed/runtime/swap_tensor/constants.py +++ b/deepspeed/runtime/swap_tensor/constants.py @@ -1,10 +1,10 @@ +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team """ -"Copyright 2020 The Microsoft DeepSpeed Team. -Licensed under the MIT license. 
+AIO """ -######################################### -# AIO -######################################### AIO_FORMAT = ''' "aio": { "block_size": 1048576, diff --git a/deepspeed/runtime/swap_tensor/optimizer_utils.py b/deepspeed/runtime/swap_tensor/optimizer_utils.py index 70b806c3a15f4545780388a91d4fa91b6f524546..12be256f8055d91f17ccc8bf98a7143a4bf922e0 100644 --- a/deepspeed/runtime/swap_tensor/optimizer_utils.py +++ b/deepspeed/runtime/swap_tensor/optimizer_utils.py @@ -1,7 +1,8 @@ -""" -Copyright 2020 The Microsoft DeepSpeed Team. -Licensed under the MIT license. +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 +# DeepSpeed Team +""" Functionality of swapping tensors to/from (NVMe) storage devices. """ @@ -17,6 +18,7 @@ from deepspeed.runtime.swap_tensor.utils import SwapBufferManager, SwapBufferPoo class FlattenedTensorSwapInfo(object): + def __init__(self, path, length, offset): self.path = path self.offset = offset @@ -24,6 +26,7 @@ class FlattenedTensorSwapInfo(object): class OptimizerStateSwapInfo(object): + def __init__(self, parameter, numel, base_folder): self.tensors = [] self.param_id = id(parameter) @@ -66,13 +69,8 @@ class OptimizerStateSwapInfo(object): gradient_paths = [] for offset, length in zip(offsets, lengths): if not offset in self.swapped_gradients.keys(): - path = os.path.join( - self.swap_folder, - f'{self.param_id}_gradient_{offset}_{length}.tensor.swp') - self.swapped_gradients[offset] = FlattenedTensorSwapInfo( - path, - length, - offset) + path = os.path.join(self.swap_folder, f'{self.param_id}_gradient_{offset}_{length}.tensor.swp') + self.swapped_gradients[offset] = FlattenedTensorSwapInfo(path, length, offset) gradient_paths.append(self.swapped_gradients[offset].path) @@ -86,11 +84,7 @@ class OptimizerStateSwapInfo(object): def get_swap_gradient_buffers(self, swap_buffer): assert self.numel() <= swap_buffer.numel() - return [ - swap_buffer.narrow(0, - grad.offset, - grad.length) for grad in self.swapped_gradients.values() - ] + return [swap_buffer.narrow(0, grad.offset, grad.length) for grad in self.swapped_gradients.values()] def get_swap_gradient_paths(self): return [grad.path for grad in self.swapped_gradients.values()] @@ -116,24 +110,15 @@ SWAP_OUT_GRADIENT_TIMER = 'swap_out_gradient' class OptimizerSwapper(object): - def __init__(self, - swap_config, - aio_config, - base_folder, - optimizer, - largest_numel, - device, - dtype, - timers): + + def __init__(self, swap_config, aio_config, base_folder, optimizer, largest_numel, device, dtype, timers): self.swap_config = swap_config self.aio_config = aio_config # NVMe swap management self.swap_params_info = {} self.swap_element_size = torch.tensor([], dtype=dtype).element_size() - self.swap_folder = os.path.join(base_folder, - 'optimizer', - f'rank{dist.get_rank()}') + self.swap_folder = os.path.join(base_folder, 'optimizer', f'rank{dist.get_rank()}') os.makedirs(self.swap_folder, exist_ok=True) self.optimizer = optimizer @@ -191,11 +176,7 @@ class OptimizerSwapper(object): self.timer_names.add(SWAP_OUT_GRADIENT_TIMER) self.timer_names.update(gradient_swapper.get_timer_names()) - def _swap_out_gradients(self, - parameter, - gradient_offsets, - gradient_tensors, - gradient_swapper): + def _swap_out_gradients(self, parameter, gradient_offsets, gradient_tensors, gradient_swapper): if not id(parameter) in self.swap_params_info.keys(): return @@ -205,10 +186,8 @@ class OptimizerSwapper(object): swappable_offsets = [] swappable_lengths = [] - aligned_gradients, aligned_offsets = 
self._adjust_for_misaligned_lengths( - tensors=gradient_tensors, - offsets=gradient_offsets - ) + aligned_gradients, aligned_offsets = self._adjust_for_misaligned_lengths(tensors=gradient_tensors, + offsets=gradient_offsets) self._start_timer(SWAP_OUT_GRADIENT_TIMER) for tensor, offset in zip(aligned_gradients, aligned_offsets): @@ -222,38 +201,26 @@ class OptimizerSwapper(object): if len(swappable_tensors) > 0: if not gradient_swapper.has_buffers(): - pinned_buffers = self.swap_buffer_manager.allocate_all( - num_elems=self.largest_numel, - dtype=self.dtype) + pinned_buffers = self.swap_buffer_manager.allocate_all(num_elems=self.largest_numel, dtype=self.dtype) gradient_swapper.add_buffers(pinned_buffers) - swappable_paths = swap_info.get_or_create_gradient_paths( - swappable_offsets, - swappable_lengths) + swappable_paths = swap_info.get_or_create_gradient_paths(swappable_offsets, swappable_lengths) - gradient_swapper.swap_out_tensors(tensor_list=swappable_tensors, - path_list=swappable_paths) + gradient_swapper.swap_out_tensors(tensor_list=swappable_tensors, path_list=swappable_paths) self._stop_timer(SWAP_OUT_GRADIENT_TIMER) self.timer_names.add(SWAP_OUT_GRADIENT_TIMER) - def _initialize_from_swapped_fp16_params(self, - aio_handle, - fp16_partitions_info, - fp16_num_elems, - fp16_pinned_buffers, - fp32_parameters): + def _initialize_from_swapped_fp16_params(self, aio_handle, fp16_partitions_info, fp16_num_elems, + fp16_pinned_buffers, fp32_parameters): assert len(fp32_parameters) == len(fp16_partitions_info) assert len(fp32_parameters) == len(fp16_num_elems) assert all([buffer.is_pinned() for buffer in fp16_pinned_buffers]) - fp32_swap_paths = self._get_swap_paths(parameters=fp32_parameters, - num_elems=fp16_num_elems) + fp32_swap_paths = self._get_swap_paths(parameters=fp32_parameters, num_elems=fp16_num_elems) - fp32_pinned_buffers = self.swap_buffer_manager.allocate_all( - num_elems=self.largest_numel, - dtype=self.dtype) + fp32_pinned_buffers = self.swap_buffer_manager.allocate_all(num_elems=self.largest_numel, dtype=self.dtype) fp16_buffer_numel = [buf.numel() for buf in fp16_pinned_buffers] assert all([numel >= self.largest_numel for numel in fp16_buffer_numel]), \ @@ -264,11 +231,10 @@ class OptimizerSwapper(object): curr_index = 0 while curr_index < len(fp32_parameters): - fp16_pinned_tensors = self._swap_in_fp16_params( - aio_handle=aio_handle, - fp16_num_elems=fp16_num_elems[curr_index:], - fp16_partitions_info=fp16_partitions_info[curr_index:], - fp16_swap_buffers=fp16_swap_buffers) + fp16_pinned_tensors = self._swap_in_fp16_params(aio_handle=aio_handle, + fp16_num_elems=fp16_num_elems[curr_index:], + fp16_partitions_info=fp16_partitions_info[curr_index:], + fp16_swap_buffers=fp16_swap_buffers) if dist.get_rank() == 0 and SWAPPER_DEBUG_MODE: for i, tensor in enumerate(fp16_pinned_tensors): @@ -277,11 +243,10 @@ class OptimizerSwapper(object): f'swap_in_fp16_param: fp32_id = {id(fp32_parameters[true_index])} index = {true_index} orig_num_elem = {fp16_num_elems[true_index]}, swap_num_elem = {fp16_pinned_tensors[i].numel()}' ) - swap_out_count = self._swap_out_fp16_params( - aio_handle=aio_handle, - fp32_swap_paths=fp32_swap_paths[curr_index:], - fp32_swap_buffers=fp32_swap_buffers, - fp16_pinned_tensors=fp16_pinned_tensors) + swap_out_count = self._swap_out_fp16_params(aio_handle=aio_handle, + fp32_swap_paths=fp32_swap_paths[curr_index:], + fp32_swap_buffers=fp32_swap_buffers, + fp16_pinned_tensors=fp16_pinned_tensors) assert swap_out_count == len(fp16_pinned_tensors), \ 
f"{swap_out_count} does not match {len(fp16_pinned_tensors)}" @@ -291,11 +256,7 @@ class OptimizerSwapper(object): self.swap_buffer_manager.free(fp32_pinned_buffers) - def _swap_in_fp16_params(self, - aio_handle, - fp16_num_elems, - fp16_partitions_info, - fp16_swap_buffers): + def _swap_in_fp16_params(self, aio_handle, fp16_num_elems, fp16_partitions_info, fp16_swap_buffers): assert len(fp16_num_elems) > 0 swapped_fp16_tensors = [] @@ -330,11 +291,7 @@ class OptimizerSwapper(object): return swapped_fp16_tensors - def _swap_out_fp16_params(self, - aio_handle, - fp32_swap_paths, - fp32_swap_buffers, - fp16_pinned_tensors): + def _swap_out_fp16_params(self, aio_handle, fp32_swap_paths, fp32_swap_buffers, fp16_pinned_tensors): assert len(fp16_pinned_tensors) <= len(fp32_swap_paths) swap_out_count = 0 @@ -343,11 +300,8 @@ class OptimizerSwapper(object): fp32_swap_buffers.swap_out(aio_handle) fp32_swap_buffers.reset() - pinned_tensor, _ = fp32_swap_buffers.insert_tensor( - fp16_tensor, - fp32_swap_paths[i], - self._io_aligned_numel(fp16_tensor.numel()) - ) + pinned_tensor, _ = fp32_swap_buffers.insert_tensor(fp16_tensor, fp32_swap_paths[i], + self._io_aligned_numel(fp16_tensor.numel())) assert pinned_tensor is not None swap_out_count += 1 @@ -359,15 +313,12 @@ class OptimizerSwapper(object): def _initialize_parameters(self, parameters, src_tensors, aio_handle): assert len(parameters) == len(src_tensors) - swap_paths = self._get_swap_paths(parameters=parameters, - num_elems=[src.numel() for src in src_tensors]) + swap_paths = self._get_swap_paths(parameters=parameters, num_elems=[src.numel() for src in src_tensors]) SWAP_INIT_TIMER = "swap_init_write" self._start_timer(SWAP_INIT_TIMER) - pinned_buffers = self.swap_buffer_manager.allocate_all( - num_elems=self.largest_numel, - dtype=self.dtype) + pinned_buffers = self.swap_buffer_manager.allocate_all(num_elems=self.largest_numel, dtype=self.dtype) assert pinned_buffers is not None self._swap_out_unpinned_tensors(aio_handle=aio_handle, @@ -397,11 +348,7 @@ class OptimizerSwapper(object): swap_paths = [info.swap_paths[0] for info in swap_info_list] return swap_paths - def _swap_out_unpinned_tensors(self, - aio_handle, - unpinned_tensors, - dest_paths, - pinned_buffers): + def _swap_out_unpinned_tensors(self, aio_handle, unpinned_tensors, dest_paths, pinned_buffers): swap_buffer_count = len(pinned_buffers) unpinned_tensor_count = len(unpinned_tensors) @@ -441,8 +388,7 @@ class OptimizerSwapper(object): continue # Split into two by making remainder a tensor - aligned_length = (orig_tensor.numel() // - self.numel_alignment) * self.numel_alignment + aligned_length = (orig_tensor.numel() // self.numel_alignment) * self.numel_alignment new_tensors.append(orig_tensor.narrow(0, 0, aligned_length)) new_offsets.append(orig_offset) @@ -489,10 +435,9 @@ class OptimizerSwapper(object): param_id = id(parameter) assert not param_id in self.swap_params_info - self.swap_params_info[param_id] = OptimizerStateSwapInfo( - parameter=parameter, - numel=numel, - base_folder=self.swap_folder) + self.swap_params_info[param_id] = OptimizerStateSwapInfo(parameter=parameter, + numel=numel, + base_folder=self.swap_folder) swap_info = self.swap_params_info[param_id] self._update_param_state_info(swap_info, parameter) diff --git a/deepspeed/runtime/swap_tensor/partitioned_optimizer_swapper.py b/deepspeed/runtime/swap_tensor/partitioned_optimizer_swapper.py index 515853529e0c797ac5603f95d1924526ee394753..677bc2aa4a8e816da7dc34079dd235dfe407d34f 100644 --- 
a/deepspeed/runtime/swap_tensor/partitioned_optimizer_swapper.py +++ b/deepspeed/runtime/swap_tensor/partitioned_optimizer_swapper.py @@ -1,7 +1,8 @@ -""" -Copyright 2020 The Microsoft DeepSpeed Team -Licensed under the MIT license. +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 +# DeepSpeed Team +""" Functionality of swapping optimizer tensors to/from (NVMe) storage devices. """ @@ -25,30 +26,14 @@ SWAP_IN_GRADIENT_TIMER = 'swap_in_gradient' class PartitionedOptimizerSwapper(OptimizerSwapper): - def __init__(self, - swap_config, - aio_config, - base_folder, - optimizer, - largest_numel, - device, - dtype, - timers): - super(PartitionedOptimizerSwapper, - self).__init__(swap_config, - aio_config, - base_folder, - optimizer, - largest_numel, - device, - dtype, - timers) + + def __init__(self, swap_config, aio_config, base_folder, optimizer, largest_numel, device, dtype, timers): + super(PartitionedOptimizerSwapper, self).__init__(swap_config, aio_config, base_folder, optimizer, + largest_numel, device, dtype, timers) aio_op = AsyncIOBuilder().load() - self.aio_handle = aio_op.aio_handle(aio_config[AIO_BLOCK_SIZE], - aio_config[AIO_QUEUE_DEPTH], - aio_config[AIO_SINGLE_SUBMIT], - aio_config[AIO_OVERLAP_EVENTS], + self.aio_handle = aio_op.aio_handle(aio_config[AIO_BLOCK_SIZE], aio_config[AIO_QUEUE_DEPTH], + aio_config[AIO_SINGLE_SUBMIT], aio_config[AIO_OVERLAP_EVENTS], aio_config[AIO_THREAD_COUNT]) # Overlap swapping out @@ -56,33 +41,21 @@ class PartitionedOptimizerSwapper(OptimizerSwapper): numel_alignment=self.numel_alignment, timers=self.timers) - self.print_exclude_list += [ - 'aio_handle', - 'gradient_swapper', - 'print_exclude_list' - ] + self.print_exclude_list += ['aio_handle', 'gradient_swapper', 'print_exclude_list'] if dist.get_rank() == 0: - print_object(obj=self, - name='PartitionedOptimizerSwapper', - exclude_list=self.print_exclude_list) + print_object(obj=self, name='PartitionedOptimizerSwapper', exclude_list=self.print_exclude_list) def initialize_parameters(self, parameters, src_tensors): - self._initialize_parameters(parameters=parameters, - src_tensors=src_tensors, - aio_handle=self.aio_handle) - - def initialize_from_swapped_fp16_params(self, - fp16_partitions_info, - fp16_num_elems, - fp16_pinned_buffers, + self._initialize_parameters(parameters=parameters, src_tensors=src_tensors, aio_handle=self.aio_handle) + + def initialize_from_swapped_fp16_params(self, fp16_partitions_info, fp16_num_elems, fp16_pinned_buffers, fp32_parameters): - self._initialize_from_swapped_fp16_params( - aio_handle=self.aio_handle, - fp16_partitions_info=fp16_partitions_info, - fp16_num_elems=fp16_num_elems, - fp16_pinned_buffers=fp16_pinned_buffers, - fp32_parameters=fp32_parameters) + self._initialize_from_swapped_fp16_params(aio_handle=self.aio_handle, + fp16_partitions_info=fp16_partitions_info, + fp16_num_elems=fp16_num_elems, + fp16_pinned_buffers=fp16_pinned_buffers, + fp32_parameters=fp32_parameters) def flush_gradients(self): self._flush_gradient_swapper(self.gradient_swapper) @@ -94,8 +67,7 @@ class PartitionedOptimizerSwapper(OptimizerSwapper): self._flush_gradient_swapper(self.gradient_swapper) - required_buffer_count = len( - swap_info.tensors) + (1 if swap_info.has_gradients() else 0) + required_buffer_count = len(swap_info.tensors) + (1 if swap_info.has_gradients() else 0) aligned_numel = self._io_aligned_numel(swap_info.numel()) pinned_buffers = self.swap_buffer_manager.allocate(num_elems=aligned_numel, count=required_buffer_count, @@ -111,9 +83,7 
@@ class PartitionedOptimizerSwapper(OptimizerSwapper): self.timer_names.add(SWAP_IN_PARAM_TIMER) self._start_timer(SWAP_IN_GRADIENT_TIMER) - self._swap_in_gradients(aio_handle=self.aio_handle, - parameter=parameter, - dest_buffer=pinned_buffers[-1]) + self._swap_in_gradients(aio_handle=self.aio_handle, parameter=parameter, dest_buffer=pinned_buffers[-1]) self._stop_timer(SWAP_IN_GRADIENT_TIMER) self.timer_names.add(SWAP_IN_GRADIENT_TIMER) @@ -125,10 +95,7 @@ class PartitionedOptimizerSwapper(OptimizerSwapper): self._start_timer(SWAP_OUT_PARAM_TIMER) pinned_tensors, pinned_paths, unpinned_tensors, unpinned_paths = self._separate_pinned_tensors(swap_info) - swap_bytes = sum([ - self._io_aligned_numel(t.numel()) * t.element_size() - for t in swap_info.tensors - ]) + swap_bytes = sum([self._io_aligned_numel(t.numel()) * t.element_size() for t in swap_info.tensors]) WRITE_TIMER = 'swap_submit_write' self._start_timer(WRITE_TIMER) @@ -139,9 +106,7 @@ class PartitionedOptimizerSwapper(OptimizerSwapper): t.data = torch.Tensor() if len(unpinned_tensors) > 0: - pinned_buffers = self.swap_buffer_manager.allocate_all( - num_elems=self.largest_numel, - dtype=self.dtype) + pinned_buffers = self.swap_buffer_manager.allocate_all(num_elems=self.largest_numel, dtype=self.dtype) self._swap_out_unpinned_tensors(aio_handle=self.aio_handle, unpinned_tensors=unpinned_tensors, dest_paths=unpinned_paths, @@ -176,8 +141,7 @@ class PartitionedOptimizerSwapper(OptimizerSwapper): assert len(swap_info.tensors) <= len(dest_buffers) - swap_lengths = [self._io_aligned_numel(swap_info.numel())] * len( - swap_info.tensors) + swap_lengths = [self._io_aligned_numel(swap_info.numel())] * len(swap_info.tensors) swap_buffers = get_sized_buffers(dest_buffers, swap_lengths) READ_TIMER = 'swap_submit_read_param' @@ -187,8 +151,7 @@ class PartitionedOptimizerSwapper(OptimizerSwapper): swap_in_tensors(aio_handle, swap_buffers, swap_info.swap_paths) self._stop_timer(READ_TIMER) - swap_bytes = sum( - [buffer.numel() * buffer.element_size() for buffer in swap_buffers]) + swap_bytes = sum([buffer.numel() * buffer.element_size() for buffer in swap_buffers]) self._start_timer(WAIT_TIMER) aio_handle.wait() @@ -223,11 +186,7 @@ class PartitionedOptimizerSwapper(OptimizerSwapper): def _swap_in_pinned_gradients(self, aio_handle, parameter, gradient_tensor): swap_info = self.swap_params_info[id(parameter)] param_gradients = swap_info.swapped_gradients.values() - swap_buffers = [ - gradient_tensor.narrow(0, - grad.offset, - grad.length) for grad in param_gradients - ] + swap_buffers = [gradient_tensor.narrow(0, grad.offset, grad.length) for grad in param_gradients] swap_paths = [grad.path for grad in param_gradients] SWAP_READ_GRADIENTS = 'swap_submit_read_gradient' SWAP_WAIT_GRADIENTS = 'swap_submit_wait_gradient' @@ -256,5 +215,4 @@ class PartitionedOptimizerSwapper(OptimizerSwapper): self._swap_in_pinned_gradients(aio_handle, parameter, parameter.grad) if swap_info.unswapped_gradients: - self._retrieve_unswapped_grad_partitions(swap_info=swap_info, - dest_buffer=parameter.grad) + self._retrieve_unswapped_grad_partitions(swap_info=swap_info, dest_buffer=parameter.grad) diff --git a/deepspeed/runtime/swap_tensor/partitioned_param_swapper.py b/deepspeed/runtime/swap_tensor/partitioned_param_swapper.py index e9e134e6779aea1511439df38010c8ec9f927d19..4109e095414875f217d12efc76f16b72303a074a 100644 --- a/deepspeed/runtime/swap_tensor/partitioned_param_swapper.py +++ b/deepspeed/runtime/swap_tensor/partitioned_param_swapper.py @@ -1,7 +1,8 @@ -""" 
-Copyright 2020 The Microsoft DeepSpeed Team. -Licensed under the MIT license. +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 +# DeepSpeed Team +""" Functionality of swapping tensors to/from (NVMe) storage devices. """ @@ -33,6 +34,7 @@ class PartitionedParamStatus(Enum): class AsyncPartitionedParameterSwapper(object): + def __init__(self, ds_config, model_dtype): aio_op = AsyncIOBuilder().load(verbose=False) @@ -74,9 +76,7 @@ class AsyncPartitionedParameterSwapper(object): if dist.get_rank() == 0: exclude_list = ['aio_read_handle', 'aio_write_handle', 'buffers'] - print_object(obj=self, - name='AsyncPartitionedParameterSwapper', - exclude_list=exclude_list) + print_object(obj=self, name='AsyncPartitionedParameterSwapper', exclude_list=exclude_list) def available_swap_in_buffers(self): return len(self.available_buffer_ids) @@ -84,9 +84,7 @@ class AsyncPartitionedParameterSwapper(object): def _configure_aio(self, ds_config): self.swap_config = ds_config.zero_config.offload_param torch_dtype_string = str(self.dtype).split(".")[1] - self.swap_folder = os.path.join(self.swap_config.nvme_path, - 'zero_stage_3', - f'{torch_dtype_string}params', + self.swap_folder = os.path.join(self.swap_config.nvme_path, 'zero_stage_3', f'{torch_dtype_string}params', f'rank{dist.get_rank()}') shutil.rmtree(self.swap_folder, ignore_errors=True) os.makedirs(self.swap_folder, exist_ok=True) @@ -101,8 +99,7 @@ class AsyncPartitionedParameterSwapper(object): self.numel_alignment = self.aligned_bytes // self.swap_element_size self.elements_per_buffer = self.swap_config.buffer_size - self.aligned_elements_per_buffer = self._io_aligned_numel( - self.elements_per_buffer) + self.aligned_elements_per_buffer = self._io_aligned_numel(self.elements_per_buffer) self.param_buffer_count = self.swap_config.buffer_count self.available_buffer_ids = [i for i in range(self.param_buffer_count)] @@ -112,17 +109,13 @@ class AsyncPartitionedParameterSwapper(object): dtype=self.dtype, requires_grad=False)) - self.aio_read_handle = self.aio_handle(self.aio_config[AIO_BLOCK_SIZE], - self.aio_config[AIO_QUEUE_DEPTH], - self.aio_config[AIO_SINGLE_SUBMIT], - self.aio_config[AIO_OVERLAP_EVENTS], + self.aio_read_handle = self.aio_handle(self.aio_config[AIO_BLOCK_SIZE], self.aio_config[AIO_QUEUE_DEPTH], + self.aio_config[AIO_SINGLE_SUBMIT], self.aio_config[AIO_OVERLAP_EVENTS], self.aio_config[AIO_THREAD_COUNT]) - self.aio_write_handle = self.aio_handle(self.aio_config[AIO_BLOCK_SIZE], - self.aio_config[AIO_QUEUE_DEPTH], + self.aio_write_handle = self.aio_handle(self.aio_config[AIO_BLOCK_SIZE], self.aio_config[AIO_QUEUE_DEPTH], self.aio_config[AIO_SINGLE_SUBMIT], - self.aio_config[AIO_OVERLAP_EVENTS], - self.aio_config[AIO_THREAD_COUNT]) + self.aio_config[AIO_OVERLAP_EVENTS], self.aio_config[AIO_THREAD_COUNT]) self.swap_out_params = [] @@ -147,8 +140,7 @@ class AsyncPartitionedParameterSwapper(object): param_path = self.id_to_path[param_id] else: assert not must_exist, f"Path for param id {param_id} does not exist" - param_path = os.path.join(self.swap_folder, - f'{param_id}_param.tensor.swp') + param_path = os.path.join(self.swap_folder, f'{param_id}_param.tensor.swp') self.id_to_path[param_id] = param_path paths.append(param_path) @@ -177,18 +169,16 @@ class AsyncPartitionedParameterSwapper(object): for param in params: param_id = param.ds_id assert param_id in self.param_id_to_numel.keys(), f" Number of elements in param {param_id} is unknown" - assert param_id not in self.param_id_to_buffer_id.keys(), f"param 
{param_id} already assigned swap buffer id {self.param_id_to_buffer_id[param_id]}" - assert param_id not in self.param_id_to_swap_buffer.keys(), f"param {param_id} has already been assigned a swap buffer" + assert param_id not in self.param_id_to_buffer_id.keys( + ), f"param {param_id} already assigned swap buffer id {self.param_id_to_buffer_id[param_id]}" + assert param_id not in self.param_id_to_swap_buffer.keys( + ), f"param {param_id} has already been assigned a swap buffer" buffer_id = self.available_buffer_ids.pop() - print_rank_0( - f"param {param.ds_id} is assigned swap in buffer id {buffer_id} ") + print_rank_0(f"param {param.ds_id} is assigned swap in buffer id {buffer_id} ") self.param_id_to_buffer_id[param_id] = buffer_id aligned_swap_numel = self._io_aligned_numel(self.param_id_to_numel[param_id]) - swap_buffer = self.buffers.narrow( - 0, - int(buffer_id * self.aligned_elements_per_buffer), - aligned_swap_numel) + swap_buffer = self.buffers.narrow(0, int(buffer_id * self.aligned_elements_per_buffer), aligned_swap_numel) self.param_id_to_swap_buffer[param_id] = swap_buffer compute_buffer = swap_buffer.narrow(0, 0, self.param_id_to_numel[param_id]) @@ -217,9 +207,7 @@ class AsyncPartitionedParameterSwapper(object): for param, swap_in_buffer in zip(self.inflight_params, self.inflight_swap_in_buffers): param_id = param.ds_id - compute_buffer = swap_in_buffer.narrow(0, - 0, - self.param_id_to_numel[param_id]) + compute_buffer = swap_in_buffer.narrow(0, 0, self.param_id_to_numel[param_id]) param.ds_tensor.data = compute_buffer.data param.ds_tensor.status = PartitionedParamStatus.AVAILABLE @@ -289,7 +277,8 @@ class AsyncPartitionedParameterSwapper(object): #assigns an in memory buffer and swaps in from nvme def swap_in(self, params, async_op=True, swap_in_buffers=None): - assert all([param.ds_tensor.status == PartitionedParamStatus.NOT_AVAILABLE for param in params]), "Some params are already available or in flight" + assert all([param.ds_tensor.status == PartitionedParamStatus.NOT_AVAILABLE + for param in params]), "Some params are already available or in flight" swap_in_paths = self._get_swap_paths(params) if swap_in_buffers is None: @@ -305,7 +294,9 @@ class AsyncPartitionedParameterSwapper(object): f'Num available params: count = {len(self.available_params)}, ids = {self.available_params}, numel = {self.available_numel}', force=True) - assert len(swap_in_paths) <= len(self.available_buffer_ids), f"Not enough buffers {len(self.available_buffer_ids)} for swapping {len(swap_in_paths)}" + assert len(swap_in_paths) <= len( + self.available_buffer_ids + ), f"Not enough buffers {len(self.available_buffer_ids)} for swapping {len(swap_in_paths)}" compute_buffers, swap_in_buffers = self._allocate_and_return_buffers_for_swap_in(params) inflight_numel = sum([t.numel() for t in compute_buffers]) else: @@ -322,8 +313,7 @@ class AsyncPartitionedParameterSwapper(object): def swap_into_buffer(self, param, dest_buffer): assert param.ds_tensor.status == PartitionedParamStatus.NOT_AVAILABLE, f"param {param.ds_id} is already available or inflight" - require_swap_buffer = not (dest_buffer.is_pinned() - and self._is_io_aligned(dest_buffer.numel())) + require_swap_buffer = not (dest_buffer.is_pinned() and self._is_io_aligned(dest_buffer.numel())) if require_swap_buffer: assert len(self.available_buffer_ids) > 0, f"No buffer available to swap param {param.ds_id}." 
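# --- Editorial sketch (not part of the patch) ---------------------------------
# The reformatted hunks above repeatedly round tensor sizes with `_io_aligned_numel`
# before narrowing the pinned buffer pool, and `swap_into_buffer` only accepts a
# destination buffer that is both pinned and I/O aligned. A minimal, self-contained
# sketch of that rounding rule, assuming an element-granularity `numel_alignment`
# as used by the swapper (the function name and the example alignment value below
# are illustrative, not the library API):

def io_aligned_numel(numel: int, numel_alignment: int) -> int:
    """Round `numel` up to the next multiple of `numel_alignment` so async NVMe
    reads and writes stay block-aligned."""
    remainder = numel % numel_alignment
    return numel if remainder == 0 else numel + numel_alignment - remainder

# Example: with a hypothetical 1024-byte I/O block and fp16 (2-byte) elements,
# numel_alignment would be 512, so io_aligned_numel(1000, 512) == 1024; the swap
# buffer carved out of the pinned pool is narrowed to 1024 elements even though
# only the first 1000 are exposed to compute as the parameter tensor.
# -------------------------------------------------------------------------------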
@@ -348,17 +338,15 @@ class AsyncPartitionedParameterSwapper(object): def get_buffer(self, param, numel): param_id = param.ds_id - assert self.available_swap_in_buffers() > 0, f"No swap buffers to allocate for fp16 param {param_id} of numel = {numel}" + assert self.available_swap_in_buffers( + ) > 0, f"No swap buffers to allocate for fp16 param {param_id} of numel = {numel}" assert numel < self.elements_per_buffer, f"More elements {numel} than buffer size {self.elements_per_buffer}" self.param_id_to_numel[param_id] = numel buffer_id = self.available_buffer_ids.pop() self.param_id_to_buffer_id[param_id] = buffer_id aligned_swap_numel = self._io_aligned_numel(self.param_id_to_numel[param_id]) - swap_buffer = self.buffers.narrow( - 0, - int(buffer_id * self.aligned_elements_per_buffer), - aligned_swap_numel) + swap_buffer = self.buffers.narrow(0, int(buffer_id * self.aligned_elements_per_buffer), aligned_swap_numel) self.param_id_to_swap_buffer[param_id] = swap_buffer compute_buffer = swap_buffer.narrow(0, 0, self.param_id_to_numel[param_id]) @@ -369,8 +357,7 @@ class AsyncPartitionedParameterSwapper(object): buffers = [] for id in self.available_buffer_ids: buffers.append( - self.buffers.narrow(0, - int(id * self.aligned_elements_per_buffer), + self.buffers.narrow(0, int(id * self.aligned_elements_per_buffer), int(self.aligned_elements_per_buffer))) self.reserved_buffer_ids.append(id) @@ -390,12 +377,9 @@ class AsyncPartitionedParameterSwapper(object): return (numel % self.numel_alignment) == 0 def reserve_partitioned_swap_space(self, partition_num_elems): - aligned_numel = sum( - [self._io_aligned_numel(numel) for numel in partition_num_elems]) + aligned_numel = sum([self._io_aligned_numel(numel) for numel in partition_num_elems]) self.partitioned_swap_buffer = get_accelerator().pin_memory( - torch.zeros(aligned_numel, - device='cpu', - dtype=self.dtype)) + torch.zeros(aligned_numel, device='cpu', dtype=self.dtype)) self.partitioned_swap_pool = SwapBufferPool([self.partitioned_swap_buffer]) def swap_out_partitioned_params(self, dst_fp16_params, src_fp32_params): @@ -408,11 +392,8 @@ class AsyncPartitionedParameterSwapper(object): self.synchronize_writes() self.partitioned_swap_pool.reset() for i, fp32_tensor in enumerate(src_fp32_params): - swap_tensor, _ = self.partitioned_swap_pool.insert_tensor( - fp32_tensor, - fp16_swap_paths[i], - self._io_aligned_numel(fp32_tensor.numel()) - ) + swap_tensor, _ = self.partitioned_swap_pool.insert_tensor(fp32_tensor, fp16_swap_paths[i], + self._io_aligned_numel(fp32_tensor.numel())) assert swap_tensor is not None dst_fp16_params[i].ds_tensor.status = PartitionedParamStatus.AVAILABLE diff --git a/deepspeed/runtime/swap_tensor/pipelined_optimizer_swapper.py b/deepspeed/runtime/swap_tensor/pipelined_optimizer_swapper.py index 4e101528cd231224d5431cc3e43be4900d293770..cb00e3dc2fad233ba994a045ca436f839f63053c 100644 --- a/deepspeed/runtime/swap_tensor/pipelined_optimizer_swapper.py +++ b/deepspeed/runtime/swap_tensor/pipelined_optimizer_swapper.py @@ -1,7 +1,8 @@ -""" -Copyright 2020 The Microsoft DeepSpeed Team. -Licensed under the MIT license. +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 +# DeepSpeed Team +""" Functionality of swapping optimizer tensors to/from (NVMe) storage devices. 
""" @@ -16,13 +17,8 @@ from deepspeed.runtime.swap_tensor.optimizer_utils import OptimizerSwapper class OptimizerSwapOp(object): - def __init__(self, - aio_handle, - read_op, - param_info, - allocated_buffers, - state_buffers, - num_ops): + + def __init__(self, aio_handle, read_op, param_info, allocated_buffers, state_buffers, num_ops): self.aio_handle = aio_handle self.read_op = read_op self.param_info = param_info @@ -53,36 +49,18 @@ ASYNC_SWAP_OUT_STATE_TIMER = 'async_swap_out_state' class PipelinedOptimizerSwapper(OptimizerSwapper): - def __init__(self, - swap_config, - aio_config, - base_folder, - optimizer, - largest_numel, - device, - dtype, - timers): - super(PipelinedOptimizerSwapper, - self).__init__(swap_config, - aio_config, - base_folder, - optimizer, - largest_numel, - device, - dtype, - timers) + + def __init__(self, swap_config, aio_config, base_folder, optimizer, largest_numel, device, dtype, timers): + super(PipelinedOptimizerSwapper, self).__init__(swap_config, aio_config, base_folder, optimizer, largest_numel, + device, dtype, timers) aio_op = AsyncIOBuilder().load() - self.write_aio_handle = aio_op.aio_handle(aio_config[AIO_BLOCK_SIZE], - aio_config[AIO_QUEUE_DEPTH], - aio_config[AIO_SINGLE_SUBMIT], - aio_config[AIO_OVERLAP_EVENTS], + self.write_aio_handle = aio_op.aio_handle(aio_config[AIO_BLOCK_SIZE], aio_config[AIO_QUEUE_DEPTH], + aio_config[AIO_SINGLE_SUBMIT], aio_config[AIO_OVERLAP_EVENTS], aio_config[AIO_THREAD_COUNT]) - self.read_aio_handle = aio_op.aio_handle(aio_config[AIO_BLOCK_SIZE], - aio_config[AIO_QUEUE_DEPTH], - aio_config[AIO_SINGLE_SUBMIT], - aio_config[AIO_OVERLAP_EVENTS], + self.read_aio_handle = aio_op.aio_handle(aio_config[AIO_BLOCK_SIZE], aio_config[AIO_QUEUE_DEPTH], + aio_config[AIO_SINGLE_SUBMIT], aio_config[AIO_OVERLAP_EVENTS], aio_config[AIO_THREAD_COUNT]) # Overlap gradient swap out @@ -93,42 +71,25 @@ class PipelinedOptimizerSwapper(OptimizerSwapper): self.async_swap_in = swap_config.pipeline_read self.async_swap_out = swap_config.pipeline_write - self.swap_ops = { - SYNC_SWAP_IN: None, - ASYNC_SWAP_IN: None, - SYNC_SWAP_OUT: None, - ASYNC_SWAP_OUT: None - } + self.swap_ops = {SYNC_SWAP_IN: None, ASYNC_SWAP_IN: None, SYNC_SWAP_OUT: None, ASYNC_SWAP_OUT: None} self.print_exclude_list += [ - 'gradient_swapper', - 'read_aio_handle', - 'write_aio_handle', - 'swap_ops', - 'print_exclude_list' + 'gradient_swapper', 'read_aio_handle', 'write_aio_handle', 'swap_ops', 'print_exclude_list' ] if dist.get_rank() == 0: - print_object(obj=self, - name='PipelinedOptimizerSwapper', - exclude_list=self.print_exclude_list) + print_object(obj=self, name='PipelinedOptimizerSwapper', exclude_list=self.print_exclude_list) def initialize_parameters(self, parameters, src_tensors): - self._initialize_parameters(parameters=parameters, - src_tensors=src_tensors, - aio_handle=self.write_aio_handle) - - def initialize_from_swapped_fp16_params(self, - fp16_partitions_info, - fp16_num_elems, - fp16_pinned_buffers, + self._initialize_parameters(parameters=parameters, src_tensors=src_tensors, aio_handle=self.write_aio_handle) + + def initialize_from_swapped_fp16_params(self, fp16_partitions_info, fp16_num_elems, fp16_pinned_buffers, fp32_parameters): - self._initialize_from_swapped_fp16_params( - aio_handle=self.write_aio_handle, - fp16_partitions_info=fp16_partitions_info, - fp16_num_elems=fp16_num_elems, - fp16_pinned_buffers=fp16_pinned_buffers, - fp32_parameters=fp32_parameters) + self._initialize_from_swapped_fp16_params(aio_handle=self.write_aio_handle, + 
fp16_partitions_info=fp16_partitions_info, + fp16_num_elems=fp16_num_elems, + fp16_pinned_buffers=fp16_pinned_buffers, + fp32_parameters=fp32_parameters) def flush_gradients(self): self._flush_gradient_swapper(self.gradient_swapper) @@ -146,18 +107,16 @@ class PipelinedOptimizerSwapper(OptimizerSwapper): self.swap_ops[SYNC_SWAP_IN] = self.swap_ops[ASYNC_SWAP_IN] self.swap_ops[ASYNC_SWAP_IN] = None else: - self.swap_ops[SYNC_SWAP_IN] = self._swap_in_optimizer_state( - aio_handle=self.read_aio_handle, - parameter=parameter) + self.swap_ops[SYNC_SWAP_IN] = self._swap_in_optimizer_state(aio_handle=self.read_aio_handle, + parameter=parameter) if self.swap_ops[SYNC_SWAP_IN]: self.swap_ops[SYNC_SWAP_IN].wait() if self.async_swap_in and async_parameter is not None: assert self.swap_ops[ASYNC_SWAP_IN] is None - self.swap_ops[ASYNC_SWAP_IN] = self._swap_in_optimizer_state( - aio_handle=self.read_aio_handle, - parameter=async_parameter) + self.swap_ops[ASYNC_SWAP_IN] = self._swap_in_optimizer_state(aio_handle=self.read_aio_handle, + parameter=async_parameter) self._stop_timer(SWAP_IN_STATE_TIMER) self.timer_names.add(SWAP_IN_STATE_TIMER) @@ -209,10 +168,9 @@ class PipelinedOptimizerSwapper(OptimizerSwapper): unpinned_tensors = param_info.get_unpinned_state_tensors() if len(unpinned_tensors) > 0: - new_alloc_buffers = self.swap_buffer_manager.allocate( - num_elems=self._io_aligned_numel(param_info.numel()), - count=len(unpinned_tensors), - dtype=param_info.dtype()) + new_alloc_buffers = self.swap_buffer_manager.allocate(num_elems=self._io_aligned_numel(param_info.numel()), + count=len(unpinned_tensors), + dtype=param_info.dtype()) assert new_alloc_buffers is not None allocated_buffers += new_alloc_buffers @@ -241,13 +199,11 @@ class PipelinedOptimizerSwapper(OptimizerSwapper): if param_info is None: return None - required_buffer_count = len( - param_info.tensors) + (1 if param_info.has_gradients() else 0) + required_buffer_count = len(param_info.tensors) + (1 if param_info.has_gradients() else 0) aligned_numel = self._io_aligned_numel(param_info.numel()) - allocated_buffers = self.swap_buffer_manager.allocate( - num_elems=aligned_numel, - count=required_buffer_count, - dtype=parameter.dtype) + allocated_buffers = self.swap_buffer_manager.allocate(num_elems=aligned_numel, + count=required_buffer_count, + dtype=parameter.dtype) assert allocated_buffers is not None, \ f"PipelinedOptimizerSwapper ran out of swap buffers, try increasing 'buffer_count'" @@ -266,8 +222,7 @@ class PipelinedOptimizerSwapper(OptimizerSwapper): swap_in_tensors(aio_handle, swap_buffers, swap_paths) if param_info.unswapped_gradients: - self._retrieve_unswapped_grad_partitions(swap_info=param_info, - dest_buffer=parameter.grad) + self._retrieve_unswapped_grad_partitions(swap_info=param_info, dest_buffer=parameter.grad) swap_in_op = OptimizerSwapOp(aio_handle=aio_handle, param_info=param_info, diff --git a/deepspeed/runtime/swap_tensor/utils.py b/deepspeed/runtime/swap_tensor/utils.py index 1d236bd7fcacbb3f6fc7d381475f42e2051ed058..50a88f74351aa82d365964331bb32c2959f238a1 100644 --- a/deepspeed/runtime/swap_tensor/utils.py +++ b/deepspeed/runtime/swap_tensor/utils.py @@ -1,7 +1,8 @@ -""" -Copyright 2020 The Microsoft DeepSpeed Team -Licensed under the MIT license. +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 +# DeepSpeed Team +""" Functionality of swapping tensors to/from (NVMe) storage devices. 
""" @@ -34,6 +35,7 @@ def print_object(obj, name, exclude_list=[]): class SwapBuffer(object): + def __init__(self, buffer): self.buffer = buffer self.reset() @@ -92,6 +94,7 @@ class SwapBuffer(object): class SwapBufferPool(object): + def __init__(self, buffers): assert all([buf.is_pinned() for buf in buffers]) self.buffers = [SwapBuffer(buf) for buf in buffers] @@ -175,20 +178,17 @@ class SwapBufferPool(object): class SwapBufferManager(object): + def __init__(self, num_elems, count, dtype): self.num_elems = num_elems self.count = count self.dtype = dtype self.all_buffers = [ - get_accelerator().pin_memory( - torch.zeros(num_elems, - device='cpu', - dtype=dtype)) for _ in range(count) + get_accelerator().pin_memory(torch.zeros(num_elems, device='cpu', dtype=dtype)) for _ in range(count) ] self.free_buffer_index = [i for i in range(count)] self.used_buffer_index = {} - self.gigabytes = (self.all_buffers[0].element_size() * num_elems * count) / (1024 - **3) + self.gigabytes = (self.all_buffers[0].element_size() * num_elems * count) / (1024**3) if dist.get_rank() == 0: exclude_list = ['all_buffers'] @@ -211,9 +211,7 @@ class SwapBufferManager(object): return buffers def allocate_all(self, num_elems, dtype): - return self.allocate(num_elems=num_elems, - count=len(self.free_buffer_index), - dtype=dtype) + return self.allocate(num_elems=num_elems, count=len(self.free_buffer_index), dtype=dtype) def free(self, buffers): buffer_ids = [] diff --git a/deepspeed/runtime/utils.py b/deepspeed/runtime/utils.py index 30dad84b16d1eb2b789fad5f539d908397e6e1c1..ffb09677f04684e142e88fe4a41922826eea4b7a 100755 --- a/deepspeed/runtime/utils.py +++ b/deepspeed/runtime/utils.py @@ -1,10 +1,12 @@ -''' -Copyright 2019 The Microsoft DeepSpeed Team +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 +# DeepSpeed Team +""" Copyright NVIDIA/Megatron Helper functions and classes from multiple sources. 
-''' +""" from collections.abc import Iterable from deepspeed.moe.utils import is_moe_param @@ -19,15 +21,18 @@ import torch from deepspeed import comm as dist try: - from torch._six import inf as inf + from torch._six import inf except ModuleNotFoundError: - from torch import inf as inf + from torch import inf from deepspeed.utils import groups, logger from deepspeed.runtime.constants import PIPE_REPLICATED from numpy import prod from deepspeed.accelerator import get_accelerator +from deepspeed.module_inject.policy import transpose +from torch.nn import functional as F + torch_memory_reserved = get_accelerator().memory_reserved torch_max_memory_reserved = get_accelerator().max_memory_reserved @@ -37,6 +42,7 @@ class DummyOptim(): Dummy optimizer presents model parameters as a param group, this is primarily used to allow ZeRO-3 without an optimizer """ + def __init__(self, params): self.param_groups = [] self.param_groups.append({'params': params}) @@ -169,11 +175,8 @@ def move_to_device(item, device, criterion_func): class CheckOverflow(object): '''Checks for overflow in gradient across parallel process''' - def __init__(self, - param_groups=None, - mpu=None, - zero_reduce_scatter=False, - deepspeed=None): + + def __init__(self, param_groups=None, mpu=None, zero_reduce_scatter=False, deepspeed=None): self.mpu = mpu self.params = [] if param_groups else None self.zero_reduce_scatter = zero_reduce_scatter @@ -196,13 +199,9 @@ class CheckOverflow(object): # an overflow due to expert weights, we detect it # Only need to check groups.get_largest_expert_parallel_group() - dist.all_reduce(overflow_gpu, - op=dist.ReduceOp.MAX, - group=groups._get_max_expert_parallel_group()) + dist.all_reduce(overflow_gpu, op=dist.ReduceOp.MAX, group=groups._get_max_expert_parallel_group()) if self.mpu is not None: - dist.all_reduce(overflow_gpu, - op=dist.ReduceOp.MAX, - group=self.mpu.get_model_parallel_group()) + dist.all_reduce(overflow_gpu, op=dist.ReduceOp.MAX, group=self.mpu.get_model_parallel_group()) elif reduce_overflow: dist.all_reduce(overflow_gpu, op=dist.ReduceOp.MAX) dist.barrier() @@ -247,31 +246,18 @@ class CheckOverflow(object): if has_moe_params: # All reduce this across expert_parallel_group, so that if an expert # overflows, we detect it here - dist.all_reduce(overflow_gpu, - op=dist.ReduceOp.MAX, - group=groups._get_max_expert_parallel_group()) + dist.all_reduce(overflow_gpu, op=dist.ReduceOp.MAX, group=groups._get_max_expert_parallel_group()) if self.zero_reduce_scatter: - dist.all_reduce(overflow_gpu, - op=dist.ReduceOp.MAX, - group=dist.get_world_group()) + dist.all_reduce(overflow_gpu, op=dist.ReduceOp.MAX, group=dist.get_world_group()) elif self.mpu is not None: if self.deepspeed is not None: - using_pipeline = hasattr(self.deepspeed, - 'pipeline_enable_backward_allreduce') - if (using_pipeline - and self.deepspeed.pipeline_enable_backward_allreduce is False - ) or (not using_pipeline - and self.deepspeed.enable_backward_allreduce is False): - dist.all_reduce(overflow_gpu, - op=dist.ReduceOp.MAX, - group=self.mpu.get_data_parallel_group()) - dist.all_reduce(overflow_gpu, - op=dist.ReduceOp.MAX, - group=self.mpu.get_model_parallel_group()) + using_pipeline = hasattr(self.deepspeed, 'pipeline_enable_backward_allreduce') + if (using_pipeline and self.deepspeed.pipeline_enable_backward_allreduce is False) or ( + not using_pipeline and self.deepspeed.enable_backward_allreduce is False): + dist.all_reduce(overflow_gpu, op=dist.ReduceOp.MAX, group=self.mpu.get_data_parallel_group()) + 
dist.all_reduce(overflow_gpu, op=dist.ReduceOp.MAX, group=self.mpu.get_model_parallel_group()) elif self.deepspeed is not None and self.deepspeed.enable_backward_allreduce is False: - dist.all_reduce(overflow_gpu, - op=dist.ReduceOp.MAX, - group=dist.get_world_group()) + dist.all_reduce(overflow_gpu, op=dist.ReduceOp.MAX, group=dist.get_world_group()) overflow = overflow_gpu[0].item() return bool(overflow) @@ -308,9 +294,7 @@ def _handle_overflow(cpu_sum, x, i): if not math.isfinite(float(v)): t_i = v_i break - logger.info( - f"rank {rank} detected overflow {cpu_sum} in tensor {i}:{t_i} shape {x.shape}" - ) + logger.info(f"rank {rank} detected overflow {cpu_sum} in tensor {i}:{t_i} shape {x.shape}") def get_global_norm(norm_list): @@ -319,6 +303,7 @@ def get_global_norm(norm_list): total_norm = 0.0 for norm in norm_list: total_norm += norm**2.0 + # logger.info(f'norm_list = {norm_list} global = {sqrt(total_norm)}') return sqrt(total_norm) @@ -353,16 +338,13 @@ def clip_grad_norm_(parameters, max_norm, norm_type=2, mpu=None): total_norm_cuda = get_accelerator().FloatTensor([float(total_norm)]) # Take max across all GPUs. if mpu is not None: - dist.all_reduce(total_norm_cuda, - op=dist.ReduceOp.MAX, - group=mpu.get_model_parallel_group()) + dist.all_reduce(total_norm_cuda, op=dist.ReduceOp.MAX, group=mpu.get_model_parallel_group()) total_norm = total_norm_cuda[0].item() else: total_norm = 0 for p in parameters: if mpu is not None: - if (mpu.get_model_parallel_rank() - == 0) or is_model_parallel_parameter(p): + if (mpu.get_model_parallel_rank() == 0) or is_model_parallel_parameter(p): param_norm = p.grad.data.norm(norm_type) total_norm += param_norm.item()**norm_type else: @@ -372,9 +354,7 @@ def clip_grad_norm_(parameters, max_norm, norm_type=2, mpu=None): # Sum across all model parallel GPUs. total_norm_cuda = get_accelerator().FloatTensor([float(total_norm)]) if mpu is not None: - dist.all_reduce(total_norm_cuda, - op=dist.ReduceOp.SUM, - group=mpu.get_model_parallel_group()) + dist.all_reduce(total_norm_cuda, op=dist.ReduceOp.SUM, group=mpu.get_model_parallel_group()) total_norm = total_norm_cuda[0].item()**(1. / norm_type) # Need to average total_norm across different GPUs due to the presence of moe params @@ -419,9 +399,7 @@ def get_grad_norm(parameters, norm_type=2, mpu=None): total_norm_cuda = get_accelerator().FloatTensor([float(total_norm)]) # Take max across all GPUs. if mpu is not None: - dist.all_reduce(total_norm_cuda, - op=dist.ReduceOp.MAX, - group=mpu.get_model_parallel_group()) + dist.all_reduce(total_norm_cuda, op=dist.ReduceOp.MAX, group=mpu.get_model_parallel_group()) total_norm = total_norm_cuda[0].item() else: total_norm = 0. @@ -442,13 +420,10 @@ def get_grad_norm(parameters, norm_type=2, mpu=None): # Sum across all model parallel GPUs. total_norm_cuda = get_accelerator().FloatTensor([float(total_norm)]) if mpu is not None: - dist.all_reduce(total_norm_cuda, - op=dist.ReduceOp.SUM, - group=mpu.get_model_parallel_group()) + dist.all_reduce(total_norm_cuda, op=dist.ReduceOp.SUM, group=mpu.get_model_parallel_group()) total_norm = total_norm_cuda[0].item()**(1. / norm_type) - if total_norm == float( - 'inf') or total_norm == -float('inf') or total_norm != total_norm: + if total_norm == float('inf') or total_norm == -float('inf') or total_norm != total_norm: total_norm = -1 return total_norm @@ -488,9 +463,7 @@ def get_grad_zeros(parameters, mpu=None): # Sum across all model parallel GPUs. 
total_zeros_cuda = get_accelerator().FloatTensor([float(total_zeros)]) if mpu is not None: - dist.all_reduce(total_zeros_cuda, - op=dist.ReduceOp.SUM, - group=mpu.get_model_parallel_group()) + dist.all_reduce(total_zeros_cuda, op=dist.ReduceOp.SUM, group=mpu.get_model_parallel_group()) total_zeros = total_zeros_cuda[0].item() return total_zeros @@ -522,9 +495,7 @@ def get_weight_norm(parameters, norm_type=2, mpu=None): total_norm_cuda = get_accelerator().FloatTensor([float(total_norm)]) # Take max across all GPUs. if mpu is not None: - dist.all_reduce(total_norm_cuda, - op=dist.ReduceOp.MAX, - group=mpu.get_model_parallel_group()) + dist.all_reduce(total_norm_cuda, op=dist.ReduceOp.MAX, group=mpu.get_model_parallel_group()) total_norm = total_norm_cuda[0].item() else: total_norm = 0. @@ -545,13 +516,10 @@ def get_weight_norm(parameters, norm_type=2, mpu=None): # Sum across all model parallel GPUs. total_norm_cuda = get_accelerator().FloatTensor([float(total_norm)]) if mpu is not None: - dist.all_reduce(total_norm_cuda, - op=dist.ReduceOp.SUM, - group=mpu.get_model_parallel_group()) + dist.all_reduce(total_norm_cuda, op=dist.ReduceOp.SUM, group=mpu.get_model_parallel_group()) total_norm = total_norm_cuda[0].item()**(1. / norm_type) - if total_norm == float( - 'inf') or total_norm == -float('inf') or total_norm != total_norm: + if total_norm == float('inf') or total_norm == -float('inf') or total_norm != total_norm: total_norm = -1 return total_norm @@ -603,11 +571,7 @@ def _lprobe(weights, num_parts, bottleneck): step += chunksize # Find the end index of partition p - parts[p] = bisect_left(weights, - bsum, - lo=step - chunksize, - hi=min(step, - num_items)) + parts[p] = bisect_left(weights, bsum, lo=step - chunksize, hi=min(step, num_items)) # Nothing more to partition, return early if parts[p] == num_items: # See if the current partition is overweight. @@ -655,6 +619,7 @@ def partition_balanced(weights, num_parts, eps=1e-3): class PartitionedTensor: + def __init__(self, tensor, group, partition_meta=None): super().__init__() @@ -696,10 +661,7 @@ class PartitionedTensor: partition = partition_uniform(num_items=tensor.numel(), num_parts=self.num_parts) start = partition[self.rank] length = partition[self.rank + 1] - start - tensor_part = tensor.detach().contiguous().view(-1).narrow( - 0, - start=start, - length=length).clone() + tensor_part = tensor.detach().contiguous().view(-1).narrow(0, start=start, length=length).clone() return tensor_part, partition @@ -709,9 +671,7 @@ class PartitionedTensor: # Allocate the full tensor as a flat buffer. 
full_numel = prod(self.full_size()) - flat_tensor = torch.zeros([full_numel], - dtype=self.local_data.dtype, - device=device) + flat_tensor = torch.zeros([full_numel], dtype=self.local_data.dtype, device=device) # Prepare all-gather buffer partition_tensors = [] @@ -723,9 +683,7 @@ class PartitionedTensor: partition_tensors.append(buf) # Collect the full tensor - dist.all_gather(partition_tensors, - partition_tensors[self.rank], - group=self.group) + dist.all_gather(partition_tensors, partition_tensors[self.rank], group=self.group) for i in range(len(partition_tensors)): partition_tensors[i].data = torch.zeros(1) @@ -798,12 +756,9 @@ def memory_status(msg, print_rank=-1, reset_max=False): max_cached /= 1024**3 print( - f'RANK={rank} MEMSTATS', - msg, - f'device={get_accelerator().current_device_name()} ' + f'RANK={rank} MEMSTATS', msg, f'device={get_accelerator().current_device_name()} ' f'current alloc={new_alloced:0.4f}GB (delta={delta_alloced:0.4f}GB max={max_alloced:0.4f}GB) ' - f'current cache={new_cached:0.4f}GB (delta={delta_cached:0.4f}GB max={max_cached:0.4f}GB)' - ) + f'current cache={new_cached:0.4f}GB (delta={delta_cached:0.4f}GB max={max_cached:0.4f}GB)') def get_ma_status(): @@ -814,6 +769,7 @@ def get_ma_status(): def empty_cache(): get_accelerator().empty_cache() + get_accelerator().reset_peak_memory_stats() def see_memory_usage(message, force=False): @@ -827,16 +783,14 @@ def see_memory_usage(message, force=False): # Print message except when distributed but not rank 0 logger.info(message) - logger.info( - f"MA {round(get_accelerator().memory_allocated() / (1024 * 1024 * 1024),2 )} GB \ + logger.info(f"MA {round(get_accelerator().memory_allocated() / (1024 * 1024 * 1024),2 )} GB \ Max_MA {round(get_accelerator().max_memory_allocated() / (1024 * 1024 * 1024),2)} GB \ CA {round(torch_memory_reserved() / (1024 * 1024 * 1024),2)} GB \ Max_CA {round(torch_max_memory_reserved() / (1024 * 1024 * 1024))} GB ") vm_stats = psutil.virtual_memory() used_GB = round(((vm_stats.total - vm_stats.available) / (1024**3)), 2) - logger.info( - f'CPU Virtual Memory: used = {used_GB} GB, percent = {vm_stats.percent}%') + logger.info(f'CPU Virtual Memory: used = {used_GB} GB, percent = {vm_stats.percent}%') # get the peak memory to report correct data, so reset the counter for the next call get_accelerator().reset_peak_memory_stats() @@ -915,32 +869,22 @@ def get_global_norm_of_tensors(input_tensors, norm_type=2, mpu=None): total_norm = max(t.data.abs().max() for t in input_tensors) total_norm_cuda = get_accelerator().FloatTensor([float(total_norm)]) if mpu is not None: - dist.all_reduce(total_norm_cuda, - op=dist.ReduceOp.MAX, - group=mpu.get_model_parallel_group()) + dist.all_reduce(total_norm_cuda, op=dist.ReduceOp.MAX, group=mpu.get_model_parallel_group()) total_norm = total_norm_cuda[0].item() else: - total_norm = sum( - [t.data.float().norm(norm_type).item()**norm_type for t in input_tensors]) + total_norm = sum([t.data.float().norm(norm_type).item()**norm_type for t in input_tensors]) total_norm_cuda = get_accelerator().FloatTensor([float(total_norm)]) if mpu is not None: - dist.all_reduce(total_norm_cuda, - op=dist.ReduceOp.SUM, - group=mpu.get_model_parallel_group()) + dist.all_reduce(total_norm_cuda, op=dist.ReduceOp.SUM, group=mpu.get_model_parallel_group()) total_norm = total_norm_cuda[0].item()**(1. 
/ norm_type) - if total_norm == float( - 'inf') or total_norm == -float('inf') or total_norm != total_norm: + if total_norm == float('inf') or total_norm == -float('inf') or total_norm != total_norm: total_norm = -1 return total_norm -def clip_tensors_by_global_norm(input_tensors, - max_norm=1.0, - global_norm=None, - mpu=None, - eps=1e-6): +def clip_tensors_by_global_norm(input_tensors, max_norm=1.0, global_norm=None, mpu=None, eps=1e-6): """Clip list of tensors by global norm. Args: input_tensors: List of tensors to be clipped @@ -968,9 +912,7 @@ def align_dense_tensors(tensor_list, alignment): if remaining: elements_to_add = alignment - remaining - pad_tensor = torch.zeros(elements_to_add, - device=tensor_list[0].device, - dtype=tensor_list[0].dtype) + pad_tensor = torch.zeros(elements_to_add, device=tensor_list[0].device, dtype=tensor_list[0].dtype) padded_tensor_list = tensor_list + [pad_tensor] else: padded_tensor_list = tensor_list @@ -978,19 +920,13 @@ def align_dense_tensors(tensor_list, alignment): return padded_tensor_list -def all_gather_dp_groups(partitioned_param_groups, - dp_process_group, - start_alignment_factor, - allgather_bucket_size): +def all_gather_dp_groups(partitioned_param_groups, dp_process_group, start_alignment_factor, allgather_bucket_size): for group_id, partitioned_params in enumerate(partitioned_param_groups): # Sequential AllGather Best of both worlds partition_id = dist.get_rank(group=dp_process_group[group_id]) dp_world_size = dist.get_world_size(group=dp_process_group[group_id]) - num_shards = max( - 1, - partitioned_params[partition_id].numel() * dp_world_size // - allgather_bucket_size) + num_shards = max(1, partitioned_params[partition_id].numel() * dp_world_size // allgather_bucket_size) shard_size = partitioned_params[partition_id].numel() // num_shards @@ -1004,16 +940,36 @@ def all_gather_dp_groups(partitioned_param_groups, for shard_id in range(num_shards): if shard_id == (num_shards - 1): - num_elements = partitioned_params[partition_id].numel( - ) - shard_id * shard_size + num_elements = partitioned_params[partition_id].numel() - shard_id * shard_size shard_list = [] for dp_id in range(dp_world_size): - curr_shard = partitioned_params[dp_id].narrow(0, - shard_id * shard_size, - num_elements).detach() + curr_shard = partitioned_params[dp_id].narrow(0, shard_id * shard_size, num_elements).detach() shard_list.append(curr_shard) - dist.all_gather(shard_list, - shard_list[partition_id], - dp_process_group[group_id]) + dist.all_gather(shard_list, shard_list[partition_id], dp_process_group[group_id]) + + +class TLinear(torch.nn.Linear): + + def __init__(self, orig_layer, name=""): + self.name = name + super().__init__(orig_layer.weight.shape[1], orig_layer.weight.shape[0], bias=(orig_layer.bias is not None)) + self.weight.data = transpose(orig_layer.weight.data) + self.bias = orig_layer.bias + self._fwd_func = self._fwd_bias_add if self.bias is not None else self._fwd + + def _fwd(self, input): + return F.linear(input, self.weight) + + def _fwd_bias_add(self, input): + return F.linear(input, self.weight, bias=self.bias) + + def forward(self, input): + return self._fwd_func(input) + + +def get_inactive_params(param_list): + from deepspeed.runtime.zero.partition_parameters import ZeroParamStatus + return [param for param in param_list if (hasattr(param, 'ds_id') and \ + param.ds_status == ZeroParamStatus.NOT_AVAILABLE)] diff --git a/deepspeed/runtime/weight_quantizer.py b/deepspeed/runtime/weight_quantizer.py index 
f88a9310ba32497867ab9d1e05e2c4485a01204f..95d737614e594ee200d13617654714d6ae9d26f6 100644 --- a/deepspeed/runtime/weight_quantizer.py +++ b/deepspeed/runtime/weight_quantizer.py @@ -1,4 +1,7 @@ -'''Copyright The Microsoft DeepSpeed Team''' +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team import torch from ..module_inject.replace_policy import HFBertLayerPolicy, replace_policies @@ -6,6 +9,7 @@ from deepspeed.accelerator import get_accelerator class WeightQuantization(object): + def __init__(self, mlp_extra_grouping=True, mp_size=1): self.dense_scales = [] self.qkv_scales = [] @@ -20,8 +24,7 @@ class WeightQuantization(object): data_scale = [float(1 << quantize_bits) / (2 * mx + 1e-5) for mx in max_d] data_int = [(g * s) for g, s in zip(data_groups, data_scale)] data_int = [ - di.round().clamp(-(1 << (quantize_bits - 1)), - (((1 << (quantize_bits - 1)) - 1))) for di in data_int + di.round().clamp(-(1 << (quantize_bits - 1)), (((1 << (quantize_bits - 1)) - 1))) for di in data_int ] data_int = torch.cat(data_int).reshape(data.shape) data_int = data_int.to(torch.int8) @@ -37,8 +40,7 @@ class WeightQuantization(object): (self.mp_size * data.shape[1]) / data.shape[0] == 3) def Quantize(self, value_list, quantize_bits, groups, key, merge_dim=0): - if self.mlp_extra_grouping and self.is_mlp(value_list[0], - merge_count=len(value_list)): + if self.mlp_extra_grouping and self.is_mlp(value_list[0], merge_count=len(value_list)): groups *= 2 q_scale = [] index = 0 @@ -47,11 +49,8 @@ class WeightQuantization(object): q_scale.append(data_scale) value_list[index] = data_int index += 1 - q_scale = ( - 1 / - torch.cat(q_scale, - dim=merge_dim).to( - get_accelerator().current_device_name()).view(-1).unsqueeze(0)) + q_scale = (1 / + torch.cat(q_scale, dim=merge_dim).to(get_accelerator().current_device_name()).view(-1).unsqueeze(0)) if "mlp.dense_4h_to_h.weight" in key: self.mlp4hh_scales.append(q_scale) elif "mlp.dense_h_to_4h.weight" in key: @@ -65,10 +64,7 @@ class WeightQuantization(object): def merge_layer_scales(self, layer_scales): max_dim = max([s.shape[-1] for s in layer_scales]) layer_scales = [ - torch.cat((s, - torch.zeros((1, - max_dim - s.shape[-1]), - device=get_accelerator().current_device_name())), + torch.cat((s, torch.zeros((1, max_dim - s.shape[-1]), device=get_accelerator().current_device_name())), dim=-1) if s.shape[-1] < max_dim else s for s in layer_scales ] return torch.cat(layer_scales).unsqueeze(0) @@ -77,11 +73,7 @@ class WeightQuantization(object): all_scales = [] for dense_scale, qkv_scale, m4hh_scale, mh4h_scale in \ zip(self.dense_scales, self.qkv_scales, self.mlp4hh_scales, self.mlph4h_scales): - all_scales.append( - self.merge_layer_scales([qkv_scale, - dense_scale, - mh4h_scale, - m4hh_scale])) + all_scales.append(self.merge_layer_scales([qkv_scale, dense_scale, mh4h_scale, m4hh_scale])) return torch.cat(all_scales) def merge_scales_split(self, split_count): @@ -95,13 +87,8 @@ class WeightQuantization(object): for s in range(split_count): all_scales[s].append( torch.cat([ - torch.cat((qkv_scale[s], - torch.zeros_like(qkv_scale[s])), - dim=1), - torch.cat((dense_scale[s], - torch.zeros_like(dense_scale[s])), - dim=1), - mh4h_scale[s], + torch.cat((qkv_scale[s], torch.zeros_like(qkv_scale[s])), dim=1), + torch.cat((dense_scale[s], torch.zeros_like(dense_scale[s])), dim=1), mh4h_scale[s], m4hh_scale[s] ]).unsqueeze(0)) for scales_a in all_scales: @@ -139,8 +126,7 @@ class WeightQuantization(object): else: data_quantized, 
data_scale = self.quantize_data(keys[key], quantize_bits, groups) keys[key].copy_(data_quantized) - layer_scales.append((1 / data_scale.to( - get_accelerator().current_device_name()).view(-1).unsqueeze(0))) + layer_scales.append((1 / data_scale.to(get_accelerator().current_device_name()).view(-1).unsqueeze(0))) all_scales.append(self.merge_layer_scales(layer_scales)) return layer diff --git a/deepspeed/runtime/zero/__init__.py b/deepspeed/runtime/zero/__init__.py index bd881c8ec69fff6cb9a7f35d8b8d10639bfb9057..1ccca09a9e69806c6e03e94b5672b106cae6fc20 100644 --- a/deepspeed/runtime/zero/__init__.py +++ b/deepspeed/runtime/zero/__init__.py @@ -1,7 +1,7 @@ -""" -"Copyright 2020 The Microsoft DeepSpeed Team. -Licensed under the MIT license. -""" +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team from .partition_parameters import ZeroParamType from .partition_parameters import ZeroParamStatus @@ -11,3 +11,5 @@ from .partition_parameters import register_external_parameter from .tiling import TiledLinear from .tiling import TiledLinearReturnBias + +from .mics import MiCS_Init diff --git a/deepspeed/runtime/zero/config.py b/deepspeed/runtime/zero/config.py index d7ab552188b80d924ae96e597c197c77b4cdb49e..30d4ea3d4698fa62ce2e7c4de2639f23a20e4fb3 100644 --- a/deepspeed/runtime/zero/config.py +++ b/deepspeed/runtime/zero/config.py @@ -1,8 +1,7 @@ -'''Copyright The Microsoft DeepSpeed Team''' -""" -Copyright (c) Microsoft Corporation -Licensed under the MIT license. -""" +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team from pydantic import Field, validator import sys @@ -35,7 +34,8 @@ ZeRO optimization should be enabled as: "offload_param": {...}, "offload_optimizer": {...}, "ignore_unused_parameters": [true|false], - "round_robin_gradients": [true|false] + "round_robin_gradients": [true|false], + "memory_efficient_linear": [true|false] } } """ @@ -47,13 +47,10 @@ def read_zero_config_deprecated(param_dict): zero_config_dict = {} zero_config_dict["stage"] = 1 if param_dict[ZERO_OPTIMIZATION] else 0 if zero_config_dict["stage"] > 0: - zero_config_dict["allgather_bucket_size"] = get_scalar_param( - param_dict, - "allgather_size", - 5e8) + zero_config_dict["allgather_bucket_size"] = get_scalar_param(param_dict, "allgather_size", 5e8) logger.warning( - "DeepSpeedConfig: this format of ZeRO optimization setup is deprecated. Please use the following format: {}" - .format(ZERO_FORMAT)) + "DeepSpeedConfig: this format of ZeRO optimization setup is deprecated. Please use the following format: {}". 
+ format(ZERO_FORMAT)) return zero_config_dict @@ -161,9 +158,7 @@ class DeepSpeedZeroConfig(DeepSpeedConfigModel): None, deprecated=True, new_param="offload_param", - new_param_fn=( - lambda val: DeepSpeedZeroOffloadParamConfig(device=OffloadDeviceEnum.cpu) - if val else None), + new_param_fn=(lambda val: DeepSpeedZeroOffloadParamConfig(device=OffloadDeviceEnum.cpu) if val else None), ) """ Deprecated, please use ``offload_param`` """ @@ -179,31 +174,24 @@ class DeepSpeedZeroConfig(DeepSpeedConfigModel): None, deprecated=True, new_param="offload_optimizer", - new_param_fn=( - lambda val: DeepSpeedZeroOffloadOptimizerConfig(device=OffloadDeviceEnum.cpu) - if val else None), + new_param_fn=(lambda val: DeepSpeedZeroOffloadOptimizerConfig(device=OffloadDeviceEnum.cpu) if val else None), ) """ Deprecated, please use ``offload_optimizer`` """ - prefetch_bucket_size: int = Field(pp_int(5e7), - ge=0, - alias="stage3_prefetch_bucket_size") + prefetch_bucket_size: int = Field(pp_int(5e7), ge=0, alias="stage3_prefetch_bucket_size") """ Maximum number of parameter elements to fetch ahead of use. Used by ZeRO3, ZeRO3-Offload, ZeRO-Infinity, and ZeRO-Inference. """ - param_persistence_threshold: int = Field(pp_int(1e5), - ge=0, - alias="stage3_param_persistence_threshold") + param_persistence_threshold: int = Field(pp_int(1e5), ge=0, alias="stage3_param_persistence_threshold") """ Do not partition parameters smaller than this threshold. Smaller values use less memory, but can greatly increase communication (especially latency-bound messages). """ - model_persistence_threshold: int = Field(pp_int(sys.maxsize, - "sys.maxsize"), + model_persistence_threshold: int = Field(pp_int(sys.maxsize, "sys.maxsize"), ge=0, alias="stage3_model_persistence_threshold") """ @@ -213,9 +201,7 @@ class DeepSpeedZeroConfig(DeepSpeedConfigModel): ZeRO3-Offload, ZeRO-Infinity and ZeRO-Inference. """ - max_live_parameters: int = Field(pp_int(1e9), - ge=0, - alias="stage3_max_live_parameters") + max_live_parameters: int = Field(pp_int(1e9), ge=0, alias="stage3_max_live_parameters") """ The maximum number of parameters resident per GPU before releasing. Smaller values use less memory, but perform more communication. @@ -227,9 +213,7 @@ class DeepSpeedZeroConfig(DeepSpeedConfigModel): parameters. Smaller values use less memory, but perform more communication. """ - gather_16bit_weights_on_model_save: bool = Field( - False, - alias="stage3_gather_16bit_weights_on_model_save") + gather_16bit_weights_on_model_save: bool = Field(False, alias="stage3_gather_16bit_weights_on_model_save") """ Consolidate the weights before saving the model by ``save_16bit_model()``. Since the weights are partitioned across GPUs, they aren’t part of @@ -237,10 +221,9 @@ class DeepSpeedZeroConfig(DeepSpeedConfigModel): this option is enabled and then saves the fp16 model weights. """ - stage3_gather_fp16_weights_on_model_save: bool = Field( - False, - deprecated=True, - new_param="gather_16bit_weights_on_model_save") + stage3_gather_fp16_weights_on_model_save: bool = Field(False, + deprecated=True, + new_param="gather_16bit_weights_on_model_save") """ Deprecated, please use ``gather_16bit_weights_on_model_save`` """ ignore_unused_parameters: bool = True @@ -266,12 +249,18 @@ class DeepSpeedZeroConfig(DeepSpeedConfigModel): between optimizer steps) or GPU count (increased parallelism). 
""" + mics_shard_size: int = Field(-1, new_param="mics_shard_size") + + mics_hierarchical_params_gather: bool = False + memory_efficient_linear: bool = True + """ + Use memory efficient linear implementation, for Stage 3. + """ + # Validators @validator("overlap_comm") def overlap_comm_valid(cls, field_value, values): if field_value is None: - assert ( - "stage" in values - ), "DeepSpeedZeroConfig: 'stage' must be defined before 'overlap_comm'" + assert ("stage" in values), "DeepSpeedZeroConfig: 'stage' must be defined before 'overlap_comm'" field_value = values["stage"] == ZeroStageEnum.weights return field_value diff --git a/deepspeed/runtime/zero/constants.py b/deepspeed/runtime/zero/constants.py deleted file mode 100644 index af5c5f1953983966454cd161568ad04b20b15221..0000000000000000000000000000000000000000 --- a/deepspeed/runtime/zero/constants.py +++ /dev/null @@ -1,173 +0,0 @@ -""" -Copyright (c) Microsoft Corporation -Licensed under the MIT license. -""" - -from .offload_constants import * - -######################################### -# ZeRO optimization -######################################### -# ZeRO optimization. By default, this optimization is not enabled. -# Users have to configure the desired optimization (0 means disabled) in params.json as below example: -ZERO_FORMAT = ''' -ZeRO optimization should be enabled as: -"session_params": { - "zero_optimization": { - "stage": [0|1|2], - "stage3_max_live_parameters" : 1000000000, - "stage3_max_reuse_distance" : 1000000000, - "allgather_partitions": [true|false], - "allgather_bucket_size": 500000000, - "reduce_scatter": [true|false], - "contiguous_gradients" : [true|false] - "overlap_comm": [true|false], - "reduce_bucket_size": 500000000, - "load_from_fp32_weights": [true|false], - "cpu_offload": [true|false] (deprecated), - "cpu_offload_params" : [true|false] (deprecated), - "cpu_offload_use_pin_memory": [true|false] (deprecated), - "sub_group_size" : 1000000000000, - "offload_param": {...}, - "offload_optimizer": {...}, - "ignore_unused_parameters": [true|false], - "round_robin_gradients": [true|false] - } -} -''' - -ZERO_OPTIMIZATION = 'zero_optimization' -ZERO_OPTIMIZATION_DISABLED = 0 -ZERO_OPTIMIZATION_OPTIMIZER_STATES = 1 -ZERO_OPTIMIZATION_GRADIENTS = 2 -ZERO_OPTIMIZATION_WEIGHTS = 3 -MAX_STAGE_ZERO_OPTIMIZATION = ZERO_OPTIMIZATION_WEIGHTS - -ZERO_OPTIMIZATION_STAGE = 'stage' -ZERO_OPTIMIZATION_STAGE_1 = 'stage_1' -ZERO_OPTIMIZATION_STAGE_2 = 'stage_2' -ZERO_OPTIMIZATION_STAGE_3 = 'stage_3' - -ZERO_OPTIMIZATION_STAGE_DEFAULT = ZERO_OPTIMIZATION_DISABLED - -ZERO_OPTIMIZATION_ALLGATHER_PARTITIONS = 'allgather_partitions' -ZERO_OPTIMIZATION_ALLGATHER_PARTITIONS_DEFAULT = True - -ZERO_OPTIMIZATION_REDUCE_SCATTER = 'reduce_scatter' -ZERO_OPTIMIZATION_REDUCE_SCATTER_DEFAULT = True - -ZERO_OPTIMIZATION_OVERLAP_COMM = 'overlap_comm' -ZERO_OPTIMIZATION_OVERLAP_COMM_DEFAULT = False -ZERO3_OPTIMIZATION_OVERLAP_COMM_DEFAULT = True - -ZERO_OPTIMIZATION_CONTIGUOUS_GRADIENTS = 'contiguous_gradients' -ZERO_OPTIMIZATION_CONTIGUOUS_GRADIENTS_DEFAULT = True -ZERO3_OPTIMIZATION_CONTIGUOUS_GRADIENTS_DEFAULT = True - -ZERO_OPTIMIZATION_REDUCE_BUCKET_SIZE = 'reduce_bucket_size' -ZERO_OPTIMIZATION_REDUCE_BUCKET_SIZE_DEFAULT = 500000000 - -ZERO_OPTIMIZATION_ALLGATHER_BUCKET_SIZE = 'allgather_bucket_size' -ZERO_OPTIMIZATION_ALLGATHER_BUCKET_SIZE_DEFAULT = 500000000 -ZERO_OPTIMIZATION_ALLGATHER_BUCKET_SIZE_DEPRECATED = 'allgather_size' -ZERO_OPTIMIZATION_LOAD_FROM_FP32_WEIGHTS = 'load_from_fp32_weights' 
-ZERO_OPTIMIZATION_LOAD_FROM_FP32_WEIGHTS_DEFAULT = True - -ZERO_OPTIMIZATION_ELASTIC_CHECKPOINT = 'elastic_checkpoint' -ZERO_OPTIMIZATION_ELASTIC_CHECKPOINT_DEFAULT = False - -ZERO_OPTIMIZATION_CPU_OFFLOAD = 'cpu_offload' -ZERO_OPTIMIZATION_CPU_OFFLOAD_DEFAULT = False - -ZERO_OPTIMIZATION_CPU_OFFLOAD_PARAMS = 'cpu_offload_params' -ZERO_OPTIMIZATION_CPU_OFFLOAD_PARAMS_DEFAULT = False - -ZERO_OPTIMIZATION_CPU_OFFLOAD_USE_PIN_MEMORY = 'cpu_offload_use_pin_memory' -ZERO_OPTIMIZATION_CPU_OFFLOAD_USE_PIN_MEMORY_DEFAULT = False - -ZERO_OPTIMIZATION_OFFLOAD_PARAM = OFFLOAD_PARAM -ZERO_OPTIMIZATION_OFFLOAD_PARAM_DEFAULT = None - -ZERO_OPTIMIZATION_OFFLOAD_OPTIMIZER = OFFLOAD_OPTIMIZER -ZERO_OPTIMIZATION_OFFLOAD_OPTIMIZER_DEFAULT = None - -ZERO_OPTIMIZATION_SUB_GROUP_SIZE = 'sub_group_size' -ZERO_OPTIMIZATION_SUB_GROUP_SIZE_DEFAULT = 1000000000 - -#maximum number of parameters per GPU before releasing them -ZERO_OPTIMIZATION_MAX_LIVE_PARAMETERS = 'stage3_max_live_parameters' -ZERO_OPTIMIZATION_MAX_LIVE_PARAMETERS_DEFAULT = 1000000000 - -#release a parameter only if the reuse distance is larger than specified -ZERO_OPTIMIZATION_MAX_REUSE_DISTANCE = 'stage3_max_reuse_distance' -ZERO_OPTIMIZATION_MAX_REUSE_DISTANCE_DEFAULT = 1000000000 - -ZERO_OPTIMIZATION_PREFETCH_BUCKET_SIZE = 'stage3_prefetch_bucket_size' -ZERO_OPTIMIZATION_PREFETCH_BUCKET_SIZE_DEFAULT = 50000000 - -#parameters smaller than the threshold are only communicated once after the -#parameters are updated and are persisted throughout the training -#avoid tons of latency bound communication -ZERO_OPTIMIZATION_PARAM_PERSISTENCE_THRESHOLD = 'stage3_param_persistence_threshold' -ZERO_OPTIMIZATION_PARAM_PERSISTENCE_THRESHOLD_DEFAULT = 100000 - -# gathers params for saving a model - inefficient but is required in certain situations -ZERO_OPTIMIZATION_GATHER_FP16_WEIGHTS_ON_MODEL_SAVE = 'stage3_gather_fp16_weights_on_model_save' -ZERO_OPTIMIZATION_GATHER_16BIT_WEIGHTS_ON_MODEL_SAVE = 'stage3_gather_16bit_weights_on_model_save' -ZERO_OPTIMIZATION_GATHER_16BIT_WEIGHTS_ON_MODEL_SAVE_DEFAULT = False - -# Now just used in stage2 complete_grad_norm_calculation_for_cpu_offload -# Enable this option to avoid: -# https://github.com/microsoft/DeepSpeed/issues/707 -ZERO_OPTIMIZATION_IGNORE_UNUSED_PARAMETERS = 'ignore_unused_parameters' -ZERO_OPTIMIZATION_IGNORE_UNUSED_PARAMETERS_DEFAULT = True - -# Use deepspeed < v0.3.17 zero stage 1, kept for backwards compatibility reasons -ZERO_OPTIMIZATION_LEGACY_STAGE1 = "legacy_stage1" -ZERO_OPTIMIZATION_LEGACY_STAGE1_DEFAULT = False - -# Stage 2 - partition gradients in a round robin fashion to load-balance reduction and offload copying -ZERO_OPTIMIZATION_ROUND_ROBIN_GRADIENTS = 'round_robin_gradients' -ZERO_OPTIMIZATION_ROUND_ROBIN_GRADIENTS_DEFAULT = False - -#yapf: disable -ZERO_OPTIMIZATION_DEFAULT = { - ZERO_OPTIMIZATION_STAGE: - ZERO_OPTIMIZATION_STAGE_DEFAULT, - ZERO_OPTIMIZATION_CONTIGUOUS_GRADIENTS: - ZERO_OPTIMIZATION_CONTIGUOUS_GRADIENTS_DEFAULT, - ZERO_OPTIMIZATION_REDUCE_SCATTER: - ZERO_OPTIMIZATION_REDUCE_SCATTER_DEFAULT, - ZERO_OPTIMIZATION_REDUCE_BUCKET_SIZE: - ZERO_OPTIMIZATION_REDUCE_BUCKET_SIZE_DEFAULT, - ZERO_OPTIMIZATION_ALLGATHER_PARTITIONS: - ZERO_OPTIMIZATION_ALLGATHER_PARTITIONS_DEFAULT, - ZERO_OPTIMIZATION_ALLGATHER_BUCKET_SIZE: - ZERO_OPTIMIZATION_ALLGATHER_BUCKET_SIZE_DEFAULT, - ZERO_OPTIMIZATION_LOAD_FROM_FP32_WEIGHTS: - ZERO_OPTIMIZATION_LOAD_FROM_FP32_WEIGHTS_DEFAULT, - ZERO_OPTIMIZATION_ELASTIC_CHECKPOINT: - ZERO_OPTIMIZATION_ELASTIC_CHECKPOINT_DEFAULT, - 
ZERO_OPTIMIZATION_OFFLOAD_PARAM: - ZERO_OPTIMIZATION_OFFLOAD_PARAM_DEFAULT, - ZERO_OPTIMIZATION_OFFLOAD_OPTIMIZER: - ZERO_OPTIMIZATION_OFFLOAD_OPTIMIZER_DEFAULT, - ZERO_OPTIMIZATION_SUB_GROUP_SIZE: - ZERO_OPTIMIZATION_SUB_GROUP_SIZE_DEFAULT, - ZERO_OPTIMIZATION_MAX_LIVE_PARAMETERS: - ZERO_OPTIMIZATION_MAX_LIVE_PARAMETERS_DEFAULT, - ZERO_OPTIMIZATION_MAX_REUSE_DISTANCE: - ZERO_OPTIMIZATION_MAX_REUSE_DISTANCE_DEFAULT, - ZERO_OPTIMIZATION_PREFETCH_BUCKET_SIZE: - ZERO_OPTIMIZATION_PREFETCH_BUCKET_SIZE_DEFAULT, - ZERO_OPTIMIZATION_PARAM_PERSISTENCE_THRESHOLD: - ZERO_OPTIMIZATION_PARAM_PERSISTENCE_THRESHOLD_DEFAULT, - ZERO_OPTIMIZATION_GATHER_16BIT_WEIGHTS_ON_MODEL_SAVE: - ZERO_OPTIMIZATION_GATHER_16BIT_WEIGHTS_ON_MODEL_SAVE_DEFAULT, - ZERO_OPTIMIZATION_IGNORE_UNUSED_PARAMETERS: - ZERO_OPTIMIZATION_IGNORE_UNUSED_PARAMETERS_DEFAULT, - ZERO_OPTIMIZATION_LEGACY_STAGE1: - ZERO_OPTIMIZATION_LEGACY_STAGE1_DEFAULT, - ZERO_OPTIMIZATION_ROUND_ROBIN_GRADIENTS: - ZERO_OPTIMIZATION_ROUND_ROBIN_GRADIENTS_DEFAULT -} diff --git a/deepspeed/runtime/zero/contiguous_memory_allocator.py b/deepspeed/runtime/zero/contiguous_memory_allocator.py index 46e55787e6642d330019cb1ac52d26d912f0d148..79f01e5466b9cdedd99f93c5b975590b0cb899a8 100644 --- a/deepspeed/runtime/zero/contiguous_memory_allocator.py +++ b/deepspeed/runtime/zero/contiguous_memory_allocator.py @@ -1,4 +1,7 @@ -'''Copyright The Microsoft DeepSpeed Team''' +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team import torch @@ -11,6 +14,7 @@ def print_rank_0(message): class ContiguousMemoryAllocator(object): + def __init__(self, size, dtype, device): self.buffer = torch.zeros(size, dtype=dtype, device=device) @@ -98,8 +102,7 @@ class ContiguousMemoryAllocator(object): self._unassign_params(tensor_id) self.total_free += tensor_size print_rank_0( - f"Free before release {free_before}. Released {tensor.numel()}. Total free after {self.total_free}." - ) + f"Free before release {free_before}. Released {tensor.numel()}. Total free after {self.total_free}.") assert self.total_free - tensor_size == free_before, "Release bookkeeping error" def release_tensor_with_id(self, tensor_id): @@ -111,8 +114,7 @@ class ContiguousMemoryAllocator(object): self._unassign_params(tensor_id) self.total_free += tensor_size print_rank_0( - f"Free before release {free_before}. Released {tensor.numel()}. Total free after {self.total_free}." - ) + f"Free before release {free_before}. Released {tensor.numel()}. Total free after {self.total_free}.") assert self.total_free - tensor_size == free_before, "Release bookkeeping error" #shows the current memory allocation at specified resolution @@ -136,9 +138,7 @@ class ContiguousMemoryAllocator(object): def _reset_param_data(self): for id, tensor in self.tensor_map.items(): for param in self.id_to_params[id]: - param.data = tensor.narrow(0, - 0, - param.numel()).view(param.data.shape).data + param.data = tensor.narrow(0, 0, param.numel()).view(param.data.shape).data def _unassign_params(self, tensor_id): if tensor_id in self.id_to_params.keys(): diff --git a/deepspeed/runtime/zero/linear.py b/deepspeed/runtime/zero/linear.py index 6fbcabb1675a8a2d88e1d6dde7d08e8ce56585eb..175013361fb8ee2cfad8be3dcd9bc50748c8cacc 100644 --- a/deepspeed/runtime/zero/linear.py +++ b/deepspeed/runtime/zero/linear.py @@ -1,4 +1,7 @@ -'''Copyright The Microsoft DeepSpeed Team''' +# Copyright (c) Microsoft Corporation. 
+# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team #Linear Module to use with ZeRO Stage 3 to allow for parameter memory release #after the module execution during forward @@ -23,23 +26,16 @@ from deepspeed.runtime.utils import noop_decorator from deepspeed import comm as dist from deepspeed.accelerator import get_accelerator -tensor_map = {} - def print_rank_0(message, debug=False, force=False): if dist.get_rank() == 0 and (debug or force): print(message) -device = get_accelerator().device_name() -if device == 'cuda': - try: - autocast_custom_fwd = torch.cuda.amp.custom_fwd - autocast_custom_bwd = torch.cuda.amp.custom_bwd - except (ImportError, AttributeError) as exp: - autocast_custom_fwd = noop_decorator - autocast_custom_bwd = noop_decorator -else: +try: + autocast_custom_fwd = get_accelerator().amp().custom_fwd + autocast_custom_bwd = get_accelerator().amp().custom_bwd +except (ImportError, AttributeError) as exp: autocast_custom_fwd = noop_decorator autocast_custom_bwd = noop_decorator @@ -51,16 +47,8 @@ class LinearFunctionForZeroStage3(torch.autograd.Function): @autocast_custom_fwd # bias is an optional argument def forward(ctx, input, weight, bias=None): - #print("In ZeRO Linear Function") - - weight_id = id(weight) - bias_id = id(bias) - #ctx.save_for_backward(input, weight, bias) - ctx.save_for_backward(input, torch.tensor(weight_id), torch.tensor(bias_id)) - - tensor_map[weight_id] = weight - tensor_map[bias_id] = bias + ctx.save_for_backward(input, weight, bias) if input.dim() == 2 and bias is not None: # fused op is marginally faster @@ -82,11 +70,7 @@ class LinearFunctionForZeroStage3(torch.autograd.Function): # None. Thanks to the fact that additional trailing Nones are # ignored, the return statement is simple even when the function has # optional inputs. - #input, weight, bias = ctx.saved_tensors - - input, weight_id, bias_id = ctx.saved_tensors - weight = tensor_map[weight_id.item()] - bias = tensor_map[bias_id.item()] + input, weight, bias = ctx.saved_tensors grad_input = grad_weight = grad_bias = None @@ -104,9 +88,7 @@ class LinearFunctionForZeroStage3(torch.autograd.Function): dim = grad_output.dim() if dim > 2: grad_weight = grad_output.reshape(-1, - grad_output.shape[-1]).t().matmul( - input.reshape(-1, - input.shape[-1])) + grad_output.shape[-1]).t().matmul(input.reshape(-1, input.shape[-1])) else: grad_weight = grad_output.t().matmul(input) #print(f"Computed grad weight grad_weight {grad_weight.shape}") @@ -189,7 +171,5 @@ class LinearModuleForZeroStage3(Module): return LinearFunctionForZeroStage3.apply(input, self.weight, self.bias) def extra_repr(self) -> str: - return 'in_features={}, out_features={}, bias={}'.format( - self.in_features, - self.out_features, - self.bias is not None) + return 'in_features={}, out_features={}, bias={}'.format(self.in_features, self.out_features, self.bias + is not None) diff --git a/deepspeed/runtime/zero/mics.py b/deepspeed/runtime/zero/mics.py new file mode 100755 index 0000000000000000000000000000000000000000..09d2fedbc60642fb060d400541c5e966cdb375be --- /dev/null +++ b/deepspeed/runtime/zero/mics.py @@ -0,0 +1,453 @@ +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+# SPDX-License-Identifier: Apache-2.0 + +import sys +from typing import List + +import deepspeed +import torch +from deepspeed import comm as dist +from deepspeed.runtime.zero.mics_utils import (MiCS_CommGroups, create_mics_comm_groups, scale_tensors) +from deepspeed.runtime.zero.parameter_offload import (DeepSpeedZeRoOffload, is_zero_param) +from deepspeed.runtime.zero.partition_parameters import Init, AllGatherCoalescedHandle, ZeroParamStatus +from deepspeed.runtime.zero.stage3 import DeepSpeedZeroOptimizer_Stage3 +from deepspeed.utils import instrument_w_nvtx, log_dist +from torch import Tensor +from torch.nn import Parameter + + +def has_hierarchical_all_gather_groups(comm_groups: MiCS_CommGroups): + result = False + if comm_groups.param_intra_node_group is not None and comm_groups.param_inter_node_shard_group is not None: + result = True + return result + + +class MiCS_AllGatherCoalescedHandle(AllGatherCoalescedHandle): + """ This handle assumes that no need to + copy data out from a contiguous tensor + """ + + def __init__(self, allgather_handle, params: List[Parameter], partitions: List[Tensor], world_size: int) -> None: + super().__init__(allgather_handle, params, partitions, world_size) + + def wait(self) -> None: + """ + """ + # let the current stream to op + instrument_w_nvtx(self.allgather_handle.wait)() + if self.complete: + return + + for _, param in enumerate(self.params): + assert param.ds_status == ZeroParamStatus.INFLIGHT, f"expected param {param.ds_summary()} to be inflight" + param.ds_status = ZeroParamStatus.AVAILABLE + + self.complete = True + + +class MiCS_Init(Init): + + def __init__(self, + module=None, + data_parallel_group=None, + mem_efficient_linear=True, + remote_device=None, + pin_memory=False, + config_dict_or_path=None, + config=None, + enabled=True, + dtype=None, + mpu=None): + """A context manager to partition the model parameters during the model + construction with MiCS partition strategy. Model states are partitioned + to the number of devices specified via ``mics_shard_size`` field in the + deepspeed config json file. The context manager also introduces + hierarchical communication method to reduce the cost of inter-node + communications, which can be enabled with + ``mics_hierarchical_params_gather`` field in deepspeed config. + + Args: + module (``torch.nn.Module``, optional): If provided, partition the model as + if it was constructed in the context. + data_parallel_group (``deepspeed.comm`` process group, optional): + The group of processes to partition among. Defaults to all processes. + mem_efficient_linear (bool, optional): Replace + torch.nn.functional.linear with an implementation that allows + DeepSpeed to partition parameters. Defaults to ``True``. + remote_device (string, optional): The initial device to store model + weights e.g., ``cpu``, ``nvme``. Passing ``"cpu"`` will create the model in CPU + memory. The model may still be moved to GPU based on the + offload settings for training. Defaults to param offload device if a config is + defined, otherwise GPU. + pin_memory (bool, optional): Potentially increase performance by + using pinned memory for model weights. ``remote_device`` must be + ``"cpu"``. Defaults to pin_memory value in config, otherwise ``False``. + config_dict_or_path (dict or ``json file``, optional): If provided, provides configuration + for swapping fp16 params to NVMe. + config (dict or ``json file``, optional): Deprecated, use config_dict_or_path instead. 
+ enabled (bool, optional): If ``False``, this context has no + effect. Defaults to ``True``. + dtype (``dtype``, optional): Can be used to change the data type of the parameters. + Supported options are ``torch.half`` and ``torch.float``. Defaults to ``None``. + mpu (``object``, optional): A model parallelism unit object that implements get_{model,data}_parallel_{rank,group,world_size}. + + This context follows the same logic as ``deepspeed.zero.Init()``, but + with a modified partition size for each parameter. + + Examples + -------- + + #. Allocate a model and partition it among all processes: + + .. code-block:: python + # the config_dict_or_path is required to let the context manager know + # how to partition the parameters. + # The configuration has to include the field ``mics_shard_size`` + with deepspeed.zero.MiCS_Init(config_dict_or_path=ds_config): + model = MyLargeModel() + + + #. Allocate a model in pinned CPU memory and partition it among a subgroup of processes: + + .. code-block:: python + + with deepspeed.zero.MiCS_Init(data_parallel_group=mpu.get_data_parallel_group(), + remote_device="cpu", + pin_memory=True, + config_dict_or_path=ds_config): + model = MyLargeModel() + + + #. Partition an already-allocated model in CPU memory: + + .. code-block:: python + + model = deepspeed.zero.MiCS_Init(module=model, + config_dict_or_path=ds_config) + """ + + assert config_dict_or_path is not None, "Must provide configuration for MiCS Initialization" + _ds_config = deepspeed.runtime.config.DeepSpeedConfig(config_dict_or_path, mpu) + if not dist.is_initialized(): + dist.init_distributed() + assert dist.is_initialized(), "Parameters cannot be scattered without initializing deepspeed.comm" + self.mics_comm_groups = create_mics_comm_groups( + _ds_config.mics_shard_size, + data_parallel_group, + hierarchical_allgather=_ds_config.mics_hierarchial_params_gather, + mpu=mpu) + + super().__init__(module, data_parallel_group, mem_efficient_linear, remote_device, pin_memory, + config_dict_or_path, config, enabled, dtype, mpu) + + def _convert_to_deepspeed_param(self, param): + super()._convert_to_deepspeed_param(param) + # attach communication groups to every param + param.comm = self.mics_comm_groups + + # record the existing all_gather_coalesced implementation + # so that we can fall back to it later + old_all_gather_coalesced = param.all_gather_coalesced + + def _param_all_gather_coalesced(params, safe_mode=False, param_buffers=None): + """""" + mics_comm_groups: MiCS_CommGroups = params[0].comm + hierarchical_all_gather = has_hierarchical_all_gather_groups(mics_comm_groups) + if dist.has_coalescing_manager() and hierarchical_all_gather: + return self._hierarchical_all_gather_params(params, param_buffers) + elif dist.has_coalescing_manager(): + return self._flat_all_gather_with_coalescing_manager(params, param_buffers) + else: + return old_all_gather_coalesced(params, safe_mode) + + # change the all_gather_coalesced method + param.all_gather_coalesced = _param_all_gather_coalesced + + def _pre_all_gather(self, params, params_buffers=None): + # fetches from nvme if the partition is not available and in nvme + self._ensure_availability_of_partitioned_params(params) + + for param in params: + if param.ds_status != ZeroParamStatus.NOT_AVAILABLE: + raise RuntimeError(param.ds_summary()) + param.ds_status = ZeroParamStatus.INFLIGHT + + # ensure that each rank has params in the same order.
the allgather + # is done by flattening the parameter list into a single tensor that + # can be allgathered in a single call - this means that if each rank + # gives a list of the same parameters in a different order we will + # silently get incorrect parameter values, and have very difficult + # to debug correctness issues. + params = sorted(params, key=lambda p: p.ds_id) + return params, params_buffers + + def _flat_all_gather_with_coalescing_manager(self, params, params_buffers=None): + """""" + # must have to change the status of the param + # and ensure they are on the device + params, params_buffers = self._pre_all_gather(params, params_buffers) + + mics_comm_groups: MiCS_CommGroups = params[0].comm + param_shard_size = mics_comm_groups.param_shard_size + + output_tensors = [] + input_tensors = [] + for i, p in enumerate(params): + t_size = p.ds_tensor.ds_numel * param_shard_size + if params_buffers is not None and params_buffers[i] is not None: + assert params_buffers[i].numel( + ) == t_size, f'params_to_gather_buffers[{i}] size {params_buffers[i].numel()} does not match with t_size {t_size}' + flat_out = params_buffers[i] + else: + flat_out = torch.empty(t_size, dtype=p.dtype, device=self.local_device, requires_grad=False).view(-1) + output_tensors.append(flat_out) + _flat_input = p.ds_tensor.data.view(-1) + input_tensors.append(_flat_input) + + all_gather_handle = dist.all_gather_coalesced(output_tensors, + input_tensors, + group=mics_comm_groups.param_shard_group, + async_op=True) + + for idx, param in enumerate(params): + param.data = output_tensors[idx].narrow(0, 0, param.ds_numel).view(param.ds_shape).data + + return MiCS_AllGatherCoalescedHandle(allgather_handle=all_gather_handle, + params=params, + partitions=[], + world_size=param_shard_size) + + def _hierarchical_all_gather_params(self, params, params_buffers=None): + """""" + params, params_buffers = self._pre_all_gather(params, params_buffers) + + mics_comm_groups: MiCS_CommGroups = params[0].comm + local_rank = dist.get_rank(group=mics_comm_groups.param_intra_node_group) + inter_node_comm_group = mics_comm_groups.param_inter_node_shard_group + intra_node_comm_group = mics_comm_groups.param_intra_node_group + param_shard_size = mics_comm_groups.param_shard_size + + inter_node_size = dist.get_world_size(group=inter_node_comm_group) + intra_node_size = dist.get_world_size(group=intra_node_comm_group) + param_tensors = [] + for i, p in enumerate(params): + param_size = p.ds_tensor.ds_numel * param_shard_size + if params_buffers is not None and params_buffers[i] is not None: + assert params_buffers[i].numel( + ) == param_size, f'param_buffers[{i}] size {params_buffers[i].numel()} does not match with param_size {param_size}' + param_tensor = params_buffers[i] + else: + param_tensor = torch.empty(param_size, dtype=p.dtype, device=self.local_device, + requires_grad=False).view(-1) + param_tensors.append(param_tensor) + + # inter node all-gather + inter_outputs = [] + inter_inputs = [] + for i, p in enumerate(params): + inter_size = p.ds_tensor.ds_numel * inter_node_size + _out = param_tensors[i].narrow(0, local_rank * inter_size, inter_size) + inter_outputs.append(_out) + inter_inputs.append(p.ds_tensor.data.view(-1).to(self.local_device)) + # sync enqueue + dist.all_gather_coalesced(inter_outputs, inter_inputs, group=inter_node_comm_group, async_op=False) + + # intra node all-gather + intra_outputs = [] + intra_inputs = [] + for i, p in enumerate(params): + # partition param into multiple chunks for allgather + # because 
inter-node all-gather outputs are in contiguous memory, + # while in param memory, those inter-node data are placed in different + # locations. + # each chunk is an intra-node output + param_chunk = param_tensors[i].view( + (inter_node_size, intra_node_size, p.ds_tensor.ds_numel)).narrow(1, local_rank, 1) + param_chunk.copy_(inter_outputs[i].detach().clone().view(param_chunk.size())) + output_chunks = torch.chunk(param_tensors[i], inter_node_size) + for j, _out in enumerate(output_chunks): + intra_chunk_size = intra_node_size * p.ds_tensor.ds_numel + local_offset = local_rank * p.ds_tensor.ds_numel + _in = param_tensors[i].narrow(0, j * intra_chunk_size + local_offset, p.ds_tensor.ds_numel) + intra_outputs.append(_out) + intra_inputs.append(_in) + + all_gather_handle = dist.all_gather_coalesced(intra_outputs, + intra_inputs, + group=intra_node_comm_group, + async_op=True) + for i, param in enumerate(params): + param.data = param_tensors[i].narrow(0, 0, param.ds_numel).view(param.ds_shape).data + + return MiCS_AllGatherCoalescedHandle( + allgather_handle=all_gather_handle, + params=params, + partitions=[], + world_size=param_shard_size, + ) + + def get_partition_dp_group(self, param): + return param.comm.param_shard_group + + def get_partition_rank(self): + return self.mics_comm_groups.param_shard_rank + + @property + def num_partitions(self): + return self.mics_comm_groups.param_shard_size + + +class MiCS_Offload(DeepSpeedZeRoOffload): + """ Wrapper to change the behavior for parameter sharding + """ + + def __init__(self, + module, + timers, + ds_config, + overlap_comm=True, + prefetch_bucket_size=50000000, + max_reuse_distance=1000000000, + max_live_parameters=1000000000, + param_persistence_threshold=100000, + model_persistence_threshold=sys.maxsize, + offload_param_config=None, + mpu=None): + super().__init__(module, timers, ds_config, overlap_comm, prefetch_bucket_size, max_reuse_distance, + max_live_parameters, param_persistence_threshold, model_persistence_threshold, + offload_param_config, mpu) + + def _convert_to_zero_parameters(self, ds_config, module, mpu): + """ overload the parent class function to convert the parameters + + """ + log_dist(f'Convert to zero parameters from MiCS Offload manager', ranks=[0]) + non_zero_params = [p for p in module.parameters() if not is_zero_param(p)] + if non_zero_params: + zero_params = [p for p in module.parameters() if is_zero_param(p)] + if zero_params: + zero_params[0].convert_to_zero_parameters(param_list=non_zero_params) + else: + group = None + if mpu: + group = mpu.get_data_parallel_group() + + MiCS_Init(module=module, + data_parallel_group=group, + dtype=self.dtype, + config_dict_or_path=ds_config, + remote_device=self.offload_device, + pin_memory=self.offload_param_pin_memory, + mpu=mpu) + + +class MiCS_Optimizer(DeepSpeedZeroOptimizer_Stage3): + """ + MiCS Optimizer + """ + + def __init__(self, + module, + init_optimizer, + timers, + ds_config, + static_loss_scale=1, + dynamic_loss_scale=False, + dynamic_loss_args=None, + verbose=True, + contiguous_gradients=True, + reduce_bucket_size=500000000, + prefetch_bucket_size=50000000, + max_reuse_distance=1000000000, + max_live_parameters=1000000000, + param_persistence_threshold=100000, + model_persistence_threshold=sys.maxsize, + dp_process_group=None, + reduce_scatter=True, + overlap_comm=False, + offload_optimizer_config=None, + offload_param_config=None, + sub_group_size=1000000000000, + mpu=None, + clip_grad=0, + communication_data_type=torch.float16, + postscale_gradients=True, +
gradient_predivide_factor=1, + gradient_accumulation_steps=1, + elastic_checkpoint=False, + aio_config=None): + + log_dist("Init MiCS optimizer", ranks=[0]) + super().__init__(module, init_optimizer, timers, ds_config, static_loss_scale, dynamic_loss_scale, + dynamic_loss_args, verbose, contiguous_gradients, reduce_bucket_size, prefetch_bucket_size, + max_reuse_distance, max_live_parameters, param_persistence_threshold, + model_persistence_threshold, dp_process_group, reduce_scatter, overlap_comm, + offload_optimizer_config, offload_param_config, sub_group_size, mpu, clip_grad, + communication_data_type, postscale_gradients, gradient_predivide_factor, + gradient_accumulation_steps, elastic_checkpoint, aio_config) + first_param = next(module.parameters()) + # overload the dp_process_group and partition_count + self.dp_process_group = first_param.comm.param_shard_group + self.partition_count = first_param.comm.param_shard_size + + def initialize_ds_offload(self, module, timers, ds_config, overlap_comm, prefetch_bucket_size, max_reuse_distance, + max_live_parameters, param_persistence_threshold, model_persistence_threshold, + offload_optimizer_config, mpu): + return MiCS_Offload(module, timers, ds_config, overlap_comm, prefetch_bucket_size, max_reuse_distance, + max_live_parameters, param_persistence_threshold, model_persistence_threshold, + offload_optimizer_config, mpu) + + def partition_grads(self, params_to_release: List[Parameter], grad_partitions: List[Tensor]) -> None: + grad_buffers = super().partition_grads(params_to_release, grad_partitions) + # perform all-reduce among replication groups + # the function will perform accumulation boundary check + self.allreduce_mics_shard_grads(params_to_release, grad_buffers) + + @instrument_w_nvtx + def allreduce_mics_shard_grads(self, params, partitioned_grads_buffers: List[Tensor]): + """ + """ + # TODO: improve the condition check + if not self.is_gradient_accumulation_boundary or \ + len(partitioned_grads_buffers) == 0: + return + + mics_comm_groups: MiCS_CommGroups = params[0].comm + param_repli_group = mics_comm_groups.param_repli_group + param_repli_size = mics_comm_groups.param_repli_size + + if param_repli_size is None or param_repli_size <= 1: + return + if not partitioned_grads_buffers[0].is_cuda: + raise RuntimeError("Local sharding has no support for CPU offloading") + + if dist.has_all_reduce_coalesced(): + scale_tensors(partitioned_grads_buffers, param_repli_size) + dist.all_reduce_coalesced(tensors=partitioned_grads_buffers, group=param_repli_group) + else: + # manually coalescing all-reduce + aggregated_buffer: Tensor = torch.cat(partitioned_grads_buffers) + aggregated_buffer.div_(param_repli_size) + dist.all_reduce(aggregated_buffer, group=param_repli_group) + offset = 0 + for grad_buff in partitioned_grads_buffers: + grad_buff.view(-1).copy_(aggregated_buffer.narrow(0, offset, grad_buff.numel())) + offset += grad_buff.numel() + + # TODO: Support different/changing load/save DP degree. 
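# Illustrative note (not part of the patch), assuming a hypothetical run with
# world_size=8 and ``mics_shard_size``=4: _generate_mics_config() in mics_utils.py
# yields shard groups [[0, 1, 2, 3], [4, 5, 6, 7]] and replicate groups
# [[0, 4], [1, 5], [2, 6], [3, 7]], so param_repli_size == 2. After the stage-3
# gradient partitioning within each shard group, allreduce_mics_shard_grads()
# divides each shard-gradient buffer by 2 and all-reduces it across the matching
# replicate group, keeping the gradient replicas consistent.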
+ def load_state_dict(self, + state_dict_list, + load_optimizer_states=True, + load_from_fp32_weights=False, + checkpoint_folder=None): + r""" Loading the MiCS checkpoints + + TODO: move the implementation from zhen/merged_ds_master branch + """ + raise NotImplementedError("Not implemented for loading MiCS checkpoints") diff --git a/deepspeed/runtime/zero/mics_utils.py b/deepspeed/runtime/zero/mics_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..06b83160bd6c4cd93f5250a12c67f5abda01d920 --- /dev/null +++ b/deepspeed/runtime/zero/mics_utils.py @@ -0,0 +1,203 @@ +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +# SPDX-License-Identifier: Apache-2.0 + +import os +from dataclasses import dataclass +from typing import List + +import numpy as np +import torch +from torch import Tensor + +from deepspeed import comm as dist +from deepspeed.accelerator import get_accelerator +from deepspeed.utils import logger + + +def _log_rank0(msg): + if dist.get_rank() == 0: + logger.info(msg) + + +@torch.jit.script +def scale_tensors(tensors: List[Tensor], scale: int): + for t in tensors: + t.div_(scale) + + +@dataclass +class MiCS_CommGroups: + """""" + param_shard_group = None + param_shard_size = -1 + param_shard_rank = -1 + + param_repli_group = None + param_repli_size = -1 + param_repli_rank = -1 + + param_intra_node_group = None + param_inter_node_shard_group = None + + +def create_mics_comm_groups( + shard_size, + dp_group, + hierarchical_allgather=False, + mpu=None, +): + """ + create shard-group, replicate-group from config_file + TODO: consider broadcast the config from rank0 + + Returns: + MiCS_CommGroups + """ + # env var for debugging purpose + ndevices_per_node = int(os.environ.get("NDEV_PER_NODE", get_accelerator().device_count())) + _log_rank0(f'creating MiCS communication groups with per node device size {ndevices_per_node}') + groups = MiCS_CommGroups() + + if mpu is not None: + assert dp_group == mpu.get_data_parallel_group() + + # full size of the world + world_size = dist.get_world_size() + # global rank + global_rank = dist.get_rank() + + config = _generate_mics_config(world_size, ndevices_per_node, shard_size, 1) + ranks_of_shard_group = config['shard_groups'] + ranks_of_repli_group = config['replicate_groups'] + if len(ranks_of_repli_group) == 0: + assert len(ranks_of_shard_group) == 1, "replicate groups are empty only for single shard group" + for r in ranks_of_shard_group[0]: + ranks_of_repli_group.append([r]) + + # for simplicity + assert _sizes_all_same(ranks_of_repli_group), "replicate groups must have the same size" + assert _sizes_all_same(ranks_of_shard_group), "shard groups must have the same size" + + assert sum([len(g) for g in ranks_of_shard_group]) == dist.get_world_size(), "all sharded ranks " + if len(ranks_of_shard_group) > 1: # if only shard on one group then no need for replicate groups + assert len(ranks_of_shard_group) == len( + ranks_of_repli_group[0]), "number of shard groups must equal to the size of each replicate group" + + global_rank = dist.get_rank() + # create shard groups + for shard_ranks in ranks_of_shard_group: + _group = dist.new_group(shard_ranks) + if global_rank in shard_ranks: + groups.param_shard_group = _group + groups.param_shard_size = len(shard_ranks) + groups.param_shard_rank = dist.get_rank(_group) + logger.info(f'rank {global_rank}, shard group' + f' 
{groups.param_shard_rank}/{dist.get_world_size(group=_group)}') + + # create replicate groups + for repli_ranks in ranks_of_repli_group: + if len(repli_ranks) > 1: + _group = dist.new_group(repli_ranks) + if global_rank in repli_ranks: + groups.param_repli_group = _group + groups.param_repli_size = len(repli_ranks) + groups.param_repli_rank = dist.get_rank(group=_group) + logger.info(f'rank {global_rank} ' + f'replicate group {groups.param_repli_rank}/{dist.get_world_size(group=_group)}') + else: + groups.param_repli_group = None + groups.param_repli_size = 1 + groups.param_repli_rank = 0 + logger.info(f'rank {global_rank} replicate group 0/1') + + # assign shard group size as world size + assert groups.param_shard_size == len(ranks_of_shard_group[0]) + + if hierarchical_allgather: + # create hierarchy inter-node, intra-node groups + # n_span_nodes = config['shard_span'] + n_span_nodes = config['span_nodes'] + assert n_span_nodes > 1, "sharding spans on single node, no need for hierarchy allgather" + assert len(ranks_of_shard_group[0]) % n_span_nodes == 0 + + n_gpu_per_node = len(ranks_of_shard_group[0]) // n_span_nodes + intra_node_ranks_group = [] + inter_node_ranks_group = [] + for shard_group in ranks_of_shard_group: + _intra_node_ranks = [] + for i in range(0, len(shard_group), n_gpu_per_node): + _intra_node_ranks.append(shard_group[i:i + n_gpu_per_node]) + _inter_node_ranks = [] + for i in range(n_gpu_per_node): + _ranks = [_g[i] for _g in _intra_node_ranks] + _inter_node_ranks.append(_ranks) + + intra_node_ranks_group.append(_intra_node_ranks) + inter_node_ranks_group.append(_inter_node_ranks) + + _log_rank0(f"create for hierarchy all-gather groups: intra nodes {intra_node_ranks_group}") + _log_rank0(f"create for hierarchy all-gather groups: inter nodes {inter_node_ranks_group}") + + # create communicators + for shard_group in intra_node_ranks_group: + for intra_node_ranks in shard_group: + _group = dist.new_group(intra_node_ranks) + if global_rank in intra_node_ranks: + groups.param_intra_node_group = _group + _log_rank0(f'create group for intra node ranks {intra_node_ranks}') + + for shard_group in inter_node_ranks_group: + for inter_node_ranks in shard_group: + _group = dist.new_group(inter_node_ranks) + if global_rank in inter_node_ranks: + groups.param_inter_node_shard_group = _group + _log_rank0(f'create group for inter node ranks {inter_node_ranks}') + return groups + + +def _generate_mics_config(world_size, ndev_per_node, shard_size, pp_size=1): + """Generating the configuration for sharding This shard config generation assume + that the pipeline stages are partitioned in order, i.e., first ranks + hold the stage0, etc. 
+ + Args: + + shard_size (int): zero3 data-parallel shard size, FIXME: + change the name later + + pp_size (int): pipeline parallel size, currently, only works with + pipeline parallelism + zero + + """ + assert world_size % pp_size == 0 + assert (world_size // pp_size) % shard_size == 0, \ + f"dp group size is not divisible by dp_shard_size, "\ + f" (world_size {world_size}, pp_size {pp_size}, dp_shard_size {shard_size})" + + config = {} + shard_groups = np.arange(world_size).reshape(-1, shard_size) + replicate_groups = [] + for i in range(shard_size): + same_shard_ranks = shard_groups[:, i].tolist() + n_ranks = len(same_shard_ranks) + replicate_size = n_ranks // pp_size + replicate_groups.extend([same_shard_ranks[j:j + replicate_size] for j in range(0, n_ranks, replicate_size)]) + + config['replicate_groups'] = replicate_groups + config['shard_groups'] = shard_groups.tolist() + config["span_nodes"] = len(shard_groups[0]) // ndev_per_node + return config + + +def _sizes_all_same(groups): + """all groups have the same length""" + all_same = True + for g in groups: + if len(g) != len(groups[0]): + return False + return all_same diff --git a/deepspeed/runtime/zero/offload_config.py b/deepspeed/runtime/zero/offload_config.py index 7ea76c68178d06cd0e906f284aae3893d9208559..c3a6dc7af53012397e36ccf630d821f1748d921c 100644 --- a/deepspeed/runtime/zero/offload_config.py +++ b/deepspeed/runtime/zero/offload_config.py @@ -1,8 +1,7 @@ -'''Copyright The Microsoft DeepSpeed Team''' -""" -Copyright (c) Microsoft Corporation -Licensed under the MIT license. -""" +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team from pydantic import Field, validator from enum import Enum @@ -88,6 +87,7 @@ class DeepSpeedZeroOffloadOptimizerConfig(DeepSpeedConfigModel): fast_init: bool = False """ Enable fast optimizer initialization when offloading to NVMe. """ + @validator("pipeline_read", "pipeline_write", always=True) def set_pipeline(cls, field_value, values): values["pipeline"] = field_value or values.get("pipeline", False) diff --git a/deepspeed/runtime/zero/offload_constants.py b/deepspeed/runtime/zero/offload_constants.py deleted file mode 100644 index 436e8bb8a4def16536d312d8ded846525bbb5ff3..0000000000000000000000000000000000000000 --- a/deepspeed/runtime/zero/offload_constants.py +++ /dev/null @@ -1,69 +0,0 @@ -""" -"Copyright 2020 The Microsoft DeepSpeed Team. -Licensed under the MIT license.
-""" -######################################### -# TENSOR OFFLOADING -######################################### -OFFLOAD_NONE_DEVICE = "none" -OFFLOAD_CPU_DEVICE = "cpu" -OFFLOAD_NVME_DEVICE = "nvme" -VALID_OFFLOAD_DEVICES = [OFFLOAD_NONE_DEVICE, OFFLOAD_CPU_DEVICE, OFFLOAD_NVME_DEVICE] - -######################################### -# PARAM TENSOR OFFLOADING -######################################### -OFFLOAD_PARAM_FORMAT = ''' -"offload_param": { - "device": [none|cpu|nvme], - "nvme_path": "/local_nvme", - "buffer_count": 5, - "buffer_size": 1e8, - "max_in_cpu": 1e9, - "pin_memory": [true|false] -} -''' -OFFLOAD_PARAM = "offload_param" -OFFLOAD_PARAM_DEVICE = "device" -OFFLOAD_PARAM_DEVICE_DEFAULT = None -OFFLOAD_PARAM_NVME_PATH = "nvme_path" -OFFLOAD_PARAM_NVME_PATH_DEFAULT = None -OFFLOAD_PARAM_BUFFER_COUNT = "buffer_count" -OFFLOAD_PARAM_BUFFER_COUNT_DEFAULT = 5 -OFFLOAD_PARAM_BUFFER_SIZE = "buffer_size" -OFFLOAD_PARAM_BUFFER_SIZE_DEFAULT = 1e8 -OFFLOAD_PARAM_MAX_IN_CPU = "max_in_cpu" -OFFLOAD_PARAM_MAX_IN_CPU_DEFAULT = 1e9 -OFFLOAD_PARAM_PIN_MEMORY = "pin_memory" -OFFLOAD_PARAM_PIN_MEMORY_DEFAULT = False - -######################################### -# OPTIMIZER TENSOR OFFLOADING -######################################### -OFFLOAD_OPTIMIZER_FORMAT = ''' -"offload_optimizer": { - "device": [none|cpu|nvme], - "nvme_path": "/local_nvme", - "buffer_count": 4, - "pin_memory": [true|false], - "pipeline_read": false, - "pipeline_write": false, - "fast_init": false -} -''' -OFFLOAD_OPTIMIZER = "offload_optimizer" -OFFLOAD_OPTIMIZER_DEVICE = "device" -OFFLOAD_OPTIMIZER_DEVICE_DEFAULT = None -OFFLOAD_OPTIMIZER_NVME_PATH = "nvme_path" -OFFLOAD_OPTIMIZER_NVME_PATH_DEFAULT = None -OFFLOAD_OPTIMIZER_BUFFER_COUNT = "buffer_count" -OFFLOAD_OPTIMIZER_BUFFER_COUNT_DEFAULT = 4 -OFFLOAD_OPTIMIZER_PIN_MEMORY = "pin_memory" -OFFLOAD_OPTIMIZER_PIN_MEMORY_DEFAULT = False -OFFLOAD_OPTIMIZER_PIPELINE_READ = "pipeline_read" -OFFLOAD_OPTIMIZER_PIPELINE_READ_DEFAULT = False -OFFLOAD_OPTIMIZER_PIPELINE_WRITE = "pipeline_write" -OFFLOAD_OPTIMIZER_PIPELINE_WRITE_DEFAULT = False -OFFLOAD_OPTIMIZER_PIPELINE = "pipeline" -OFFLOAD_OPTIMIZER_FAST_INIT = "fast_init" -OFFLOAD_OPTIMIZER_FAST_INIT_DEFAULT = False diff --git a/deepspeed/runtime/zero/parameter_offload.py b/deepspeed/runtime/zero/parameter_offload.py index 336c63e987219860d15a76c3dcf68319324fc883..55beff336740bbe30815c05464022ab283ec1da5 100644 --- a/deepspeed/runtime/zero/parameter_offload.py +++ b/deepspeed/runtime/zero/parameter_offload.py @@ -1,7 +1,7 @@ -""" -"Copyright 2022 The Microsoft DeepSpeed Team. -Licensed under the MIT license. -""" +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team import sys import torch @@ -22,28 +22,53 @@ def is_builtin_type(obj): return obj.__class__.__module__ == '__builtin__' or obj.__class__.__module__ == "builtins" +def isinstance_namedtuple(obj: object) -> bool: + """ + Is this an instance of namedtuple/NamedTuple? + From: https://stackoverflow.com/a/62692640 + + Args: + obj (object): An object. + + Returns: + bool: True if namedtuple/NamedTuple else False. 
+ """ + return isinstance(obj, tuple) and hasattr(obj, '_asdict') and hasattr(obj, '_fields') + + # ensure we only warn once, otherwise every iteration will trigger a warning warned = False -#apply torch.autograd.Function that calls a backward_function to tensors in output def _apply_to_tensors_only(module, functional, backward_function, outputs): + """ + Apply a torch.autograd.Function that calls a `backward_function` to every Tensor in `outputs`. + + Args: + module (torch.nn.Module): A torch module + functional (Type[torch.autograd.Function]): The function class to apply. + backward_function (Callable[[torch.nn.Module], None]): A backward_function to pass to + `functional.apply`. + outputs (Any): The output of `module`. + + Returns: + Any: The output of `module`. + """ if isinstance(outputs, (tuple, list)): touched_outputs = [] for output in outputs: - touched_output = _apply_to_tensors_only(module, - functional, - backward_function, - output) + touched_output = _apply_to_tensors_only(module, functional, backward_function, output) touched_outputs.append(touched_output) + + if isinstance_namedtuple(outputs): + # namedtuples require a slightly different syntax. + return outputs.__class__(*touched_outputs) + return outputs.__class__(touched_outputs) elif isinstance(outputs, dict): # apply inplace to avoid recreating dict inherited objects for key in outputs.keys(): - outputs[key] = _apply_to_tensors_only(module, - functional, - backward_function, - outputs[key]) + outputs[key] = _apply_to_tensors_only(module, functional, backward_function, outputs[key]) return outputs elif isinstance(outputs, torch.Tensor): @@ -67,18 +92,12 @@ def _apply_to_tensors_only(module, functional, backward_function, outputs): #for each tensor in outputs run the forward_function and register backward_function as hook -def _apply_forward_and_backward_to_tensors_only(module, - forward_function, - backward_function, - outputs): +def _apply_forward_and_backward_to_tensors_only(module, forward_function, backward_function, outputs): if type(outputs) is tuple: touched_outputs = [] for output in outputs: - touched_output = _apply_forward_and_backward_to_tensors_only( - module, - forward_function, - backward_function, - output) + touched_output = _apply_forward_and_backward_to_tensors_only(module, forward_function, backward_function, + output) touched_outputs.append(touched_output) return tuple(touched_outputs) elif type(outputs) is torch.Tensor: @@ -91,6 +110,7 @@ def _apply_forward_and_backward_to_tensors_only(module, class ZeROOrderedDict(OrderedDict): + def __init__(self, parent_module, *args, **kwargs): """A replacement for ``collections.OrderedDict`` to detect external ZeRO params. 
@@ -113,9 +133,7 @@ class ZeROOrderedDict(OrderedDict): if self._parent_module._parameters._in_forward: register_external_parameter(FWD_MODULE_STACK[-1], param) param.all_gather() - print_rank_0( - f'Registering external parameter from getter {key} ds_id = {param.ds_id}', - force=False) + print_rank_0(f'Registering external parameter from getter {key} ds_id = {param.ds_id}', force=False) return param @@ -133,6 +151,7 @@ def _inject_parameters(module, cls): class PreBackwardFunction(torch.autograd.Function): + @staticmethod def forward(ctx, module, pre_backward_function, outputs): ctx.module = module @@ -152,6 +171,7 @@ class PreBackwardFunction(torch.autograd.Function): class PostBackwardFunction(torch.autograd.Function): + @staticmethod def forward(ctx, module, pre_backward_function, output): ctx.module = module @@ -179,6 +199,7 @@ class PostBackwardFunction(torch.autograd.Function): class DeepSpeedZeRoOffload(object): + def __init__(self, module, timers, @@ -194,8 +215,7 @@ class DeepSpeedZeRoOffload(object): see_memory_usage("DeepSpeedZeRoOffload initialize [begin]", force=True) - print_rank_0(f"initialized {__class__.__name__} with args: {locals()}", - force=False) + print_rank_0(f"initialized {__class__.__name__} with args: {locals()}", force=False) self.module = module self.dtype = list(module.parameters())[0].dtype @@ -215,16 +235,14 @@ class DeepSpeedZeRoOffload(object): self.param_numel_persistence_threshold = int(param_persistence_threshold) self.model_persistence_threshold = int(model_persistence_threshold) - self.persistent_parameters = self.mark_persistent_parameters( - self.param_numel_persistence_threshold, - self.model_persistence_threshold) + self.persistent_parameters = self.mark_persistent_parameters(self.param_numel_persistence_threshold, + self.model_persistence_threshold) self.param_coordinators = {} self._prefetch_bucket_sz = int(prefetch_bucket_size) self._max_reuse_distance_in_numel = int(max_reuse_distance) self._max_available_parameters_in_numel = int(max_live_parameters) - self.__allgather_stream = get_accelerator().Stream( - ) if overlap_comm else get_accelerator().default_stream() + self.__allgather_stream = get_accelerator().Stream() if overlap_comm else get_accelerator().default_stream() self.forward_hooks = [] self.backward_hooks = [] @@ -240,8 +258,7 @@ class DeepSpeedZeRoOffload(object): """Partitioning Parameters that were not partitioned usually if parameters of modules whose input parameters do not require grad computation do not trigger post call and will therefore will remain unpartitioned""" - self.get_param_coordinator(training=self.module.training).release_and_reset_all( - self.module) + self.get_param_coordinator(training=self.module.training).release_and_reset_all(self.module) for param in iter_params(self.module, recurse=True): if param.ds_status != ZeroParamStatus.NOT_AVAILABLE: raise RuntimeError(f"{param.ds_summary()} expected to be released") @@ -251,14 +268,16 @@ class DeepSpeedZeRoOffload(object): self.param_coordinators[training] = PartitionedParameterCoordinator( prefetch_bucket_sz=self._prefetch_bucket_sz, max_reuse_distance_in_numel=self._max_reuse_distance_in_numel, - max_available_parameters_in_numel=self. 
- _max_available_parameters_in_numel, + max_available_parameters_in_numel=self._max_available_parameters_in_numel, allgather_stream=self.__allgather_stream, prefetch_nvme=self.offload_device == OffloadDeviceEnum.nvme, ) return self.param_coordinators[training] + def empty_partition_cache(self): + self.partition_all_parameters() + def _convert_to_zero_parameters(self, ds_config, module, mpu): non_zero_params = [p for p in module.parameters() if not is_zero_param(p)] if non_zero_params: @@ -291,9 +310,8 @@ class DeepSpeedZeRoOffload(object): for hook in self.backward_hooks: hook.remove() - print_rank_0( - f'Deleted module hooks: forward = {num_forward_hooks}, backward = {num_backward_hooks}', - force=False) + print_rank_0(f'Deleted module hooks: forward = {num_forward_hooks}, backward = {num_backward_hooks}', + force=False) def setup_zero_stage3_hooks(self): self.hierarchy = 0 @@ -321,7 +339,7 @@ class DeepSpeedZeRoOffload(object): if param.ds_numel + total_persistent_parameters > model_threshold: continue - if param.ds_numel < param_threshold: + if param.ds_numel <= param_threshold: params_count += 1 param.ds_persist = True persistent_params.append(param) @@ -365,19 +383,14 @@ class DeepSpeedZeRoOffload(object): outputs.append(val) output = outputs - for item in filter( - lambda item: is_zero_param(item) or hasattr(item, - 'ds_param_alias'), - output): + for item in filter(lambda item: is_zero_param(item) or hasattr(item, 'ds_param_alias'), output): key = id(item) if hasattr(item, 'ds_id') else id(item.ds_param_alias) - actual_external_param = item if hasattr(item, - 'ds_id') else item.ds_param_alias + actual_external_param = item if hasattr(item, 'ds_id') else item.ds_param_alias if not any(key in m._external_params for m in FWD_MODULE_STACK): actual_external_param.is_external_param = True module_to_register = FWD_MODULE_STACK[-1] - register_external_parameter(module_to_register, - actual_external_param) + register_external_parameter(module_to_register, actual_external_param) print_rank_0( f'Registering dangling parameter for module {module_to_register.__class__.__name__}, ds_id = {actual_external_param.ds_id}.', force=False) @@ -395,6 +408,7 @@ class DeepSpeedZeRoOffload(object): self.post_sub_module_forward_function(module) def _pre_backward_module_hook(module, inputs, output): + @instrument_w_nvtx def _run_before_backward_function(sub_module): # some models (e.g. 
Albert) may run multiple forwards on the same layer in a loop @@ -406,10 +420,7 @@ class DeepSpeedZeRoOffload(object): sub_module.applied_pre_backward_ref_cnt -= 1 #print(f"COUNTER after: {sub_module.applied_pre_backward_ref_cnt}") - return _apply_to_tensors_only(module, - PreBackwardFunction, - _run_before_backward_function, - output) + return _apply_to_tensors_only(module, PreBackwardFunction, _run_before_backward_function, output) #This is an alternate to doing _post_backward_module_hook #it uses tensor.register_hook instead of using torch.autograd.Function @@ -428,11 +439,8 @@ class DeepSpeedZeRoOffload(object): if input.requires_grad: module.ds_grads_remaining += 1 - return _apply_forward_and_backward_to_tensors_only( - module, - _run_before_forward_function, - _run_after_backward_hook, - inputs) + return _apply_forward_and_backward_to_tensors_only(module, _run_before_forward_function, + _run_after_backward_hook, inputs) def _post_backward_module_hook(module, inputs): module.ds_grads_remaining = 0 @@ -442,31 +450,23 @@ class DeepSpeedZeRoOffload(object): if sub_module.ds_grads_remaining == 0: self.post_sub_module_backward_function(sub_module) - return _apply_to_tensors_only(module, - PostBackwardFunction, - _run_after_backward_function, - inputs) + return _apply_to_tensors_only(module, PostBackwardFunction, _run_after_backward_function, inputs) # Pre forward hook - self.forward_hooks.append( - module.register_forward_pre_hook(_pre_forward_module_hook)) + self.forward_hooks.append(module.register_forward_pre_hook(_pre_forward_module_hook)) # Post forward hook - self.forward_hooks.append( - module.register_forward_hook(_post_forward_module_hook)) + self.forward_hooks.append(module.register_forward_hook(_post_forward_module_hook)) # Pre backward hook - self.backward_hooks.append( - module.register_forward_hook(_pre_backward_module_hook)) + self.backward_hooks.append(module.register_forward_hook(_pre_backward_module_hook)) # post backward hook - self.backward_hooks.append( - module.register_forward_pre_hook(_post_backward_module_hook)) + self.backward_hooks.append(module.register_forward_pre_hook(_post_backward_module_hook)) @torch.no_grad() def pre_sub_module_forward_function(self, sub_module): - see_memory_usage(f"Before sub module function {sub_module.__class__.__name__}", - force=False) + see_memory_usage(f"Before sub module function {sub_module.__class__.__name__}", force=False) global FWD_MODULE_STACK FWD_MODULE_STACK.append(sub_module) @@ -477,26 +477,23 @@ class DeepSpeedZeRoOffload(object): param_coordinator.record_module(sub_module) param_coordinator.fetch_sub_module(sub_module) - see_memory_usage( - f"Before sub module function {sub_module.__class__.__name__} after fetch", - force=False) + see_memory_usage(f"Before sub module function {sub_module.__class__.__name__} after fetch", force=False) @torch.no_grad() def post_sub_module_forward_function(self, sub_module): - see_memory_usage( - f"After sub module function {sub_module.__class__.__name__} {sub_module.id} before release", - force=False) + see_memory_usage(f"After sub module function {sub_module.__class__.__name__} {sub_module.id} before release", + force=False) param_coordinator = self.get_param_coordinator(training=sub_module.training) param_coordinator.release_sub_module(sub_module) - see_memory_usage( - f"After sub module function {sub_module.__class__.__name__} {sub_module.id} after release", - force=False) + see_memory_usage(f"After sub module function {sub_module.__class__.__name__} {sub_module.id} after release", + 
force=False) @torch.no_grad() def pre_sub_module_backward_function(self, sub_module): - param_coordinator = self.get_param_coordinator(training=sub_module.training) + assert sub_module.training, "backward pass is invalid for module in evaluation mode" + param_coordinator = self.get_param_coordinator(training=True) param_coordinator.trace_prologue(sub_module) if param_coordinator.is_record_trace(): param_coordinator.record_module(sub_module) @@ -504,12 +501,12 @@ class DeepSpeedZeRoOffload(object): @torch.no_grad() def post_sub_module_backward_function(self, sub_module): + assert sub_module.training, "backward pass is invalid for module in evaluation mode" see_memory_usage( f"After sub module backward function {sub_module.__class__.__name__} {sub_module.id} before release", force=False) - self.get_param_coordinator( - training=sub_module.training).release_sub_module(sub_module) + self.get_param_coordinator(training=True).release_sub_module(sub_module) see_memory_usage( f"After sub module backward function {sub_module.__class__.__name__} {sub_module.id} after release", diff --git a/deepspeed/runtime/zero/partition_parameters.py b/deepspeed/runtime/zero/partition_parameters.py index 39cfb7f90202cc760fef64aefe3e0c054c6456e3..431454db0cc9b4c3b75bcd7938bbb6a3c93dddcb 100755 --- a/deepspeed/runtime/zero/partition_parameters.py +++ b/deepspeed/runtime/zero/partition_parameters.py @@ -1,7 +1,7 @@ -""" -"Copyright 2020 The Microsoft DeepSpeed Team. -Licensed under the MIT license. -""" +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team import math import os @@ -11,7 +11,6 @@ from enum import Enum import functools import itertools from typing import List - import torch from torch import Tensor from deepspeed import comm as dist @@ -22,28 +21,65 @@ from .linear import zero3_linear_wrap import deepspeed from ..utils import get_only_unique_item, see_memory_usage +from deepspeed.runtime.zero.config import DeepSpeedZeroConfig from deepspeed.runtime.zero.utils import assert_ints_same_as_other_ranks from deepspeed.runtime.zero.offload_config import OffloadDeviceEnum +from deepspeed.runtime.config_utils import get_config_default from deepspeed.utils import instrument_w_nvtx, logger from deepspeed.comm.comm import init_distributed -from deepspeed.utils.debug import (debug_param2name_id_shape, - debug_param2name_id_shape_device, - debug_module2name, - debug_param2name_id, - debug_param2name_id_shape_status) +from deepspeed.utils.debug import (debug_param2name_id_shape, debug_param2name_id_shape_device, debug_module2name, + debug_param2name_id, debug_param2name_id_shape_status) from deepspeed.accelerator import get_accelerator from ..swap_tensor.partitioned_param_swapper import AsyncPartitionedParameterSwapper, PartitionedParamStatus param_count = 0 partitioned_param_data_shape = [0] -zero_init_enabled = False +zero_init_context = [] +all_wrapped_classes = set() + + +class NoGatherHandle: + + def __init__(self, param: Parameter) -> None: + if param.ds_status != ZeroParamStatus.INFLIGHT: + raise RuntimeError(f"expected param {param.ds_summary()} to be available") + + param.data = param.ds_tensor.data.to(device=get_accelerator().current_device_name(), + non_blocking=True).view(param.ds_shape) + self.__param = param + + def wait(self) -> None: + get_accelerator().current_stream().synchronize() + self.__param.ds_status = ZeroParamStatus.AVAILABLE + + +class NoGatherCoalescedHandle: + + def __init__(self, params: List[Parameter]) -> None: + self.__params = params + 
self.__complete = False + + for param in self.__params: + if param.ds_status != ZeroParamStatus.INFLIGHT: + raise RuntimeError(f"expected param {param.ds_summary()} to not be available") + param.data = param.ds_tensor.data.to(device=get_accelerator().current_device_name(), + non_blocking=True).view(param.ds_shape) + + @instrument_w_nvtx + def wait(self) -> None: + if self.__complete: + return + + get_accelerator().current_stream().synchronize() + for param in self.__params: + assert param.ds_status == ZeroParamStatus.INFLIGHT, f"expected param {param.ds_summary()} to be inflight" + param.ds_status = ZeroParamStatus.AVAILABLE + + self.__complete = True def _dist_allgather_fn(input_tensor: Tensor, output_tensor: Tensor, group=None): - return instrument_w_nvtx(dist.allgather_fn)(output_tensor, - input_tensor, - group=group, - async_op=True) + return instrument_w_nvtx(dist.allgather_fn)(output_tensor, input_tensor, group=group, async_op=True) def print_rank_0(message, debug=False, force=False): @@ -76,9 +112,7 @@ def _init_external_params(module): return self._external_params.items() def all_parameters(self): - return itertools.chain(self.named_parameters(self, - recurse=False), - external_parameters(self)) + return itertools.chain(self.named_parameters(self, recurse=False), external_parameters(self)) module.ds_external_parameters = types.MethodType(external_parameters, module) module.all_parameters = types.MethodType(all_parameters, module) @@ -150,8 +184,7 @@ def unregister_external_parameter(module, parameter): if not isinstance(parameter, torch.nn.Parameter): raise RuntimeError('Parameter is not a torch.nn.Parameter') - if not hasattr(module, - '_external_params') or id(parameter) not in module._external_params: + if not hasattr(module, '_external_params') or id(parameter) not in module._external_params: raise RuntimeError('Parameter is not a registered external parameter of module.') key = id(parameter) @@ -188,12 +221,11 @@ _orig_torch_ones = torch.ones _orig_torch_full = torch.full -def zero_wrapper_for_fp_tensor_constructor(fn: Callable, - target_fp_dtype: torch.dtype) -> Callable: +def zero_wrapper_for_fp_tensor_constructor(fn: Callable, target_fp_dtype: torch.dtype) -> Callable: + def wrapped_fn(*args, **kwargs) -> Tensor: if kwargs.get("device", None) is None: - kwargs['device'] = torch.device(get_accelerator().device_name( - os.environ["LOCAL_RANK"])) + kwargs['device'] = torch.device(get_accelerator().device_name(os.environ["LOCAL_RANK"])) tensor: Tensor = fn(*args, **kwargs) if tensor.is_floating_point(): tensor = tensor.to(target_fp_dtype) @@ -204,6 +236,7 @@ def zero_wrapper_for_fp_tensor_constructor(fn: Callable, def get_new_tensor_fn_for_dtype(dtype: torch.dtype) -> Callable: + def new_tensor(cls, *args) -> Tensor: device = torch.device(get_accelerator().device_name(os.environ["LOCAL_RANK"])) tensor = _orig_torch_empty(0, device=device).new_empty(*args) @@ -250,21 +283,19 @@ empty_buffers = {} # Inserts _post_init_method at the end of init method # for all sub classes of torch.nn.Module class InsertPostInitMethodToModuleSubClasses(object): - def __init__(self, - enabled=True, - mem_efficient_linear=True, - ds_config=None, - dtype=None): + + def __init__(self, enabled=True, mem_efficient_linear=True, ds_config=None, dtype=None): self.mem_efficient_linear = mem_efficient_linear self.enabled = enabled self._set_dtype(ds_config, dtype) - assert self.dtype in [torch.half, torch.bfloat16, torch.float], f"Invalid data type {self.dtype}, allowed values are [torch.half, 
torch.bfloat16, torch.float]" + assert self.dtype in [ + torch.half, torch.bfloat16, torch.float + ], f"Invalid data type {self.dtype}, allowed values are [torch.half, torch.bfloat16, torch.float]" + self.wrapped_cls = set() def __enter__(self): - global zero_init_enabled if not self.enabled: return - zero_init_enabled = True def apply_with_gather(orig_module_apply_fn: Callable) -> Callable: """many models make use of child modules like Linear or Embedding which @@ -280,6 +311,7 @@ class InsertPostInitMethodToModuleSubClasses(object): to get around this issue, we wrap the function passed to Module.apply so that the applied function is applied to child modules correctly. """ + def get_wrapped_fn_to_apply(fn_to_apply: Callable) -> Callable: if hasattr(fn_to_apply, "wrapped"): return fn_to_apply @@ -296,19 +328,14 @@ class InsertPostInitMethodToModuleSubClasses(object): 3. broadcasts root rank's parameters to the other ranks 4. re-partitions the parameters """ - if not all( - is_zero_param(p) - for p in module_to_apply_fn_to.parameters(recurse=False)): - raise RuntimeError( - f"not all parameters for {module_to_apply_fn_to.__class__.__name__}, " - f"were zero params, is it possible that the parameters were " - f"overwritten after they were initialized? " - f"params: {[p for p in module_to_apply_fn_to.parameters(recurse=False)]} " - ) + if not all(is_zero_param(p) for p in module_to_apply_fn_to.parameters(recurse=False)): + raise RuntimeError(f"not all parameters for {module_to_apply_fn_to.__class__.__name__}, " + f"were zero params, is it possible that the parameters were " + f"overwritten after they were initialized? " + f"params: {[p for p in module_to_apply_fn_to.parameters(recurse=False)]} ") params_to_apply_fn_to: Iterable[Parameter] = list( - sorted(module_to_apply_fn_to.parameters(recurse=False), - key=lambda p: p.ds_id)) + sorted(module_to_apply_fn_to.parameters(recurse=False), key=lambda p: p.ds_id)) for param in params_to_apply_fn_to: param.all_gather() @@ -332,6 +359,7 @@ class InsertPostInitMethodToModuleSubClasses(object): return wrapped_apply def partition_after(f): + @functools.wraps(f) def wrapper(module, *args, **kwargs): @@ -343,8 +371,7 @@ class InsertPostInitMethodToModuleSubClasses(object): # custom weights init function. 
So if a parent created the weights param, the child # won't need to gather it in order to tweak it - print_rank_0(f'Before initializing {module.__class__.__name__}', - force=False) + print_rank_0(f'Before initializing {module.__class__.__name__}', force=False) is_child_module = False if not hasattr(module, "_ds_child_entered"): @@ -358,13 +385,10 @@ class InsertPostInitMethodToModuleSubClasses(object): # child's __init__ is done, now we can run a single post_init on the child object delattr(module, "_ds_child_entered") - print_rank_0(f'Running post_init for {module.__class__.__name__}', - force=False) + print_rank_0(f'Running post_init for {module.__class__.__name__}', force=False) self._post_init_method(module) - print_rank_0( - f'After initializing followed by post init for {module.__class__.__name__}', - force=False) + print_rank_0(f'After initializing followed by post init for {module.__class__.__name__}', force=False) return wrapper @@ -376,44 +400,58 @@ class InsertPostInitMethodToModuleSubClasses(object): cls.__init__ = partition_after(cls.__init__) # Replace .__init__() for all existing subclasses of torch.nn.Module recursively + global zero_init_context + self.nest_level = len(zero_init_context) + + global all_wrapped_classes for subclass in get_all_subclasses(torch.nn.modules.module.Module): - # print(f"subclass={subclass.__module__}.{subclass.__qualname__}") - _enable_class(subclass) - - # holding onto some methods so we can put them back the way they were in __exit__ - torch.nn.modules.module.Module._old_init_subclass = torch.nn.modules.module.Module.__init_subclass__ - torch.nn.modules.module.Module._old_apply = torch.nn.modules.module.Module.apply - torch.Tensor.__old_new__ = torch.Tensor.__new__ - - # Replace .__init__() for future subclasses of torch.nn.Module - torch.nn.modules.module.Module.__init_subclass__ = classmethod(_init_subclass) - torch.nn.modules.module.Module.apply = apply_with_gather( - torch.nn.modules.module.Module._old_apply) - - torch.Tensor.__new__ = get_new_tensor_fn_for_dtype(self.dtype) - torch.empty = zero_wrapper_for_fp_tensor_constructor(_orig_torch_empty, - self.dtype) - torch.zeros = zero_wrapper_for_fp_tensor_constructor(_orig_torch_zeros, - self.dtype) - torch.ones = zero_wrapper_for_fp_tensor_constructor(_orig_torch_ones, self.dtype) - torch.full = zero_wrapper_for_fp_tensor_constructor(_orig_torch_full, self.dtype) - - if self.mem_efficient_linear: - print_rank_0( - "nn.functional.linear has been overridden with a more memory efficient version. 
This will persist unless manually reset.", - force=False) - self.linear_bk = torch.nn.functional.linear - torch.nn.functional.linear = zero3_linear_wrap + # Only wrap classes that haven't been wrapped yet + if subclass not in all_wrapped_classes: + _enable_class(subclass) + self.wrapped_cls.add(subclass) + + all_wrapped_classes = all_wrapped_classes.union(self.wrapped_cls) + + # Wrap some functions only at top level call of Init + if self.nest_level == 0: + # holding onto some methods so we can put them back the way they were in __exit__ + torch.nn.modules.module.Module._old_init_subclass = torch.nn.modules.module.Module.__init_subclass__ + torch.nn.modules.module.Module._old_apply = torch.nn.modules.module.Module.apply + torch.Tensor.__old_new__ = torch.Tensor.__new__ + + # Replace .__init__() for future subclasses of torch.nn.Module + torch.nn.modules.module.Module.__init_subclass__ = classmethod(_init_subclass) + torch.nn.modules.module.Module.apply = apply_with_gather(torch.nn.modules.module.Module._old_apply) + + torch.Tensor.__new__ = get_new_tensor_fn_for_dtype(self.dtype) + torch.empty = zero_wrapper_for_fp_tensor_constructor(_orig_torch_empty, self.dtype) + torch.zeros = zero_wrapper_for_fp_tensor_constructor(_orig_torch_zeros, self.dtype) + torch.ones = zero_wrapper_for_fp_tensor_constructor(_orig_torch_ones, self.dtype) + torch.full = zero_wrapper_for_fp_tensor_constructor(_orig_torch_full, self.dtype) + + if self.mem_efficient_linear: + print_rank_0( + "nn.functional.linear has been overridden with a more memory efficient version. This will persist unless manually reset.", + force=False) + self.linear_bk = torch.nn.functional.linear + torch.nn.functional.linear = zero3_linear_wrap + + self.torch_func_wrapped = True + + zero_init_context.append(self) def __exit__(self, exc_type, exc_value, traceback): if not self.enabled: return - shutdown_init_context() + self.remove_wrappers() - if dist.get_rank() == 0: - logger.info("finished initializing model with %.2fB parameters", - param_count / 1e9) + # Exiting the top level context + global zero_init_context + zero_init_context.pop() + if self.nest_level == 0: + if dist.get_rank() == 0: + logger.info("finished initializing model with %.2fB parameters", param_count / 1e9) # Now that we cleaned up the metaclass injection, raise the exception. 
if exc_type is not None: @@ -437,53 +475,69 @@ class InsertPostInitMethodToModuleSubClasses(object): else: self.dtype = dtype or torch.half + def remove_wrappers(self): -def shutdown_init_context(): - global zero_init_enabled + def _disable_class(cls): + cls.__init__ = cls._old_init - if not zero_init_enabled: - return + for subclass in self.wrapped_cls: + _disable_class(subclass) + self.wrapped_cls.clear() - def _disable_class(cls): - cls.__init__ = cls._old_init + # This context is the top level of nested Init + if self.nest_level == 0 and self.torch_func_wrapped: + # putting methods back the way we found them + torch.nn.modules.module.Module.__init_subclass__ = torch.nn.modules.module.Module._old_init_subclass + torch.nn.modules.module.Module.apply = torch.nn.modules.module.Module._old_apply - # Replace .__init__() for all existing subclasses of torch.nn.Module - for subclass in get_all_subclasses(torch.nn.modules.module.Module): - _disable_class(subclass) + torch.Tensor.__new__ = torch.Tensor.__old_new__ + torch.empty = _orig_torch_empty + torch.zeros = _orig_torch_zeros + torch.ones = _orig_torch_ones + torch.full = _orig_torch_full - # putting methods back the way we found them - torch.nn.modules.module.Module.__init_subclass__ = torch.nn.modules.module.Module._old_init_subclass - torch.nn.modules.module.Module.apply = torch.nn.modules.module.Module._old_apply + # un doing it here will undo it during training + # if self.mem_efficient_linear: + # torch.nn.functional.linear = self.linear_bk + # if self.mem_efficient_linear: + # torch.nn.functional.linear = self.linear_bk - torch.Tensor.__new__ = torch.Tensor.__old_new__ - torch.empty = _orig_torch_empty - torch.zeros = _orig_torch_zeros - torch.ones = _orig_torch_ones - torch.full = _orig_torch_full + self.torch_func_wrapped = False - # un doing it here will undo it during training - # if self.mem_efficient_linear: - # torch.nn.functional.linear = self.linear_bk - # if self.mem_efficient_linear: - # torch.nn.functional.linear = self.linear_bk + global all_wrapped_classes + for subclass in get_all_subclasses(torch.nn.modules.module.Module): + if subclass not in all_wrapped_classes: + msg = f"`{subclass}' was not properly set up for sharding by zero.Init(). A subclass of torch.nn.Module must be defined before zero.Init() where an instance of the class is created." + raise RuntimeError(msg) + all_wrapped_classes.clear() - zero_init_enabled = False + +def shutdown_init_context(): + """ + This function is used to initialize deepspeed engine inside the context of Init. + We need to remove the wrappers but keep the list of contexts. 
+ """ + global zero_init_context + for ctx in zero_init_context: + ctx.remove_wrappers() class AllGatherHandle: + def __init__(self, handle, param: Parameter) -> None: if param.ds_status != ZeroParamStatus.INFLIGHT: raise RuntimeError(f"expected param {param.ds_summary()} to be available") - self.__handle = handle - self.__param = param + self.handle = handle + self.param = param def wait(self) -> None: - instrument_w_nvtx(self.__handle.wait)() - self.__param.ds_status = ZeroParamStatus.AVAILABLE + instrument_w_nvtx(self.handle.wait)() + self.param.ds_status = ZeroParamStatus.AVAILABLE class AllGatherCoalescedHandle: + def __init__( self, allgather_handle, @@ -491,39 +545,36 @@ class AllGatherCoalescedHandle: partitions: List[Tensor], world_size: int, ) -> None: - self.__allgather_handle = allgather_handle - self.__params = params - self.__partitions = partitions - self.__world_size = world_size - self.__complete = False - - for param in self.__params: + # renaming the fields without double underscore to ease + # the class inheritance + self.allgather_handle = allgather_handle + self.params = params + self.partitions = partitions + self.world_size = world_size + self.complete = False + + for param in self.params: if param.ds_status != ZeroParamStatus.INFLIGHT: - raise RuntimeError( - f"expected param {param.ds_summary()} to not be available") + raise RuntimeError(f"expected param {param.ds_summary()} to not be available") @instrument_w_nvtx def wait(self) -> None: - if self.__complete: + if self.complete: return - instrument_w_nvtx(self.__allgather_handle.wait)() + instrument_w_nvtx(self.allgather_handle.wait)() # split the single tensor out into individual tensors param_offset = 0 - for param in self.__params: + for param in self.params: assert param.ds_status == ZeroParamStatus.INFLIGHT, f"expected param {param.ds_summary()} to be inflight" partitions: List[Tensor] = [] - for rank in range(self.__world_size): + for rank in range(self.world_size): param_start = rank * param.ds_tensor.ds_numel if param_start < param.ds_numel: - part_to_copy = self.__partitions[rank].narrow( - 0, - param_offset, - min(param.ds_numel - param_start, - param.ds_tensor.ds_numel)) + part_to_copy = self.partitions[rank].narrow( + 0, param_offset, min(param.ds_numel - param_start, param.ds_tensor.ds_numel)) partitions.append(part_to_copy) - param.data = instrument_w_nvtx(torch.cat)(partitions).view(param.ds_shape) param.ds_status = ZeroParamStatus.AVAILABLE @@ -532,12 +583,30 @@ class AllGatherCoalescedHandle: param_offset += param.ds_tensor.ds_numel - self.__complete = True + self.complete = True + + +def _no_gather_coalesced(params: Iterable[Parameter]) -> AllGatherCoalescedHandle: + for param in params: + if param.ds_status != ZeroParamStatus.NOT_AVAILABLE: + raise RuntimeError(param.ds_summary()) + param.ds_status = ZeroParamStatus.INFLIGHT + + params = sorted(params, key=lambda p: p.ds_id) + if len(params) == 1: + param, = params + return NoGatherHandle(param) + return NoGatherCoalescedHandle(params) # Replaces all parameters in module with Scattered Parameters class Init(InsertPostInitMethodToModuleSubClasses): param_id = 0 + param_persistence_threshold = get_config_default(DeepSpeedZeroConfig, "param_persistence_threshold") + model_persistence_threshold = get_config_default(DeepSpeedZeroConfig, "model_persistence_threshold") + num_persisted_parameters = 0 + num_persisted_elements = 0 + apply_param_persistence = False def __init__(self, module=None, @@ -651,16 +720,12 @@ class 
Init(InsertPostInitMethodToModuleSubClasses): if config is not None: config_dict_or_path = config logger.warning( - f'zero.Init: the `config` argument is deprecated. Please use `config_dict_or_path` instead.' - ) - - _ds_config = deepspeed.runtime.config.DeepSpeedConfig( - config_dict_or_path, - mpu) if config_dict_or_path is not None else None - super().__init__(enabled=enabled, - mem_efficient_linear=mem_efficient_linear, - ds_config=_ds_config, - dtype=dtype) + f'zero.Init: the `config` argument is deprecated. Please use `config_dict_or_path` instead.') + _ds_config = deepspeed.runtime.config.DeepSpeedConfig(config_dict_or_path, + mpu) if config_dict_or_path is not None else None + if _ds_config is not None: + mem_efficient_linear = _ds_config.zero_config.memory_efficient_linear + super().__init__(enabled=enabled, mem_efficient_linear=mem_efficient_linear, ds_config=_ds_config, dtype=dtype) if not dist.is_initialized(): init_distributed() assert dist.is_initialized(), "Parameters cannot be scattered without initializing deepspeed.comm" @@ -670,29 +735,27 @@ class Init(InsertPostInitMethodToModuleSubClasses): self.ds_process_group = data_parallel_group self.rank = dist.get_rank(group=self.ds_process_group) - self.world_size = dist.get_world_size(group=self.ds_process_group) + self.dp_world_size = dist.get_world_size(group=self.ds_process_group) # Local device is the device where the parameters are consumed, must be default device. # It is the device where parameters are fully instantiated using allgather - self.local_device = torch.device(get_accelerator().device_name( - os.environ["LOCAL_RANK"])) + self.local_device = torch.device(get_accelerator().device_name(os.environ["LOCAL_RANK"])) get_accelerator().set_device(self.local_device) - if _ds_config is not None and _ds_config.zero_config.offload_param is not None: - remote_device = _ds_config.zero_config.offload_param.device - pin_memory = _ds_config.zero_config.offload_param.pin_memory + if _ds_config is not None: + self._update_persist_config(_ds_config) + + if _ds_config.zero_config.offload_param is not None: + remote_device = _ds_config.zero_config.offload_param.device + pin_memory = _ds_config.zero_config.offload_param.pin_memory self._validate_remote_device(remote_device, _ds_config) # Remote device is the device where parameter partitions are stored # It can be same as local_device or it could be CPU or NVMe. 
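# A minimal usage sketch (assumptions: a distributed launch such as the deepspeed
# launcher that sets LOCAL_RANK, and an illustrative config dict). It shows how the
# constructor arguments handled above are typically supplied: `config_dict_or_path`
# carries the ZeRO config, and the zero_optimization.offload_param block is what
# feeds remote_device and pin_memory in Init.__init__.
import torch
import deepspeed

ds_config = {
    "zero_optimization": {
        "stage": 3,
        # partitions are kept on this device between uses: "cpu", "nvme", or "none"
        "offload_param": {"device": "cpu", "pin_memory": True},
    },
}

with deepspeed.zero.Init(config_dict_or_path=ds_config, dtype=torch.half):
    # every parameter created inside the context is converted to a partitioned
    # (scattered) parameter as soon as its owning module finishes __init__
    model = torch.nn.Linear(4096, 4096)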
- self.remote_device = self.local_device if remote_device in [ - None, - OffloadDeviceEnum.none - ] else remote_device - self.pin_memory = pin_memory if ( - self.remote_device in [OffloadDeviceEnum.cpu, - OffloadDeviceEnum.nvme]) else False + self.remote_device = self.local_device if remote_device in [None, OffloadDeviceEnum.none] else remote_device + self.pin_memory = pin_memory if (self.remote_device in [OffloadDeviceEnum.cpu, OffloadDeviceEnum.nvme + ]) else False # Enable fp16 param swapping to NVMe if self.remote_device == OffloadDeviceEnum.nvme: @@ -705,12 +768,14 @@ class Init(InsertPostInitMethodToModuleSubClasses): assert isinstance(module, torch.nn.Module) self._convert_to_zero_parameters(module.parameters(recurse=True)) - self.use_all_gather_base = False - if dist.has_allgather_base(): - self.use_all_gather_base = True - else: - logger.info( - f"_all_gather_base API is not available in torch {torch.__version__}") + self.use_all_gather_into_tensor = dist.has_all_gather_into_tensor() + if not self.use_all_gather_into_tensor: + logger.info(f"all_gather_into_tensor API is not available in torch {torch.__version__}") + + def _update_persist_config(self, ds_config): + Init.apply_param_persistence = True + Init.param_persistence_threshold = ds_config.zero_config.param_persistence_threshold + Init.model_persistence_threshold = ds_config.zero_config.model_persistence_threshold // self.num_partitions def _convert_to_zero_parameters(self, param_list): for param in param_list: @@ -737,9 +802,7 @@ class Init(InsertPostInitMethodToModuleSubClasses): def _post_init_method(self, module): #see_memory_usage(f"Before converting parmas in {module.__class__.__name__}", force=False) print_rank_0(f'Converting Params in {module.__class__.__name__}', force=False) - see_memory_usage( - f"Before converting and partitioning parmas in {module.__class__.__name__}", - force=False) + see_memory_usage(f"Before converting and partitioning parmas in {module.__class__.__name__}", force=False) global param_count for name, param in module.named_parameters(recurse=False): @@ -747,11 +810,10 @@ class Init(InsertPostInitMethodToModuleSubClasses): if not is_zero_param(param): self._convert_to_deepspeed_param(param) print_rank_0( - f"Partitioning param {debug_param2name_id_shape(param)} module={debug_module2name(module)}" - ) + f"Partitioning param {debug_param2name_id_shape(param)} module={debug_module2name(module)}") if get_accelerator().on_accelerator(param): - dist.broadcast(param, 0, self.ds_process_group) + dist.broadcast(param, 0, self.get_dp_process_group()) else: if dist.get_rank() == 0: logger.warn(f"param `{name}` in {module.__class__.__name__} " @@ -784,7 +846,12 @@ class Init(InsertPostInitMethodToModuleSubClasses): # If this flag is true, then the parameters are replicated throughput training # And only partitioned before the step - param.ds_persist = False + if Init.apply_param_persistence and param.ds_numel <= Init.param_persistence_threshold and Init.num_persisted_elements + param.ds_numel <= Init.model_persistence_threshold: + param.ds_persist = True + Init.num_persisted_parameters += 1 + Init.num_persisted_elements += param.ds_numel + else: + param.ds_persist = False param.is_external_param = False @@ -806,12 +873,14 @@ class Init(InsertPostInitMethodToModuleSubClasses): return self._all_gather(param_list, async_op=async_op, hierarchy=hierarchy) @instrument_w_nvtx - def all_gather_coalesced(params: Iterable[Parameter], - safe_mode: bool = False) -> AllGatherCoalescedHandle: + def 
all_gather_coalesced(params: Iterable[Parameter], safe_mode: bool = False) -> AllGatherCoalescedHandle: # fetches from nvme if the partition is not available and in nvme self._ensure_availability_of_partitioned_params(params) + if self.num_partitions == 1: + return _no_gather_coalesced(params) + for param in params: if param.ds_status != ZeroParamStatus.NOT_AVAILABLE: raise RuntimeError(param.ds_summary()) @@ -840,55 +909,40 @@ class Init(InsertPostInitMethodToModuleSubClasses): # have an opportunity to avoid some intermediate memory allocations param, = params param_buffer = torch.empty( - math.ceil(param.ds_numel / self.world_size) * self.world_size, + math.ceil(param.ds_numel / self.num_partitions) * self.num_partitions, dtype=param.dtype, device=get_accelerator().current_device_name(), requires_grad=False, ) - handle = _dist_allgather_fn( - param.ds_tensor.to(get_accelerator().current_device_name()), - param_buffer, - self.ds_process_group) - param.data = param_buffer.narrow(0, - 0, - param.ds_numel).view(param.ds_shape).to( - param.device) + handle = _dist_allgather_fn(param.ds_tensor.to(get_accelerator().current_device_name()), param_buffer, + self.get_partition_dp_group(param)) + param.data = param_buffer.narrow(0, 0, param.ds_numel).view(param.ds_shape).to(param.device) return AllGatherHandle(handle, param) else: partition_sz = sum(p.ds_tensor.ds_numel for p in params) - flat_tensor = torch.empty(partition_sz * self.world_size, - dtype=get_only_unique_item(p.dtype - for p in params), + flat_tensor = torch.empty(partition_sz * self.num_partitions, + dtype=get_only_unique_item(p.dtype for p in params), device=get_accelerator().current_device_name(), requires_grad=False) partitions: List[Parameter] = [] - for i in range(self.world_size): - partitions.append( - flat_tensor.narrow(0, - partition_sz * i, - partition_sz)) - - instrument_w_nvtx(torch.cat)([ - p.ds_tensor.to(get_accelerator().current_device_name()) - for p in params - ], - out=partitions[self.rank]) - handle = _dist_allgather_fn(partitions[self.rank], - flat_tensor, - self.ds_process_group) + for i in range(self.num_partitions): + partitions.append(flat_tensor.narrow(0, partition_sz * i, partition_sz)) + + instrument_w_nvtx(torch.cat)([p.ds_tensor.to(get_accelerator().current_device_name()) for p in params], + out=partitions[self.get_partition_rank()]) + handle = _dist_allgather_fn(partitions[self.get_partition_rank()], flat_tensor, + self.get_partition_dp_group(params[0])) return AllGatherCoalescedHandle( allgather_handle=handle, params=params, partitions=partitions, - world_size=self.world_size, + world_size=self.num_partitions, ) def partition(param_list=None, hierarchy=0, has_been_updated=False): cls = param - print_rank_0( - f"{'--'*hierarchy}----Partitioning param {debug_param2name_id_shape_device(cls)}" - ) + print_rank_0(f"{'--'*hierarchy}----Partitioning param {debug_param2name_id_shape_device(cls)}") if param_list is None: param_list = [cls] self._partition(param_list, has_been_updated=has_been_updated) @@ -902,22 +956,16 @@ class Init(InsertPostInitMethodToModuleSubClasses): ) self._reduce_scatter_gradients(param_list) - def partition_gradients(param_list=None, - partition_buffers=None, - hierarchy=0, - accumulate=False): + def partition_gradients(param_list=None, partition_buffers=None, hierarchy=0, accumulate=False): cls = param print_rank_0( - f"{'--'*hierarchy}----Partitioning param gradient with id {debug_param2name_id_shape_device(cls)}" - ) + f"{'--'*hierarchy}----Partitioning param gradient with id 
{debug_param2name_id_shape_device(cls)}") if param_list is None: param_list = [cls] if isinstance(partition_buffers, torch.Tensor): partition_buffers = [partition_buffers] - self._partition_gradients(param_list, - partition_buffers=partition_buffers, - accumulate=accumulate) + self._partition_gradients(param_list, partition_buffers=partition_buffers, accumulate=accumulate) def aligned_size(): return self._aligned_size(param) @@ -944,12 +992,14 @@ class Init(InsertPostInitMethodToModuleSubClasses): "grad_shape": tuple(slf.grad.shape) if slf.grad is not None else None, "persist": slf.ds_persist, "active_sub_modules": slf.ds_active_sub_modules, + "ds_tensor.shape": slf.ds_tensor.shape if slf.ds_tensor is not None else None } def convert_to_zero_parameters(param_list): self._convert_to_zero_parameters(param_list) def allgather_before(func: Callable) -> Callable: + def wrapped(*args, **kwargs): param.all_gather() return func(*args, **kwargs) @@ -979,8 +1029,8 @@ class Init(InsertPostInitMethodToModuleSubClasses): return param.ds_numel + self._padding_size(param) def _padding_size(self, param): - remainder = param.ds_numel % self.world_size - return (self.world_size - remainder) if remainder else 0 + remainder = param.ds_numel % self.num_partitions + return (self.num_partitions - remainder) if remainder else 0 def _partition_numel(self, param): return param.ds_tensor.ds_numel @@ -1011,9 +1061,7 @@ class Init(InsertPostInitMethodToModuleSubClasses): for param in param_list: if param.ds_status == ZeroParamStatus.NOT_AVAILABLE: if async_op: - handle = self._allgather_param(param, - async_op=async_op, - hierarchy=hierarchy) + handle = self._allgather_param(param, async_op=async_op, hierarchy=hierarchy) param.ds_status = ZeroParamStatus.INFLIGHT # if async_op else ZeroParamStatus.AVAILABLE handles.append(handle) else: @@ -1050,9 +1098,7 @@ class Init(InsertPostInitMethodToModuleSubClasses): global reuse_buffers #print_rank_0(f"Param id {param.ds_id} status is {param.ds_status}") if param.ds_status is ZeroParamStatus.AVAILABLE: - print_rank_0( - f"Partitioning param id {param.ds_id} reuse buffers {reuse_buffers}", - force=False) + print_rank_0(f"Partitioning param id {param.ds_id} reuse buffers {reuse_buffers}", force=False) # if reuse_buffers and False: # numel = buffer.numel() # buffer = param.data.view(-1) @@ -1068,24 +1114,19 @@ class Init(InsertPostInitMethodToModuleSubClasses): #param.data = param.ds_tensor.data - see_memory_usage( - f'Before partitioning param {param.ds_id} {param.shape}', - force=False) + see_memory_usage(f'Before partitioning param {param.ds_id} {param.shape}', force=False) # param.data does not store anything meaningful in partitioned state free_param(param) - see_memory_usage(f'After partitioning param {param.ds_id} {param.shape}', - force=False) + see_memory_usage(f'After partitioning param {param.ds_id} {param.shape}', force=False) if param.ds_tensor.final_location == OffloadDeviceEnum.nvme: - print_rank_0( - f"Param {param.ds_id} partition released since it exists in nvme", - force=False) + print_rank_0(f"Param {param.ds_id} partition released since it exists in nvme", force=False) param.nvme_swapper.remove_partition_and_release_buffers([param]) return tensor_size = self._aligned_size(param) - partition_size = tensor_size // self.world_size + partition_size = tensor_size // self.num_partitions if param.ds_tensor is None: final_location = None @@ -1093,23 +1134,22 @@ class Init(InsertPostInitMethodToModuleSubClasses): numel=partition_size): final_location = 
OffloadDeviceEnum.nvme buffer = self.param_swapper.get_buffer(param, partition_size) - partitioned_tensor = torch.empty(0, - dtype=param.dtype, - device=buffer.device) + partitioned_tensor = torch.empty(0, dtype=param.dtype, device=buffer.device) partitioned_tensor.data = buffer.data - print_rank_0( - f"ID {param.ds_id} Initializing partition for the first time for nvme offload." - ) + print_rank_0(f"ID {param.ds_id} Initializing partition for the first time for nvme offload.") else: - partitioned_tensor = torch.empty( - partition_size, - dtype=param.dtype, - device=OffloadDeviceEnum.cpu if self.remote_device - == OffloadDeviceEnum.nvme else self.remote_device) - if self.pin_memory: - partitioned_tensor = get_accelerator().pin_memory( - partitioned_tensor) + if param.ds_persist: + device = self.local_device + elif self.remote_device == OffloadDeviceEnum.nvme: + device = OffloadDeviceEnum.cpu + else: + device = self.remote_device + + partitioned_tensor = torch.empty(partition_size, dtype=param.dtype, device=device) + + if device == OffloadDeviceEnum.cpu and self.pin_memory: + partitioned_tensor = get_accelerator().pin_memory(partitioned_tensor) partitioned_tensor.requires_grad = False param.ds_tensor = partitioned_tensor @@ -1117,7 +1157,7 @@ class Init(InsertPostInitMethodToModuleSubClasses): param.ds_tensor.status = PartitionedParamStatus.AVAILABLE param.ds_tensor.final_location = final_location - start = partition_size * self.rank + start = partition_size * self.get_partition_rank() end = start + partition_size one_dim_param = param.contiguous().view(-1) @@ -1135,13 +1175,8 @@ class Init(InsertPostInitMethodToModuleSubClasses): if start < param.ds_numel: elements_to_copy = param.ds_numel - start - param.ds_tensor.narrow(0, - 0, - elements_to_copy).copy_( - one_dim_param.narrow( - 0, - start, - elements_to_copy)) + param.ds_tensor.narrow(0, 0, + elements_to_copy).copy_(one_dim_param.narrow(0, start, elements_to_copy)) #print(f"Remote device {self.remote_device}") @@ -1151,23 +1186,16 @@ class Init(InsertPostInitMethodToModuleSubClasses): # param.data does not store anything meaningful in partitioned state - see_memory_usage(f'Before partitioning param {param.ds_id} {param.shape}', - force=False) + see_memory_usage(f'Before partitioning param {param.ds_id} {param.shape}', force=False) free_param(param) - see_memory_usage(f'After partitioning param {param.ds_id} {param.shape}', - force=False) + see_memory_usage(f'After partitioning param {param.ds_id} {param.shape}', force=False) if param.ds_tensor.final_location == OffloadDeviceEnum.nvme: self.param_swapper.swap_out_and_release([param]) - print_rank_0( - f"ID {param.ds_id} Offloaded to nvme offload and buffers released.") - see_memory_usage( - f"ID {param.ds_id} Offloaded to nvme offload and buffers released.", - force=False) + print_rank_0(f"ID {param.ds_id} Offloaded to nvme offload and buffers released.") + see_memory_usage(f"ID {param.ds_id} Offloaded to nvme offload and buffers released.", force=False) - print_rank_0( - f"ID {param.ds_id} partitioned type {param.dtype} dev {param.device} shape {param.shape}" - ) + print_rank_0(f"ID {param.ds_id} partitioned type {param.dtype} dev {param.device} shape {param.shape}") def _param_status(self, param): if param.ds_tensor is not None: @@ -1183,7 +1211,7 @@ class Init(InsertPostInitMethodToModuleSubClasses): partition_size = param.ds_tensor.ds_numel - tensor_size = partition_size * self.world_size + tensor_size = partition_size * self.num_partitions aligned_param_size = 
self._aligned_size(param) assert tensor_size == aligned_param_size, f'param id {param.ds_id} aligned size {aligned_param_size} does not match tensor size {tensor_size}' @@ -1194,9 +1222,7 @@ class Init(InsertPostInitMethodToModuleSubClasses): see_memory_usage( f'Before allocate allgather param {debug_param2name_id_shape_status(param)} partition_size={partition_size} ', force=False) - flat_tensor = torch.zeros(aligned_param_size, - dtype=param.dtype, - device=param.device).view(-1) + flat_tensor = torch.zeros(aligned_param_size, dtype=param.dtype, device=param.device).view(-1) see_memory_usage( f'After allocate allgather param {debug_param2name_id_shape_status(param)} {aligned_param_size} {partition_size} ', force=False) @@ -1212,27 +1238,22 @@ class Init(InsertPostInitMethodToModuleSubClasses): # param.ds_numel).view(param.ds_shape) # param.data = replicated_tensor.data # return None - if self.use_all_gather_base: - # try the _all_gather_base on PyTorch master branch - handle = dist.all_gather_base(flat_tensor, - param.ds_tensor.to( - get_accelerator().device_name()), - group=self.ds_process_group, - async_op=async_op) + if self.use_all_gather_into_tensor: + handle = dist.all_gather_into_tensor(flat_tensor, + param.ds_tensor.to(get_accelerator().device_name()), + group=self.get_partition_dp_group(param), + async_op=async_op) else: partitions = [] - for i in range(self.world_size): - partitions.append( - flat_tensor.narrow(0, - partition_size * i, - partition_size)) + for i in range(self.num_partitions): + partitions.append(flat_tensor.narrow(0, partition_size * i, partition_size)) - if i == dist.get_rank(group=self.ds_process_group): + if i == dist.get_rank(group=self.get_partition_dp_group(param)): partitions[i].data.copy_(param.ds_tensor.data, non_blocking=True) handle = dist.all_gather(partitions, - partitions[self.rank], - group=self.ds_process_group, + partitions[self.get_partition_rank()], + group=self.get_partition_dp_group(param), async_op=async_op) replicated_tensor = flat_tensor.narrow(0, 0, param.ds_numel).view(param.ds_shape) @@ -1245,6 +1266,12 @@ class Init(InsertPostInitMethodToModuleSubClasses): """ if len(param_list) == 0: return + + if self.num_partitions == 1: + handle = _no_gather_coalesced(param_list) + handle.wait() + return None + # collect local tensors and partition sizes partition_sizes = [] local_tensors = [] @@ -1255,42 +1282,34 @@ class Init(InsertPostInitMethodToModuleSubClasses): # allocate memory for allgather params allgather_params = [] for psize in partition_sizes: - tensor_size = psize * self.world_size - flat_tensor = torch.empty(tensor_size, - dtype=param_list[0].dtype, - device=self.local_device).view(-1) + tensor_size = psize * self.num_partitions + flat_tensor = torch.empty(tensor_size, dtype=param_list[0].dtype, device=self.local_device).view(-1) flat_tensor.requires_grad = False allgather_params.append(flat_tensor) # launch launch_handles = [] - # backend = get_backend(self.ds_process_group) - # with _batch_p2p_manager(backend): for param_idx, param in enumerate(param_list): input_tensor = local_tensors[param_idx].view(-1) - if self.use_all_gather_base: + if self.use_all_gather_into_tensor: # try the _all_gather_base from Pytorch master - h = dist.all_gather_base(allgather_params[param_idx], - input_tensor, - group=self.ds_process_group, - async_op=True) + h = dist.all_gather_into_tensor(allgather_params[param_idx], + input_tensor, + group=self.get_partition_dp_group(param), + async_op=True) else: output_list = [] - for i in 
range(self.world_size): + for i in range(self.num_partitions): psize = partition_sizes[param_idx] partition = allgather_params[param_idx].narrow(0, i * psize, psize) output_list.append(partition) if not get_accelerator().on_accelerator(partition): logger.warning( - f'param {param_idx}, partition {i} is not on CUDA, partition shape {partition.size()}' - ) - - # back to old all_gather function signature - h = dist.all_gather(output_list, - input_tensor, - group=self.ds_process_group, - async_op=True) + f'param {param_idx}, partition {i} is not on CUDA, partition shape {partition.size()}') + + # back to old all_gather function + h = dist.all_gather(output_list, input_tensor, group=self.get_partition_dp_group(param), async_op=True) launch_handles.append(h) # Wait ensures the operation is enqueued, but not necessarily complete. @@ -1299,9 +1318,7 @@ class Init(InsertPostInitMethodToModuleSubClasses): # assign to param.data (not copy) for i, param in enumerate(param_list): gathered_tensor = allgather_params[i] - param.data = gathered_tensor.narrow(0, - 0, - param.ds_numel).view(param.ds_shape).data + param.data = gathered_tensor.narrow(0, 0, param.ds_numel).view(param.ds_shape).data # guarantee the communication to be completed get_accelerator().synchronize() @@ -1314,42 +1331,36 @@ class Init(InsertPostInitMethodToModuleSubClasses): partition_size = sum([param.ds_tensor.ds_numel for param in param_list]) - tensor_size = partition_size * self.world_size - flat_tensor = torch.empty(tensor_size, - dtype=param_list[0].dtype, - device=self.local_device) + tensor_size = partition_size * self.num_partitions + flat_tensor = torch.empty(tensor_size, dtype=param_list[0].dtype, device=self.local_device) flat_tensor.requires_grad = False partitions = [] - for i in range(self.world_size): + for i in range(self.num_partitions): start = partition_size * i partitions.append(flat_tensor.narrow(0, start, partition_size)) - if i == self.rank: + if i == self.get_partition_rank(): offset = 0 for param in param_list: param_numel = param.ds_tensor.ds_numel - partitions[i].narrow(0, - offset, - param_numel).copy_(param.ds_tensor.data) + partitions[i].narrow(0, offset, param_numel).copy_(param.ds_tensor.data) offset += param_numel dist.all_gather(partitions, - partitions[self.rank], - group=self.ds_process_group, + partitions[self.get_partition_rank()], + group=self.get_partition_dp_group(param), async_op=False) param_offset = 0 for param in param_list: param_partition_size = param.ds_tensor.ds_numel param_size = param.ds_numel - replicated_tensor = torch.empty(param.ds_shape, - dtype=param.dtype, - device=self.local_device) + replicated_tensor = torch.empty(param.ds_shape, dtype=param.dtype, device=self.local_device) - for i in range(self.world_size): + for i in range(self.num_partitions): start = i * partition_size @@ -1360,9 +1371,7 @@ class Init(InsertPostInitMethodToModuleSubClasses): part_to_copy = partitions[i].narrow(0, param_offset, numel_to_copy) - replicated_tensor.view(-1).narrow(0, - param_start, - numel_to_copy).copy_(part_to_copy) + replicated_tensor.view(-1).narrow(0, param_start, numel_to_copy).copy_(part_to_copy) #param_offset += param.data.numel() param_offset += param.ds_tensor.ds_numel @@ -1389,27 +1398,22 @@ class Init(InsertPostInitMethodToModuleSubClasses): # For these ranks the output of reduce scatter is a separate buffer and needs # to be copied in partition_size = param.ds_tensor.ds_numel - start = self.rank * partition_size + start = self.get_partition_rank() * partition_size end = start + 
partition_size #print_rank_0("REduce scatter was executed for praam {param.ds_id}") if start < param.ds_numel and end > param.ds_numel: elements = param.ds_numel - start - param.grad.view(-1).narrow(0, - start, - elements).copy_( - reduced_partition.narrow(0, - 0, - elements)) + param.grad.view(-1).narrow(0, start, elements).copy_(reduced_partition.narrow(0, 0, elements)) def _reduce_scatter_gradient(self, param): partition_size = param.ds_tensor.ds_numel #output = torch.empty(partition_size, dtype=param.dtype, device=param.device) - total_size = partition_size * self.world_size + total_size = partition_size * self.num_partitions input_list = [] - for i in range(self.world_size): + for i in range(self.num_partitions): start = i * partition_size end = start + partition_size @@ -1418,25 +1422,18 @@ class Init(InsertPostInitMethodToModuleSubClasses): if start < param.ds_numel and end <= param.ds_numel: input = param.grad.view(-1).narrow(0, start, partition_size) else: - input = torch.zeros(partition_size, - dtype=param.dtype, - device=param.device) + input = torch.zeros(partition_size, dtype=param.dtype, device=param.device) if start < param.ds_numel: elements = param.ds_numel - start - input.narrow(0, - 0, - elements).copy_( - param.grad.view(-1).narrow(0, - start, - elements)) + input.narrow(0, 0, elements).copy_(param.grad.view(-1).narrow(0, start, elements)) #print("after reduce scatter gradients") input_list.append(input) - rank = dist.get_rank(group=self.ds_process_group) + rank = dist.get_rank(group=self.get_partition_dp_group(param)) handle = dist.reduce_scatter(input_list[rank], input_list, - group=self.ds_process_group, + group=self.get_partition_dp_group(param), async_op=True) return handle, input_list[rank] @@ -1446,11 +1443,10 @@ class Init(InsertPostInitMethodToModuleSubClasses): partition_buffers = [None] * len(param_list) for param, partition_buffer in zip(param_list, partition_buffers): - self._partition_gradient(param, - partition_buffer=partition_buffer, - accumulate=accumulate) + self._partition_gradient(param, partition_buffer=partition_buffer, accumulate=accumulate) def _partition_gradient(self, param, partition_buffer=None, accumulate=False): + #import pdb;pdb.set_trace() # param.grad=None # param.grad.test() @@ -1462,14 +1458,12 @@ class Init(InsertPostInitMethodToModuleSubClasses): if partition_buffer is None: assert not accumulate, "No buffer to accumulate to" - partition_buffer = torch.zeros(partition_size, - dtype=param.dtype, - device=param.device) + partition_buffer = torch.zeros(partition_size, dtype=param.dtype, device=param.device) else: assert partition_buffer.numel( ) >= partition_size, f"The partition buffer size {partition_buffer.numel()} should match the size of param.ds_tensor {partition_size}" - rank = dist.get_rank(group=self.ds_process_group) + rank = dist.get_rank(group=self.get_partition_dp_group(param)) start = partition_size * rank end = start + partition_size @@ -1496,9 +1490,7 @@ class Init(InsertPostInitMethodToModuleSubClasses): # when src is gpu and dest is cpu # adding directly to cpu is very slow else: - acc_tensor = torch.empty(src_tensor.numel(), - dtype=param.dtype, - device=param.device) + acc_tensor = torch.empty(src_tensor.numel(), dtype=param.dtype, device=param.device) acc_tensor.copy_(dest_tensor) acc_tensor.add_(src_tensor) @@ -1515,8 +1507,25 @@ class Init(InsertPostInitMethodToModuleSubClasses): param.grad.data = dest_tensor_full_buffer.data see_memory_usage("After partitioning gradients", force=False) + def 
get_partition_dp_group(self, param): + return param.ds_process_group + + def get_partition_rank(self): + """subclass can overload to specify different relative rank in + parameter partition group""" + return self.rank + + @property + def num_partitions(self): + return self.dp_world_size + + def get_dp_process_group(self): + """ Return the communication group with all data-parallel ranks """ + return self.ds_process_group + class GatheredParameters: + def __init__(self, params, modifier_rank=None, fwd_module=None, enabled=True): """A context that collects parameters that were partitioned via a :class:`deepspeed.zero.Init` context. The parameters are partitioned @@ -1627,8 +1636,7 @@ class GatheredParameters: self.src_rank = modifier_rank else: # A group was specified; convert DP rank to global rank - self.src_rank = dist.get_global_rank(self.params[0].ds_process_group, - modifier_rank) + self.src_rank = dist.get_global_rank(self.params[0].ds_process_group, modifier_rank) self.fwd_module = fwd_module if self.fwd_module is not None: # is a no-op if already registered @@ -1647,12 +1655,7 @@ class GatheredParameters: self.params[0].partition(param_list=self.params, has_been_updated=False) return - handles = [ - dist.broadcast(p, - self.src_rank, - group=p.ds_process_group, - async_op=True) for p in self.params - ] + handles = [dist.broadcast(p, self.src_rank, group=p.ds_process_group, async_op=True) for p in self.params] for h in handles: h.wait() self.params[0].partition(param_list=self.params, has_been_updated=True) diff --git a/deepspeed/runtime/zero/partitioned_param_coordinator.py b/deepspeed/runtime/zero/partitioned_param_coordinator.py index f6edfe626f0bdf1afa004fc34d798a9888598fd2..949c54f5e8067eac41cc4e89c724f76a0bb8724e 100644 --- a/deepspeed/runtime/zero/partitioned_param_coordinator.py +++ b/deepspeed/runtime/zero/partitioned_param_coordinator.py @@ -1,7 +1,7 @@ -""" -"Copyright 2020 The Microsoft DeepSpeed Team. -Licensed under the MIT license. -""" +# Copyright (c) Microsoft Corporation. 
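# A brief usage sketch for the GatheredParameters context changed above (the layer
# and rank values are assumptions for illustration): the context all-gathers the
# partitioned parameter, lets modifier_rank edit the full tensor, broadcasts the
# result from that rank, and re-partitions on exit.
import torch
import deepspeed
from deepspeed import comm as dist

def reinit_layer(layer: torch.nn.Linear) -> None:
    with deepspeed.zero.GatheredParameters(layer.weight, modifier_rank=0):
        if dist.get_rank() == 0:
            # only the modifier rank edits the gathered tensor; its values are
            # broadcast to the other ranks before the parameter is re-partitioned
            torch.nn.init.xavier_uniform_(layer.weight)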
+# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team from dataclasses import dataclass import collections @@ -24,8 +24,7 @@ def debug_rank0(message: str) -> None: @instrument_w_nvtx def get_all_parameters(sub_module, recurse=False): - return itertools.chain(sub_module.named_parameters(recurse=recurse), - sub_module.ds_external_parameters()) + return itertools.chain(sub_module.named_parameters(recurse=recurse), sub_module.ds_external_parameters()) def iter_params(module: Module, recurse=False) -> Iterable[Parameter]: @@ -43,17 +42,15 @@ class ZeRoTraceMode(Enum): class PartitionedParameterCoordinator: """Handles partitioning and gathering of parameters.""" + class __InflightParamRegistry(UserDict): """registry for parameters in flight""" - def __setitem__(self, - param: Parameter, - handle: AllGatherCoalescedHandle) -> None: + + def __setitem__(self, param: Parameter, handle: AllGatherCoalescedHandle) -> None: if param in self.data: raise RuntimeError(f"{param.ds_summary()} already in registry") if param.ds_status != ZeroParamStatus.INFLIGHT: - raise RuntimeError( - f"attempted to add non-inflight parameter to registry {param.ds_summary()}" - ) + raise RuntimeError(f"attempted to add non-inflight parameter to registry {param.ds_summary()}") self.data[param] = handle @dataclass @@ -78,10 +75,8 @@ class PartitionedParameterCoordinator: # sequence of submodules/parameters in forward pass + backward pass self.__submodule_order: Iterable[Module] = [] self.__param_order: Iterable[__class__.__ParamInTrace] = [] - self.__most_recent_step_id_param_fetched_for = collections.defaultdict( - lambda: int(-1e10)) - self.__step_id_module_fetched_for = collections.defaultdict( - lambda: collections.deque()) + self.__most_recent_step_id_param_fetched_for = collections.defaultdict(lambda: int(-1e10)) + self.__step_id_module_fetched_for = collections.defaultdict(lambda: collections.deque()) # number of available params, and max number of available params self.__n_available_params: int = 0 self.__max_n_available_params: int = max_available_parameters_in_numel @@ -122,8 +117,7 @@ class PartitionedParameterCoordinator: def _clear_trace_structures(self) -> None: self.__submodule_order = [] self.__param_order = [] - self.__most_recent_step_id_param_fetched_for = collections.defaultdict( - lambda: int(-1e10)) + self.__most_recent_step_id_param_fetched_for = collections.defaultdict(lambda: int(-1e10)) self.__param_queue = None def is_complete_trace(self) -> bool: @@ -144,19 +138,26 @@ class PartitionedParameterCoordinator: def trace_prologue(self, sub_module: Module) -> None: if self.is_complete_trace(): # sub_module must match expectation else invalidate trace cache + if len(self.__submodule_order) <= self.__step_id: + print_rank_0( + f"Invalidate trace cache @ step {self.__step_id} and module {sub_module.id}: " + f"cache has only {len(self.__submodule_order)} modules", + force=True) + self._invalidate_trace() + return + if sub_module != self.__submodule_order[self.__step_id]: expected_module_id = self.__submodule_order[self.__step_id].id - debug_rank0( + print_rank_0( f"Invalidate trace cache @ step {self.__step_id}: " - f"expected module {expected_module_id}, but got module {sub_module.id}" - ) + f"expected module {expected_module_id}, but got module {sub_module.id}", + force=True) self._invalidate_trace() def record_module(self, sub_module: Module) -> None: """adds sub module to trace""" if not self.is_record_trace(): - raise RuntimeError( - f"attempted to record trace when status = {self.__trace_mode}") + 
raise RuntimeError(f"attempted to record trace when status = {self.__trace_mode}") self.__submodule_order.append(sub_module) self.__step_id_module_fetched_for[sub_module.id].append(self.__step_id) @@ -164,14 +165,11 @@ class PartitionedParameterCoordinator: def record_parameters(self, sub_module: Module) -> None: """adds sub module to trace""" if not self.is_record_trace(): - raise RuntimeError( - f"attempted to record trace when status = {self.__trace_mode}") + raise RuntimeError(f"attempted to record trace when status = {self.__trace_mode}") step_id = self.__step_id_module_fetched_for[sub_module.id].popleft() for param in sorted(set(iter_params(sub_module)), key=lambda p: p.ds_id): - self.__param_order.append( - __class__.__ParamInTrace(param=param, - step_id_last_used_at=step_id)) + self.__param_order.append(__class__.__ParamInTrace(param=param, step_id_last_used_at=step_id)) def construct_parameter_trace_from_module_trace(self): """use module trace to construct parameter trace""" @@ -182,9 +180,8 @@ class PartitionedParameterCoordinator: def reset_step(self) -> None: """indicate that we have completed one fwd+bwd for the model""" if self.__inflight_param_registry: - raise RuntimeError( - f"still have inflight params " - f"{[p.ds_summary for p in self.__inflight_param_registry.keys()]}") + raise RuntimeError(f"still have inflight params " + f"{[p.ds_summary for p in self.__inflight_param_registry.keys()]}") if not self.is_complete_trace(): # not self.trace_complete: # Make sure that recorded submodule orders are identical across ranks @@ -194,26 +191,22 @@ class PartitionedParameterCoordinator: # Successfully recorded a trace self.construct_parameter_trace_from_module_trace() # Make sure that recorded parameter orders are identical across ranks - assert_ints_same_as_other_ranks( - [p.param.ds_id for p in self.__param_order]) - assert_ints_same_as_other_ranks( - [p.step_id_last_used_at for p in self.__param_order]) + assert_ints_same_as_other_ranks([p.param.ds_id for p in self.__param_order]) + assert_ints_same_as_other_ranks([p.step_id_last_used_at for p in self.__param_order]) self.__submodule_order = tuple(self.__submodule_order) # freeze self.__param_order = tuple(self.__param_order) # freeze self.__trace_mode = ZeRoTraceMode.COMPLETE print_rank_0( - f"completed record trace: {[m.id for m in self.__submodule_order]}", + f"completed record trace of {len(self.__submodule_order)} sub modules: {[m.id for m in self.__submodule_order]}", force=False) else: # Enable trace recording for next forward/backward pass self.__trace_mode = ZeRoTraceMode.RECORD self.__param_queue = collections.deque(self.__param_order) # reset fetch queue - self.__most_recent_step_id_param_fetched_for = collections.defaultdict( - lambda: int(-1e10)) - self.__step_id_module_fetched_for = collections.defaultdict( - lambda: collections.deque()) + self.__most_recent_step_id_param_fetched_for = collections.defaultdict(lambda: int(-1e10)) + self.__step_id_module_fetched_for = collections.defaultdict(lambda: collections.deque()) self.__step_id = 0 self.__n_available_params = 0 @@ -221,9 +214,7 @@ class PartitionedParameterCoordinator: if step_id is None: step_id = self.__step_id param_names = [debug_param2name_id(p) for p in params] - print( - f'{tag} step = {step_id} mod = {debug_module2name_id(sub_module)} p_names = {param_names}' - ) + print(f'{tag} step = {step_id} mod = {debug_module2name_id(sub_module)} p_names = {param_names}') def _dump_param_ids(self, tag, mod_id, p_ids, step_id=None): if step_id is None: @@ 
-263,11 +254,9 @@ class PartitionedParameterCoordinator: debug_rank0(f"-wait: {param.ds_summary()}") if param in self.__inflight_param_registry: with get_accelerator().stream(self.__allgather_stream): - while self.__ongoing_fetch_events and self.__ongoing_fetch_events[ - 0].query(): + while self.__ongoing_fetch_events and self.__ongoing_fetch_events[0].query(): self.__ongoing_fetch_events.popleft() - if len(self.__ongoing_fetch_events - ) > self.__max_ongoing_fetch_events: + if len(self.__ongoing_fetch_events) > self.__max_ongoing_fetch_events: self.__ongoing_fetch_events.popleft().synchronize() self.__inflight_param_registry.pop(param).wait() @@ -288,12 +277,8 @@ class PartitionedParameterCoordinator: # prefetches we won't look for them here discarded_from_prefetch_queue = set() params_not_already_fetched = set( - filter( - lambda p: self.__most_recent_step_id_param_fetched_for[p] < self. - __step_id, - params_to_fetch)) - while self.__param_queue and len(discarded_from_prefetch_queue) < len( - params_not_already_fetched): + filter(lambda p: self.__most_recent_step_id_param_fetched_for[p] < self.__step_id, params_to_fetch)) + while self.__param_queue and len(discarded_from_prefetch_queue) < len(params_not_already_fetched): param_in_trace = self.__param_queue.popleft() self.__most_recent_step_id_param_fetched_for[ param_in_trace.param] = param_in_trace.step_id_last_used_at @@ -305,8 +290,7 @@ class PartitionedParameterCoordinator: f"module id: {current_submodule.id}, training: {current_submodule.training}\n" f"expected the next {len(params_not_already_fetched)} parameters in the " f"parameter fetch queue to be {tuple(p.ds_summary(use_debug_name=True) for p in params_not_already_fetched)} \n" - f"but got \n {tuple(p.ds_summary(use_debug_name=True) for p in discarded_from_prefetch_queue)}." - ) + f"but got \n {tuple(p.ds_summary(use_debug_name=True) for p in discarded_from_prefetch_queue)}.") def _is_currently_on_nvme(param): if param.nvme_swapper is None: @@ -317,14 +301,12 @@ class PartitionedParameterCoordinator: # kick off all gather for params in the next few submodules (prefetch) if self.__prefetch_bucket_sz > 0: - max_params_to_prefetch = min( - self.__max_n_available_params - self.__n_available_params, - self.__prefetch_bucket_sz) + max_params_to_prefetch = min(self.__max_n_available_params - self.__n_available_params, + self.__prefetch_bucket_sz) params_to_prefetch = set() numel_prefetching = 0 while self.__param_queue and numel_prefetching < max_params_to_prefetch: - param_in_trace: __class__.__ParamInTrace = self.__param_queue.popleft( - ) + param_in_trace: __class__.__ParamInTrace = self.__param_queue.popleft() if _is_currently_on_nvme(param_in_trace.param): # nvme prefetch is handled elsewhere. 
Need to break here to preserve fetch order @@ -358,10 +340,8 @@ class PartitionedParameterCoordinator: def release_sub_module(self, submodule: Module) -> None: """release the parameters of a sub module, assuming they meet conditions to be released.""" - params_to_release = (self.__params_to_release(submodule, - self.__step_id) - if self.is_complete_trace() else set( - p.ds_id for p in iter_params(submodule))) + params_to_release = (self.__params_to_release(submodule, self.__step_id) if self.is_complete_trace() else set( + p.ds_id for p in iter_params(submodule))) for param in iter_params(submodule): param.ds_active_sub_modules.discard(submodule.id) if param.ds_id in params_to_release and not param.is_external_param: @@ -404,13 +384,10 @@ class PartitionedParameterCoordinator: # Release swap buffers for persisted params on nvme since they will never be partitioned or evicted from GPU swap_persisted_params = [ - p for p in partitioned_params - if p.ds_persist and p.ds_tensor.final_location == OffloadDeviceEnum.nvme + p for p in partitioned_params if p.ds_persist and p.ds_tensor.final_location == OffloadDeviceEnum.nvme ] if swap_persisted_params: - swap_persisted_params[ - 0].nvme_swapper.remove_partition_and_release_buffers( - swap_persisted_params) + swap_persisted_params[0].nvme_swapper.remove_partition_and_release_buffers(swap_persisted_params) @instrument_w_nvtx def __release_param(self, param: Parameter) -> None: @@ -421,14 +398,11 @@ class PartitionedParameterCoordinator: @instrument_w_nvtx @functools.lru_cache(maxsize=None) - def __params_to_release(self, - submodule_to_release: Module, - step_id: int) -> Set[int]: + def __params_to_release(self, submodule_to_release: Module, step_id: int) -> Set[int]: if not self.is_complete_trace(): raise RuntimeError("expected trace to be complete") - params_to_release = set(p.ds_id for p in iter_params(submodule_to_release) - if not p.ds_persist) + params_to_release = set(p.ds_id for p in iter_params(submodule_to_release) if not p.ds_persist) # Problem: When prefetcher scans the param trace, it skips AVAILABLE params. 
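# A simplified sketch of the release rule implemented by __params_to_release above.
# The helper name, the upcoming_trace list, and the max_reuse_dist_in_numel budget are
# assumptions for illustration: start from every non-persistent parameter of the
# sub-module, then keep any parameter the recorded trace says will be used again within
# a look-ahead window, so the prefetch order (and NVMe swap buffers) is not disturbed.
from typing import Iterable, List, Set

def params_to_release_sketch(submodule_params: Iterable,
                             upcoming_trace: List,
                             max_reuse_dist_in_numel: int) -> Set[int]:
    # candidates: non-persistent params of the sub-module being released
    release = {p.ds_id for p in submodule_params if not p.ds_persist}
    # walk the upcoming uses in trace order and exempt near-term reuses
    numel_traversed = 0
    for p in upcoming_trace:
        if numel_traversed >= max_reuse_dist_in_numel:
            break
        release.discard(p.ds_id)
        numel_traversed += p.ds_numel
    return release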
# This creates issues if those params are released before the skipped uses: @@ -470,8 +444,8 @@ class PartitionedParameterCoordinator: param = param_in_trace.param if param.nvme_swapper is None: continue - if (numel_considered > 2 * numel_in_flight or len(swap_in_params) >= - param.nvme_swapper.available_swap_in_buffers()): + if (numel_considered > 2 * numel_in_flight + or len(swap_in_params) >= param.nvme_swapper.available_swap_in_buffers()): break if param.ds_tensor.status == PartitionedParamStatus.NOT_AVAILABLE: swap_in_params.append(param) diff --git a/deepspeed/runtime/zero/stage1.py b/deepspeed/runtime/zero/stage1.py deleted file mode 100755 index 7cd37f904faa1c2ea0ae3074fcccabb5c28a0a6d..0000000000000000000000000000000000000000 --- a/deepspeed/runtime/zero/stage1.py +++ /dev/null @@ -1,1121 +0,0 @@ -import math -import torch -import torch.distributed as dist -from torch._utils import _flatten_dense_tensors, _unflatten_dense_tensors -from collections import defaultdict - -from deepspeed.runtime.zero.utils import _initialize_parameter_parallel_groups -from deepspeed.runtime.fp16.loss_scaler import LossScaler, DynamicLossScaler -from deepspeed.runtime.utils import get_grad_norm, CheckOverflow -from deepspeed.runtime.zero.config import ZERO_OPTIMIZATION_OPTIMIZER_STATES -from deepspeed.utils import logger, log_dist - - -def get_alignment_padding(flattened_lean_size, sub_partition_id, sub_partition_size): - sub_partition_high_limit = (sub_partition_id + 1) * sub_partition_size - if sub_partition_high_limit <= flattened_lean_size: - return 0 - else: - return min(sub_partition_size, sub_partition_high_limit - flattened_lean_size) - - -def get_group_alignment_padding(tensor_list, sub_partition_size, sub_partition_count): - group_paddings = [] - flattened_size = sum([tensor.numel() for tensor in tensor_list]) - for i in range(sub_partition_count): - padding = get_alignment_padding(flattened_size, i, sub_partition_size) - group_paddings.append(padding) - - return group_paddings - - -def flatten_dense_tensors_sub_partition_aligned(tensor_list, - dp, - max_elements_per_comm, - pg): - assert max_elements_per_comm >= dp, f"max_elements_per_comm {max_elements_per_comm} < dp {dp}" - - num_elements = sum(t.numel() for t in tensor_list) - log_dist("Total number of elements in model: {}, max elements per com: {}".format( - num_elements, - max_elements_per_comm), - ranks=[0]) - - # Compute aligned partition size based on parameter count - aligned_param_partition_size = math.ceil(num_elements / dp) - - # Compute aligned partition size based on communication size - aligned_comm_partition_size = int(max_elements_per_comm // dp) - - if aligned_param_partition_size <= aligned_comm_partition_size: - sub_partition_count = 1 - sub_partition_size = aligned_param_partition_size - else: - sub_partition_count = math.ceil(aligned_param_partition_size / - aligned_comm_partition_size) - sub_partition_size = aligned_comm_partition_size - - # Compute required padding for alignment to dp and max_elements_per_comm - padding = (sub_partition_count * sub_partition_size * dp) - num_elements - - log_dist( - f"sub_partition_count: {sub_partition_count}, sub_partition_size: {sub_partition_size}, padding: {padding}", - ranks=[0]) - log_dist( - f"number of elements with padding: {num_elements} + {padding} = {num_elements + padding}", - ranks=[0]) - - if padding == 0: - aligned_tensor_list = tensor_list - else: - pad_tensor = torch.zeros(padding, - device=tensor_list[0].device, - dtype=tensor_list[0].dtype) - aligned_tensor_list 
= tensor_list + [pad_tensor] - - flat_tensors = _flatten_dense_tensors(aligned_tensor_list) - return flat_tensors - - -def _single_range_check(current_index, start_index, end_index, tensor_size): - offset = 0 - if (current_index >= start_index) and (current_index < end_index): - # Fully inside bounds - return True, offset - elif (start_index > current_index) and (start_index < (current_index + tensor_size)): - # Partially contained, compute offset - offset = start_index - current_index - return True, offset - else: - return False, offset - - -def _range_check(current_index, element_intervals, tensor_size): - results = [] - for comm_idx, interval in enumerate(element_intervals): - start_index, end_index = interval - contained, offset = _single_range_check(current_index, start_index, end_index, tensor_size) - if contained: - results.append((contained, offset, comm_idx)) - if len(results) == 0: - return [(False, 0, -1)] - return results - - -class FP16_DeepSpeedZeroOptimizer_Stage1(object): - """ - FP16_DeepSpeedZeroOptimizer_Stage1 designed to reduce the memory footprint - required for training large deep learning models. - - For more details please see ZeRO: Memory Optimization Towards Training A Trillion Parameter Models - https://arxiv.org/abs/1910.02054 - - This version aligns with stage-1 in the paper above. - """ - def __init__(self, - init_optimizer, - static_loss_scale=1.0, - dynamic_loss_scale=False, - dynamic_loss_args=None, - verbose=True, - dp_process_group=None, - partition_size=None, - mpu=None, - all_gather_partitions=True, - allgather_size=500000000, - clip_grad=0.0, - max_elements_per_comm=5e8, - elastic_checkpoint=True): - - if dp_process_group is not None and partition_size is not None: - raise ValueError("Cannot specify both dp_process_group " - "and partition size") - - if dp_process_group is None: - dp_process_group = _initialize_parameter_parallel_groups(partition_size) - - if not torch.cuda.is_available: - raise SystemError("Cannot use fp16 without CUDA.") - self.optimizer = init_optimizer - - self.verbose = verbose - self.dp_process_group = dp_process_group - - # TODO: automatically turn off if #params > some_limit - self.all_gather_partitions = all_gather_partitions - self.allgather_size = allgather_size - - # self.max_elements_per_comm = max_elements_per_comm - # logger.info("max_elements_per_comm={}".format(max_elements_per_comm)) - - self.elastic_checkpoint = elastic_checkpoint - logger.info(f'ZeRO Elastic Checkpoint = {elastic_checkpoint}') - - # param flattened by groups - self.fp16_groups = [] - self.fp16_groups_flat = [] - - # Setup bookkeeping data structures depending on partitioning type - - # parallel_sub_partitioned_fp16_groups[group-idx] -> [comm-ids] -> [rank-ids] - self.parallel_sub_partitioned_fp16_groups = [] - # same underlying data as above but viewed as: [groups] -> [rank-ids] -> [comm-ids] - self.parallel_comm_sub_partitioned_fp16_groups = [] - - # 32-bit sub-partitions of the parallel partitioned parameters - # that this process will update - self.local_sub_partitions_of_fp32_groups = [] - - # param partition info - - # parameters in each group that will not be updated by this process directly - self.params_not_local = [] - - # parameters that will be updated by this process directly - self.params_in_rank_sub_partitions = [] - - # parameter offsets for parameters in sub-partitions. 
Parameter - # boundaries may not align with sub-partition boundaries - # so we need to keep track of the offsets - self.params_in_rank_sub_partitions_offsets = [] - - # number of elements per sub-partition in each group - self.sub_partition_sizes = [] - - # number of communication intervals for each group - self.num_comm_intervals_per_group = [] - - local_rank = dist.get_rank(group=self.dp_process_group) - - self.group_paddings = [] - self.partition_count = dist.get_world_size(group=self.dp_process_group) - - self.default_device = self.optimizer.param_groups[0]['params'][0].device - - # max elems per param group - self.max_elems_per_comm = [] - - # loop to deal with groups - for i, param_group in enumerate(self.optimizer.param_groups): - # push this group to list before modify - self.fp16_groups.append(param_group['params']) - - # calculate best max elements per comm based to minimize padding - self.max_elems_per_comm.append( - self.best_max_elems_per_comm( - num_elements=sum(t.numel() for t in self.fp16_groups[i]), - max_elements_per_comm=max_elements_per_comm, - dp=dist.get_world_size(group=self.dp_process_group))) - - # flattens all tensors into single 1d tensor aligned with sub-partition size for later dividing - # RS: create aligned sub-partitions - flat_aligned_params = flatten_dense_tensors_sub_partition_aligned( - tensor_list=self.fp16_groups[i], - dp=dist.get_world_size(group=self.dp_process_group), - max_elements_per_comm=self.max_elems_per_comm[i], - pg=self.dp_process_group) - self.fp16_groups_flat.append(flat_aligned_params) - - # TODO: I don't think this does anything? - # set model fp16 weight to slices of flattened buffer - updated_params = _unflatten_dense_tensors(self.fp16_groups_flat[i], - self.fp16_groups[i]) - for p, q in zip(self.fp16_groups[i], updated_params): - p.data = q.data - - # divide the flat weights into near equal partition equal to the data parallel degree - # each process will compute on a different part of the partition - # RS: split into two layer list -> [comm-id] -> [sub-partitions per rank] - comm_partitions, dp_sub_partitions, element_intervals, sub_partition_size, num_comm_intervals = \ - self.get_data_parallel_sub_partitions( - tensor=self.fp16_groups_flat[i], - max_elements_per_comm=self.max_elems_per_comm[i], - world_size=dist.get_world_size( - group=self.dp_process_group), - dp_process_group=self.dp_process_group - ) - self.parallel_comm_sub_partitioned_fp16_groups.append( - comm_partitions) # comm -> rank - self.parallel_sub_partitioned_fp16_groups.append( - dp_sub_partitions) # rank -> comm - self.sub_partition_sizes.append(sub_partition_size) - self.num_comm_intervals_per_group.append(num_comm_intervals) - # data_parallel_partitions = self.get_data_parallel_partitions(self.fp16_groups_flat[i]) - # self.parallel_partitioned_fp16_groups.append(data_parallel_partitions) - - # a partition of the fp32 master weights that will be updated by this process - # RS: store/detach/cast our local sub-partitions - local_sub_partitions = [] - for sub_partition in self.parallel_sub_partitioned_fp16_groups[i][ - local_rank]: - fp32_sub_partition = sub_partition.clone().float().detach() - fp32_sub_partition.requires_grad = True - local_sub_partitions.append(fp32_sub_partition) - self.local_sub_partitions_of_fp32_groups.append(local_sub_partitions) - - # Compute sub_partition paddings - sub_partition_paddings = get_group_alignment_padding( - tensor_list=self.fp16_groups[i], - sub_partition_size=sub_partition_size, - sub_partition_count=num_comm_intervals * 
self.partition_count) - self.group_paddings.append(sub_partition_paddings) - - # modify optimizer of have flat master weight - # self.single_partition_of_fp32_groups[i].requires_grad = True # keep this in case internal optimizer uses it - param_group['params'] = self.local_sub_partitions_of_fp32_groups[i] - - # RS: divide up the sub-partitions and keep track of offsets for each param - # partition_size = len(self.fp16_groups_flat[i]) / dist.get_world_size(group=self.dp_process_group) - params_in_rank_sub_partition, params_in_rank_sub_partitions_offsets, params_not_local = self.get_all_sub_partition_info( - tensor_list=self.fp16_groups[i], - all_element_intervals=element_intervals, - local_rank=local_rank, - world_size=dist.get_world_size(group=self.dp_process_group) - ) - - self.params_in_rank_sub_partitions.append(params_in_rank_sub_partition) - self.params_not_local.append(params_not_local) - self.params_in_rank_sub_partitions_offsets.append( - params_in_rank_sub_partitions_offsets) - - # we may have a way of fusing dynamic scale. Do not support for now - if dynamic_loss_scale: - if dynamic_loss_args is None: - self.loss_scaler = DynamicLossScaler() - else: - self.loss_scaler = DynamicLossScaler(**dynamic_loss_args) - - self.dynamic_loss_scale = True - - else: - self.dynamic_loss_scale = False - self.loss_scaler = LossScaler(scale=static_loss_scale) - self.cur_iter = 0 - - self.mpu = mpu - self.clip_grad = clip_grad - - self.overflow = False - self.overflow_checker = CheckOverflow(self.fp16_groups, - mpu=self.mpu, - zero_reduce_scatter=True) - - self._initialize_optimizer_states() - - def _initialize_optimizer_states(self): - for group_idx, group in enumerate(self.local_sub_partitions_of_fp32_groups): - for idx, sub_partition_param in enumerate(group): - sub_partition_grad = torch.zeros(int( - self.sub_partition_sizes[group_idx]), - dtype=sub_partition_param.dtype).cuda() - sub_partition_param.grad = sub_partition_grad - - self.optimizer.step() - - for group in self.local_sub_partitions_of_fp32_groups: - for idx, sub_partition_param in enumerate(group): - sub_partition_param.grad = None - - @staticmethod - def best_max_elems_per_comm(num_elements, max_elements_per_comm, dp): - # if we use max-elems-per-comm as is, how many comm intervals will there be - max_comm_intervals = math.ceil(num_elements / max_elements_per_comm) - padding_for_max_comm = (max_elements_per_comm * - max_comm_intervals) - num_elements - - # if we use 1 less comm interval how much extra comm padding would be required - min_comm_intervals = num_elements // max_elements_per_comm - if min_comm_intervals == 0: - log_dist(f'Using default max_elements_per_comm {max_elements_per_comm}', - ranks=[0]) - return max_elements_per_comm - - padding_for_min_comm = math.ceil(num_elements / (dp * min_comm_intervals)) - - # choose padding that uses least amount of overhead - if padding_for_max_comm > padding_for_min_comm: - new_max_elements_per_comm = padding_for_min_comm + max_elements_per_comm - log_dist( - f'Updating max_elements_per_comm from {max_elements_per_comm} -> {new_max_elements_per_comm}', - ranks=[0]) - return new_max_elements_per_comm - else: - log_dist(f'Using default max_elements_per_comm {max_elements_per_comm}', - ranks=[0]) - return max_elements_per_comm - - @staticmethod - def get_data_parallel_sub_partitions(tensor, - max_elements_per_comm, - world_size, - dp_process_group=None): - total_num_elements = tensor.numel() - - # if total elements is less than our max, revert to splitting into dp partitions - 
max_elements_per_comm = min(total_num_elements, max_elements_per_comm) - sub_partition_size = int(max_elements_per_comm // world_size) - - # Ensure partition alignment was done correctly - num_sub_partitions = int(total_num_elements // sub_partition_size) - assert total_num_elements % sub_partition_size == 0, "{} % {} != 0".format(total_num_elements, sub_partition_size) - - # Ensure comm interval alignment was done correctly. - num_comm_intervals = int(num_sub_partitions // world_size) - assert num_sub_partitions % world_size == 0, "{} % {} != 0".format(num_sub_partitions, world_size) - - if not dist.is_initialized() or dist.get_rank(group=dp_process_group) == 0: - logger.info("**** partition info:") - logger.info("\t total_num_elements=%s", total_num_elements) - logger.info("\t world_size=%s", world_size) - logger.info("\t max_elements_per_comm=%s", max_elements_per_comm) - logger.info("\t sub_partition_size=%s", sub_partition_size) - logger.info("\t num_sub_partitions=%s", num_sub_partitions) - logger.info("\t num_comm_intervals=%s", num_comm_intervals) - logger.info("****") - - # [comm_id] -> [rank] - comm_partitions = [] - for _ in range(num_comm_intervals): - comm_partitions.append([]) - - start = 0 - comm_id = 0 - element_intervals = defaultdict( - list) # [rank] -> [(start,end), (start,end), ...] - for idx in range(num_sub_partitions): - rank_id = idx % world_size - sub_partition = tensor.narrow(0, start, sub_partition_size).detach() - element_intervals[rank_id].append((start, start + sub_partition_size)) - comm_partitions[comm_id].append(sub_partition) - start = start + sub_partition_size - if rank_id == (world_size - 1): - comm_id += 1 - - # [rank] -> [comm_id] - sub_partitions = [] - for _ in range(world_size): - sub_partitions.append([]) - for comm_id, partitions in enumerate(comm_partitions): - for rank_id, partition in enumerate(partitions): - sub_partitions[rank_id].append(partition) - - return comm_partitions, sub_partitions, element_intervals, sub_partition_size, num_comm_intervals - - @staticmethod - def get_all_sub_partition_info(tensor_list, - all_element_intervals, - local_rank, - world_size): - params_not_local = [] - - # [rank] -> [comm-id] -> [param/offset] - params_in_rank_sub_partition = [] - params_in_rank_sub_partitions_offsets = [] - - for rank in range(world_size): - params_in_local_sub_partition = [] - local_sub_partition_offsets = [] - comm_tensor_list = [] - comm_offset_list = [] - current_index = 0 - prev_comm_idx = 0 - for iii, tensor in enumerate(tensor_list): - tensor_size = tensor.numel() - #if local_rank == 0: - # # logger.info("rank={}, current_index={}, tensor_size={}, tensor-idx={}".format(rank, - # current_index, tensor_size, iii)) - results_list = _range_check(current_index, - all_element_intervals[rank], - tensor_size) - for contained, offset, comm_idx in results_list: - #if local_rank == 0: - # logger.info("rank={}, contained={}, offset={}, comm_idx={}".format(rank, contained, - # offset, comm_idx)) - if contained: - if prev_comm_idx != comm_idx: - params_in_local_sub_partition.append(comm_tensor_list) - comm_tensor_list = [] - local_sub_partition_offsets.append(comm_offset_list) - comm_offset_list = [] - comm_tensor_list.append(tensor) - comm_offset_list.append(offset) - prev_comm_idx = comm_idx - elif rank == local_rank: - params_not_local.append(tensor) - - current_index = current_index + tensor_size - - #assert len(comm_tensor_list) > 0 - #assert len(comm_offset_list) > 0 - params_in_local_sub_partition.append(comm_tensor_list) - 
local_sub_partition_offsets.append(comm_offset_list) - - params_in_rank_sub_partition.append(params_in_local_sub_partition) - params_in_rank_sub_partitions_offsets.append(local_sub_partition_offsets) - - return params_in_rank_sub_partition, params_in_rank_sub_partitions_offsets, params_not_local - - @staticmethod - def get_flat_sub_partitions(comm_tensor_list, - comm_param_offsets, - sub_partition_size, - dtype, - default_device, - num_comm_intervals=None, - return_partition_params=False): - - partition_params = [] - final_param_offsets = [] - flat_sub_partitions = [] - for tensor_list, param_offsets in zip(comm_tensor_list, comm_param_offsets): - flat_tensor_list = [] - current_size = 0 - my_offsets = [] - my_params = [] - - for i, tensor in enumerate(tensor_list): - if tensor.grad is None: - tensor.grad = torch.zeros(tensor.size(), - dtype=tensor.dtype, - device=tensor.device) - param = tensor - tensor = tensor.grad - num_elements = tensor.numel() - tensor_offset = 0 - - #we need to offset to get to the right element - if i == 0 and param_offsets[i] > 0: - tensor_offset = param_offsets[i] - num_elements = num_elements - tensor_offset - - # We don't need all elements of the tensor if this tensor is - # larger than we have space for in our curr sub-partition - if num_elements > (sub_partition_size - current_size): - num_elements = sub_partition_size - current_size - - #we need a narrow view of the tensor based on the tensor offset and number of elements that - #we need from this tensor - if tensor_offset > 0 or num_elements < tensor.numel(): - flat_tensor_list.append(tensor.contiguous().view(-1).narrow( - 0, - int(tensor_offset), - int(num_elements)).to(dtype)) - else: - flat_tensor_list.append(tensor.to(dtype)) - my_params.append(param) - - #remember offset into partition and #elems for this tensor - my_offsets.append((current_size, num_elements)) - - current_size = current_size + num_elements - - #this means its the last partition and does not align with the dp boundary. We need to pad before flattening - if current_size < sub_partition_size: - my_offsets.append((None, None)) - my_params.append(None) - if len(tensor_list) == 0: - assert default_device != None - flat_tensor_list.append( - torch.zeros(int(sub_partition_size - current_size), - dtype=dtype, - device=default_device)) - else: - flat_tensor_list.append( - torch.zeros(int(sub_partition_size - current_size), - dtype=dtype, - device=tensor_list[0].device)) - partition_params.append(my_params) #flat_tensor_list) - final_param_offsets.append(my_offsets) - assert len(flat_tensor_list) == len(my_offsets), "{} {}".format(len(flat_tensor_list), len(my_offsets)) - flat_sub_partitions.append(_flatten_dense_tensors(flat_tensor_list)) - if num_comm_intervals is not None and len( - flat_sub_partitions) < num_comm_intervals: - # logger.info("padding w. 
sub partitions to ensure uniform communication") - device = flat_sub_partitions[0].device - for _ in range(num_comm_intervals - len(flat_sub_partitions)): - flat_sub_partitions.append( - torch.zeros(int(sub_partition_size), - dtype=dtype, - device=device)) - partition_params.append([None]) - final_param_offsets.append([(None, None)]) - - if return_partition_params: - assert len(flat_sub_partitions) == len(partition_params) - assert len(partition_params) == len(final_param_offsets), "{} {}".format(len(partition_params), len(final_param_offsets)) - return flat_sub_partitions, partition_params, final_param_offsets - return flat_sub_partitions - - def zero_grad(self, set_grads_to_None=True): - """ - Zero FP16 parameter grads. - """ - # FP32 grad should never exist. - # For speed, set model fp16 grad to None by default - for group in self.fp16_groups: - for p in group: - if set_grads_to_None: - p.grad = None - else: - if p.grad is not None: - p.grad.detach_() - p.grad.zero_() - - def free_grad_in_param_list(self, param_list): - for p in param_list: - if isinstance(p, list): - for _p in p: - _p.grad = None - else: - p.grad = None - - def reduce_scatter_gradients(self, - postscale_gradients, - gradient_predivide_factor, - gradient_average): - world_size = dist.get_world_size(group=self.dp_process_group) - local_rank = dist.get_rank(group=self.dp_process_group) - - for i, group in enumerate(self.fp16_groups): - num_comm_intervals = self.num_comm_intervals_per_group[i] - all_sub_partitions = [] - for rank in range(world_size): - # gsp is list of partitions indexed by comm_idx - grad_sub_partitions = self.get_flat_sub_partitions( - comm_tensor_list=self.params_in_rank_sub_partitions[i][rank], - comm_param_offsets=self.params_in_rank_sub_partitions_offsets[i] - [rank], - dtype=torch.half, - default_device=self.default_device, - sub_partition_size=self.sub_partition_sizes[i], - num_comm_intervals=self.num_comm_intervals_per_group[i]) - all_sub_partitions.append(grad_sub_partitions) - - assert len(grad_sub_partitions) == num_comm_intervals - - local_comm_partitions = [] - for comm_idx in range(num_comm_intervals): - single_comm_all_partitions = [] - for rank in range(world_size): - single_comm_all_partitions.append(all_sub_partitions[rank][comm_idx]) - - if postscale_gradients: - if gradient_predivide_factor != 1.0: - for partition in single_comm_all_partitions: - partition.mul_(1. / gradient_predivide_factor) - - dist.reduce_scatter(output=single_comm_all_partitions[local_rank], - input_list=single_comm_all_partitions, - group=self.dp_process_group) - - if gradient_average: - # Only need to average our local grads in post scaling - if gradient_predivide_factor != world_size: - single_comm_all_partitions[local_rank].mul_( - gradient_predivide_factor / world_size) - else: - for partition in single_comm_all_partitions: - partition.div_(world_size) - - dist.reduce_scatter(output=single_comm_all_partitions[local_rank], - input_list=single_comm_all_partitions, - group=self.dp_process_group) - - def step(self, closure=None): - # First compute norm for all group so we know if there is overflow - self.overflow = self.overflow_checker.check() - - prev_scale = self.loss_scale - self._update_scale(self.overflow) - if self.overflow: - self.zero_grad() - if self.verbose: - logger.info( - "[deepspeed] fp16 dynamic loss scale overflow! Skipping step. 
Attempted loss " - "scale: {}, reducing to {}".format(prev_scale, - self.loss_scale)) - return self.overflow - - norm_groups = [] - local_sub_partitions_grad_groups = [] - - partition_id = dist.get_rank(group=self.dp_process_group) - for i, group in enumerate(self.fp16_groups): - #TODO RS: update get grad norm to support sub partitions - norm_groups.append(get_grad_norm(group, mpu=self.mpu)) - - #RS: update free grads w.r.t. sub partitions - #free gradients for all the parameters that are not updated by this process - self.free_grad_in_param_list(self.params_not_local[i]) - - # create flat gradient partitions for parameters updated by this process - local_grad_sub_partitions = self.get_flat_sub_partitions( - comm_tensor_list=self.params_in_rank_sub_partitions[i][partition_id], - comm_param_offsets=self.params_in_rank_sub_partitions_offsets[i] - [partition_id], - sub_partition_size=self.sub_partition_sizes[i], - dtype=self.local_sub_partitions_of_fp32_groups[i][0].dtype, - num_comm_intervals=self.num_comm_intervals_per_group[i], - default_device=self.default_device) - - #RS: update all our local params with sub-partition grads - for idx, sub_partition_param in enumerate(self.local_sub_partitions_of_fp32_groups[i]): - sub_partition_param.grad = local_grad_sub_partitions[idx] - - #RS: update free grads for sub-partitions - #release all the gradient since we have already created a necessary copy in dp_grad_partition - self.free_grad_in_param_list( - self.params_in_rank_sub_partitions[i][partition_id]) - - local_sub_partitions_grad_groups.append(local_grad_sub_partitions) - - #RS: update unscale/clip with sub partitions - self.unscale_and_clip_grads(local_sub_partitions_grad_groups, norm_groups) - - self.optimizer.step() - - #RS: clear our sub partition grads - #get rid of the fp32 gradients. Not needed anymore - for group in self.local_sub_partitions_of_fp32_groups: - for idx, sub_partition_param in enumerate(group): - sub_partition_param.grad = None - #group.grad = None - - #NOTE RS: removed norm_groups outer loop from original code, i don't think it's needed - #RS: copy all sub-partition fp32 data to fp16 sub partitions - # copy fp32 param data to fp16 partitions w.r.t. our local rank - for fp16_all_sub_partitions, fp32_local_sub_partitions in zip(self.parallel_sub_partitioned_fp16_groups, self.local_sub_partitions_of_fp32_groups): - for local_sub_partition_param_fp16, local_sub_partition_param_fp32 in zip(fp16_all_sub_partitions[partition_id], fp32_local_sub_partitions): - local_sub_partition_param_fp16.data.copy_( - local_sub_partition_param_fp32.data) - - #RS: all_gather/broadcast sub-partitions in separate comm calls - #gather the updated weights from everyone - for fp16_all_sub_partitions in self.parallel_comm_sub_partitioned_fp16_groups: - for comm_id, sub_partitions in enumerate(fp16_all_sub_partitions): - dist.all_gather(sub_partitions, - sub_partitions[partition_id], - group=self.dp_process_group) - - # TODO: we probably don't need this? 
just to be safe - for i in range(len(norm_groups)): - updated_params = _unflatten_dense_tensors(self.fp16_groups_flat[i], - self.fp16_groups[i]) - for p, q in zip(self.fp16_groups[i], updated_params): - p.data = q.data - - return self.overflow - - def unscale_and_clip_grads(self, grad_groups_flat, norm_groups): - total_norm = 0.0 - for norm in norm_groups: - total_norm += norm**2.0 - total_norm = math.sqrt(total_norm) - - # compute combined scale factor for this group - combined_scale = self.loss_scale - if self.clip_grad > 0.: - # norm is in fact norm*scale - clip = ((total_norm / self.loss_scale) + 1e-6) / self.clip_grad - if clip > 1: - combined_scale = clip * self.loss_scale - - for grad in grad_groups_flat: - if isinstance(grad, list): - sub_partitions = grad - for g in sub_partitions: - g.data.mul_(1. / combined_scale) - else: - grad.data.mul_(1. / combined_scale) - - def backward(self, loss, retain_graph=False): - self.loss_scaler.backward(loss.float(), retain_graph=retain_graph) - - def _update_scale(self, has_overflow=False): - self.loss_scaler.update_scale(has_overflow) - - # Promote state so it can be retrieved or set via "fp16_optimizer_instance.state" - def _get_state(self): - return self.optimizer.state - - def _set_state(self, value): - self.optimizer.state = value - - state = property(_get_state, _set_state) - - # Promote param_groups so it can be retrieved or set via "fp16_optimizer_instance.param_groups" - # (for example, to adjust the learning rate) - def _get_param_groups(self): - return self.optimizer.param_groups - - def _set_param_groups(self, value): - self.optimizer.param_groups = value - - param_groups = property(_get_param_groups, _set_param_groups) - - # Promote loss scale so it can be retrieved or set via "fp16_optimizer_instance.loss_scale" - def _get_loss_scale(self): - return self.loss_scaler.loss_scale - - def _set_loss_scale(self, value): - self.loss_scaler.cur_scale = value - - loss_scale = property(_get_loss_scale, _set_loss_scale) - cur_scale = property(_get_loss_scale, _set_loss_scale) - - # Return communication interval paddings for local rank and group - def _get_local_group_paddings(self, group_index): - local_rank = dist.get_rank(group=self.dp_process_group) - sub_partition_indices = [ - local_rank + (comm_idx * self.partition_count) - for comm_idx in range(self.num_comm_intervals_per_group[group_index]) - ] - group_paddings = [ - self.group_paddings[group_index][sub_idx] - for sub_idx in sub_partition_indices - ] - return group_paddings - - # Return group tensor after removing paddings that are added for alignment to DP world size. - # This method works on the assumption that each group contains sub partitions. - def _get_groups_without_padding(self, groups_with_padding): - groups_without_padding = [] - - for group_index, group in enumerate(groups_with_padding): - group_paddings = self._get_local_group_paddings(group_index) - - lean_sub_partitions = [] - for sub_partition, padding in zip(group, group_paddings): - lean_length = sub_partition.numel() - padding - lean_sub_partitions.append(sub_partition[:lean_length]) - groups_without_padding.append(lean_sub_partitions) - - return groups_without_padding - - # Return optimizer state after removing paddings that are added for alignment. 
- def _get_state_without_padding(self, state_with_padding, padding): - lean_state = {} - for key, value in state_with_padding.items(): - if torch.is_tensor(value): - lean_length = value.numel() - padding - lean_state[key] = value[:lean_length] - else: - lean_state[key] = value - - return lean_state - - # Return base optimizer states. - # This method assumes that each param group contains a single flattened tensor. - def _get_base_optimizer_state(self): - optimizer_groups_state = [] - - for group_index, group in enumerate(self.optimizer.param_groups): - param_paddings = self._get_local_group_paddings(group_index) - - group_lean_state = [] - for param_idx, param in enumerate(group['params']): - lean_state = self._get_state_without_padding(self.optimizer.state[param], - param_paddings[param_idx]) - group_lean_state.append(lean_state) - - optimizer_groups_state.append(group_lean_state) - - return optimizer_groups_state - - def _rigid_state_dict(self): - """ - Returns a dict that can be loaded for continued training with same DP degree - """ - """ - Returns a dict containing the current state of this :class:`FP16_Optimizer` instance. - This dict contains attributes of :class:`FP16_Optimizer`, as well as the state_dict - of the contained Pytorch optimizer. - Example:: - checkpoint = {} - checkpoint['model'] = model.state_dict() - checkpoint['optimizer'] = optimizer.state_dict() - torch.save(checkpoint, "saved.pth") - """ - state_dict = {} - state_dict['loss_scaler'] = self.loss_scaler - state_dict['dynamic_loss_scale'] = self.dynamic_loss_scale - state_dict['overflow'] = self.overflow - state_dict['base_optimizer_state'] = self.optimizer.state_dict() - state_dict[ - 'local_sub_partitions_of_fp32_groups'] = self.local_sub_partitions_of_fp32_groups - return state_dict - - def _elastic_state_dict(self): - """ - Returns a dict that can be loaded for elastic training with different DP degree - """ - state_dict = {} - state_dict['loss_scaler'] = self.loss_scaler - state_dict['dynamic_loss_scale'] = self.dynamic_loss_scale - state_dict['overflow'] = self.overflow - state_dict['base_optimizer_state'] = self._get_base_optimizer_state() - - state_dict['zero_stage'] = ZERO_OPTIMIZATION_OPTIMIZER_STATES - state_dict['partition_count'] = self.partition_count - state_dict['num_comm_intervals_per_group'] = self.num_comm_intervals_per_group - - # Remove paddings for DP alignment to enable loading for other alignment values - fp32_groups_without_padding = self._get_groups_without_padding( - self.local_sub_partitions_of_fp32_groups) - state_dict['local_sub_partitions_of_fp32_groups'] = fp32_groups_without_padding - - return state_dict - - def state_dict(self): - """ - Returns a dict containing the current state of this :class:`FP16_Optimizer` instance. - This dict contains attributes of :class:`FP16_Optimizer`, as well as the state_dict - of the contained Pytorch optimizer. - Example:: - checkpoint = {} - checkpoint['model'] = model.state_dict() - checkpoint['optimizer'] = optimizer.state_dict() - torch.save(checkpoint, "saved.pth") - """ - if self.elastic_checkpoint: - return self._elastic_state_dict() - - return self._rigid_state_dict() - - # Extract the fp32 weights of the current rank from checkpoint by merging the - # sub partitions of communication intervals across ranks. 
- # Let sub_i_j = sub partition of rank i and comm interval j - # For 2 ranks and 2 comm intervals, checkpoints (minus padding) are as follows: - # rank 0 = [sub_0_0, sub_0_1] - # rank 1 = [sub_1_0, sub_1_1] - # Merge to get [sub_0_0, sub_1_0, sub_0_1, sub_1_1] => original un-padded flattened tensor. - def _retrieve_group_sub_partition_weights(self, - all_partition_fp32_weights, - max_elems_per_comm): - num_partitions = len(all_partition_fp32_weights) - num_comm_intervals = len(all_partition_fp32_weights[0]) - num_sub_partitions = num_partitions * num_comm_intervals - all_sub_partition_weights = [None] * num_sub_partitions - - for rank, partition_weights in enumerate(all_partition_fp32_weights): - for comm_idx, sub_partition_weights in enumerate(partition_weights): - #all_sub_partition_weights.append(sub_partition_weights) - sub_partition_idx = (comm_idx * num_partitions) + rank - all_sub_partition_weights[sub_partition_idx] = sub_partition_weights - - flat_merged_weights = flatten_dense_tensors_sub_partition_aligned( - tensor_list=all_sub_partition_weights, - dp=dist.get_world_size(group=self.dp_process_group), - max_elements_per_comm=max_elems_per_comm, - pg=self.dp_process_group) - - comm_partitions, dp_sub_partitions, element_intervals, sub_partition_size, num_comm_intervals = \ - self.get_data_parallel_sub_partitions( - tensor=flat_merged_weights, - max_elements_per_comm=max_elems_per_comm, - world_size=dist.get_world_size(group=self.dp_process_group), - dp_process_group=self.dp_process_group - ) - - partition_id = dist.get_rank(group=self.dp_process_group) - return [sub_partition for sub_partition in dp_sub_partitions[partition_id]] - - # Restore base optimizer fp32 weights from checkpoint by: - # 1) Merging fp32 weights from checkpoints of all partitions - # 2) Extracting fp32 weights for current partition from merged weights - # 3) Using extracted weights to update base optimizer weights directly. 
- def _restore_from_fp32_weights(self, all_state_dict): - sub_partition_of_fp32_groups = [] - for group_idx in range(len(self.local_sub_partitions_of_fp32_groups)): - all_partition_fp32_weights = [ - sd['local_sub_partitions_of_fp32_groups'][group_idx] - for sd in all_state_dict - ] - max_elems_per_comm = self.max_elems_per_comm[group_idx] - - sub_partition_weights = self._retrieve_group_sub_partition_weights( - all_partition_fp32_weights, - max_elems_per_comm) - sub_partition_of_fp32_groups.append(sub_partition_weights) - - for current_group, saved_group in zip(self.local_sub_partitions_of_fp32_groups, sub_partition_of_fp32_groups): - for current_sub_part, saved_sub_part in zip(current_group, saved_group): - current_sub_part.data.copy_(saved_sub_part.data) - - # Extract optimizer state for current partition from merged states of all partitions - def _partition_base_optimizer_state(self, - state_key, - all_partition_states, - max_elems_per_comm): - if not torch.is_tensor(all_partition_states[0]): - return all_partition_states[0] - - alignment = dist.get_world_size(group=self.dp_process_group) - flat_merged_partitions = flatten_dense_tensors_sub_partition_aligned( - tensor_list=all_partition_states, - dp=dist.get_world_size(group=self.dp_process_group), - max_elements_per_comm=max_elems_per_comm, - pg=self.dp_process_group) - - comm_partitions, dp_sub_partitions, element_intervals, sub_partition_size, num_comm_intervals = \ - self.get_data_parallel_sub_partitions( - tensor=flat_merged_partitions, - max_elements_per_comm=max_elems_per_comm, - world_size=dist.get_world_size(group=self.dp_process_group), - dp_process_group=self.dp_process_group - ) - - partition_id = dist.get_rank(group=self.dp_process_group) - return [sub_partition for sub_partition in dp_sub_partitions[partition_id]] - - # Compute the optimizer state partitions for the group by - # 1) Merging state values across the previous partitioning. - # 2) Repartition state values for the new partitioning - # 3) Return state corresponding to local partition - def _retrieve_group_optimizer_states(self, all_partition_states, max_elems_per_comm): - merged_optimizer_states = {} - num_partitions = len(all_partition_states) - num_comm_intervals = len(all_partition_states[0]) - num_sub_partitions = num_partitions * num_comm_intervals - - for rank, partition_state in enumerate(all_partition_states): - for comm_idx, sub_partition_state in enumerate(partition_state): - for key, value in sub_partition_state.items(): - if not key in merged_optimizer_states.keys(): - merged_optimizer_states[key] = [None] * num_sub_partitions - - sub_partition_idx = (comm_idx * num_partitions) + rank - merged_optimizer_states[key][sub_partition_idx] = value - - group_optimizer_states = {} - for key, value in merged_optimizer_states.items(): - group_optimizer_states[key] = self._partition_base_optimizer_state( - key, - value, - max_elems_per_comm) - - return group_optimizer_states - - # Restore base optimizer state from checkpoint by - # 1) Merging optimizer state from checkpoints of all partitions - # 2) Extracting optimizer state for current partition from the merged state - # 3) Using the extracted value to directly update the base optimizer. 
- def _restore_base_optimizer_state(self, state_dict_list): - base_optimizer_group_states = [] - for group_idx in range(len(self.optimizer.param_groups)): - all_partition_group_states = [ - sd['base_optimizer_state'][group_idx] for sd in state_dict_list - ] - max_elems_per_comm = self.max_elems_per_comm[group_idx] - group_optimizer_states = self._retrieve_group_optimizer_states( - all_partition_group_states, - max_elems_per_comm) - base_optimizer_group_states.append(group_optimizer_states) - - for group_idx, group in enumerate(self.optimizer.param_groups): - for param_idx, param in enumerate(group['params']): - for key, saved in base_optimizer_group_states[group_idx].items(): - if torch.is_tensor(self.optimizer.state[param][key]): - current = self.optimizer.state[param][key] - current.data.copy_(saved[param_idx].data) - else: - self.optimizer.state[param][key] = saved - - # Restore base optimizer fp32 weights from ZeRO fp16 weights - def _restore_from_fp16_weights(self): - partition_id = dist.get_rank(group=self.dp_process_group) - for fp16_partitions, fp32_partitions in zip(self.parallel_sub_partitioned_fp16_groups, self.local_sub_partitions_of_fp32_groups): - for fp16_sub_partition, fp32_sub_partition in zip(fp16_partitions[partition_id], fp32_partitions): - fp32_sub_partition.data.copy_(fp16_sub_partition.data) - - # Refresh the fp32 master params from the fp16 copies. - def refresh_fp32_params(self): - self._restore_from_fp16_weights() - - def _rigid_load_state_dict(self, state_dict, load_optimizer_states=True): - - # I think it should actually be ok to reload the optimizer before the model. - self.loss_scaler = state_dict['loss_scaler'] - self.dynamic_loss_scale = state_dict['dynamic_loss_scale'] - self.overflow = state_dict['overflow'] - if load_optimizer_states: - self.optimizer.load_state_dict(state_dict['base_optimizer_state']) - - for curr_group, saved_group in zip(self.local_sub_partitions_of_fp32_groups, state_dict['local_sub_partitions_of_fp32_groups']): - for curr_param, saved_param in zip(curr_group, saved_group): - curr_param.data.copy_(saved_param.data) - - def _elastic_load_state_dict(self, - state_dict_list, - load_optimizer_states=True, - load_from_fp32_weights=False): - """ - Loads a state_dict created by an earlier call to state_dict(). - If ``fp16_optimizer_instance`` was constructed from some ``init_optimizer``, - whose parameters in turn came from ``model``, it is expected that the user - will call ``model.load_state_dict()`` before - ``fp16_optimizer_instance.load_state_dict()`` is called. - Example:: - model = torch.nn.Linear(D_in, D_out).cuda().half() - optimizer = torch.optim.SGD(model.parameters(), lr=1e-3) - optimizer = FP16_Optimizer(optimizer, static_loss_scale = 128.0) - ... - checkpoint = torch.load("saved.pth") - model.load_state_dict(checkpoint['model']) - optimizer.load_state_dict(checkpoint['optimizer']) - """ - # I think it should actually be ok to reload the optimizer before the model. - self.loss_scaler = state_dict_list[0]['loss_scaler'] - self.dynamic_loss_scale = state_dict_list[0]['dynamic_loss_scale'] - self.overflow = state_dict_list[0]['overflow'] - - if load_optimizer_states: - self._restore_base_optimizer_state(state_dict_list) - - if load_from_fp32_weights: - self._restore_from_fp32_weights(state_dict_list) - else: - self._restore_from_fp16_weights() - - def load_state_dict(self, - state_dict_list, - load_optimizer_states=True, - load_from_fp32_weights=False): - """ - Loads a state_dict created by an earlier call to state_dict(). 
- If ``fp16_optimizer_instance`` was constructed from some ``init_optimizer``, - whose parameters in turn came from ``model``, it is expected that the user - will call ``model.load_state_dict()`` before - ``fp16_optimizer_instance.load_state_dict()`` is called. - Example:: - model = torch.nn.Linear(D_in, D_out).cuda().half() - optimizer = torch.optim.SGD(model.parameters(), lr=1e-3) - optimizer = FP16_Optimizer(optimizer, static_loss_scale = 128.0) - ... - checkpoint = torch.load("saved.pth") - model.load_state_dict(checkpoint['model']) - optimizer.load_state_dict(checkpoint['optimizer']) - """ - if self.elastic_checkpoint: - self._elastic_load_state_dict(state_dict_list, - load_optimizer_states, - load_from_fp32_weights) - else: - self._rigid_load_state_dict( - state_dict_list[dist.get_rank(group=self.dp_process_group)], - load_optimizer_states) - - def _dump_optimizer_state(self, message): - logger.info(f'{message}') - for i, group in enumerate(self.optimizer.param_groups): - for j, param in enumerate(group['params']): - for key, value in self.optimizer.state[param].items(): - t_stats = [ - value.min(), - value.max(), - (value.max() - value.min()), - value.mean() - ] - stats = [float(t) for t in t_stats] - logger.info( - f'group/param/key/min/max/delta/mean = {i}, {j}, {key}: {stats}') diff --git a/deepspeed/runtime/zero/stage2.py b/deepspeed/runtime/zero/stage2.py deleted file mode 100755 index cd29625958c9e56bd631bacaa12dc2b7146ae8b4..0000000000000000000000000000000000000000 --- a/deepspeed/runtime/zero/stage2.py +++ /dev/null @@ -1,1882 +0,0 @@ -''' -Copyright 2019 The Microsoft DeepSpeed Team -''' - -import torch -from torch._utils import _flatten_dense_tensors, _unflatten_dense_tensors -from torch.distributed.distributed_c10d import _get_global_rank -import torch.distributed as dist -import math -from torch._six import inf -from torch.autograd import Variable - -import collections - -from deepspeed.runtime.fp16.loss_scaler import LossScaler, DynamicLossScaler -from deepspeed.runtime.utils import see_memory_usage, is_model_parallel_parameter -from deepspeed.runtime.zero.config import ZERO_OPTIMIZATION_GRADIENTS -from deepspeed.ops.adam import DeepSpeedCPUAdam - -from deepspeed.utils import logger -from ...ops.op_builder import UtilsBuilder - -#Toggle this to true to enable correctness test -#with gradient partitioning and without -pg_correctness_test = False - - -def input(msg): - return - - -def split_half_float_double(tensors): - dtypes = [ - "torch.cuda.HalfTensor", - "torch.cuda.FloatTensor", - "torch.cuda.DoubleTensor" - ] - buckets = [] - for i, dtype in enumerate(dtypes): - bucket = [t for t in tensors if t.type() == dtype] - if bucket: - buckets.append(bucket) - return buckets - - -def isclose(a, b, rtol=1e-09, atol=0.0): - return abs(a - b) <= max(rtol * max(abs(a), abs(b)), atol) - - -def lcm(x, y): - from fractions import gcd # or can import gcd from `math` in Python 3 - return x * y // gcd(x, y) - - -# create a flat tensor aligned at the alignment boundary -def flatten_dense_tensors_aligned(tensor_list, alignment): - num_elements = 0 - for tensor in tensor_list: - num_elements = num_elements + tensor.numel() - - remaining = num_elements % alignment - - if remaining: - elements_to_add = alignment - remaining - pad_tensor = torch.zeros(elements_to_add, - device=tensor_list[0].device, - dtype=tensor_list[0].dtype) - padded_tensor_list = tensor_list + [pad_tensor] - - num_elements = num_elements + elements_to_add - else: - padded_tensor_list = tensor_list - - return 
_flatten_dense_tensors(padded_tensor_list) - - -def get_alignment_padding(tensor_list, alignment): - num_elements = sum([tensor.numel() for tensor in tensor_list]) - remainder = num_elements % alignment - return (alignment - remainder) if remainder else remainder - - -def move_to_cpu(tensor_list): - for tensor in tensor_list: - tensor.data = tensor.data.cpu() - - -def print_rank_msg(msg): - print(f"rank {dist.get_rank()} - {msg}") - - -class FP16_DeepSpeedZeroOptimizer(object): - """ - DeepSpeedZeroOptimizer designed to reduce the memory footprint - required for training large deep learning models. - - For more details please see ZeRO: Memory Optimization Towards Training A Trillion Parameter Models - https://arxiv.org/abs/1910.02054 - - For usage examples, refer to TODO: DeepSpeed Tutorial - - """ - def __init__(self, - init_optimizer, - timers, - static_loss_scale=1.0, - dynamic_loss_scale=False, - dynamic_loss_args=None, - verbose=True, - contiguous_gradients=True, - reduce_bucket_size=500000000, - allgather_bucket_size=5000000000, - dp_process_group=None, - reduce_scatter=True, - overlap_comm=False, - cpu_offload=False, - mpu=None, - clip_grad=0.0, - allreduce_always_fp32=False, - postscale_gradients=True, - gradient_predivide_factor=1.0, - gradient_accumulation_steps=1): - - # Load pre-installed or JIT compile (un)flatten ops - util_ops = UtilsBuilder().load() - self.flatten = util_ops.flatten - self.unflatten = util_ops.unflatten - - if dist.get_rank() == 0: - logger.info(f"Reduce bucket size {reduce_bucket_size}") - logger.info(f"Allgather bucket size {allgather_bucket_size}") - logger.info(f"CPU Offload: {cpu_offload}") - # The fused optimizer does all the work. We need this layer for two reason: - # 1. maintain same user API from apex.fp16_utils - # 2. keep common stuff here in case we need to add ne552w fused optimizer later - - # differences from apex.fp16_utils: - # - assume all model params in fp16 - # - assume all params requires grad - # - flat by groups, not keeping state. TODO: remove state explicitly? - # - master gard and unflat master weight never exist. TODO: a way to save out unflat master? 
- if not torch.cuda.is_available: - raise SystemError("Cannot use fp16 without CUDA.") - self.optimizer = init_optimizer - - self.timers = timers - - self.reduce_scatter = reduce_scatter - - self.overlap_comm = overlap_comm - - self.cpu_offload = cpu_offload - - self.deepspeed_adam_offload = cpu_offload - - self.device = torch.cuda.current_device() if not self.cpu_offload else 'cpu' - - self.dp_process_group = dp_process_group - - self.partition_count = dist.get_world_size(group=self.dp_process_group) - - if mpu is None: - self.model_parallel_group = None - self.model_parallel_rank = 0 - else: - self.model_parallel_group = mpu.get_model_parallel_group() - self.model_parallel_rank = mpu.get_model_parallel_rank() - - self.overflow = False - self.clip_grad = clip_grad - self.allreduce_always_fp32 = allreduce_always_fp32 - self.gradient_predivide_factor = gradient_predivide_factor - self.postscale_gradients = postscale_gradients - self.gradient_accumulation_steps = gradient_accumulation_steps - self.micro_step_id = 0 - - if self.reduce_scatter: - assert not self.allreduce_always_fp32, "allreduce_always_fp32 is not yet supported with ZeRO-2 with reduce scatter enabled" - assert self.gradient_predivide_factor == 1.0, "gradient_predivide_factor != 1.0 is not yet supported with ZeRO-2 with reduce scatter enabled" - assert self.postscale_gradients, "pre-scale gradients is not yet supported with ZeRO-2 with reduce scatter enabled" - - # param flattened by groups - self.fp16_groups = [] - self.fp16_groups_flat = [] - - #param partitioned by data parallel degree - #this will contain a list of equal sized tensors - #each of which will be updated by a different process - self.parallel_partitioned_fp16_groups = [] - - #a single 32-bit partition of the parallel partitioned parameters - #that this process will update - self.single_partition_of_fp32_groups = [] - - #param partition info - - #These are the parameters in each group that will not be updated by this process directly - self.params_not_in_partition = [] - - #These are the parameters that will be updated by this process directly - self.params_in_partition = [] - - #Offset from the first paramter in the the self.params_in_partition - #the parameter boundaries may not align with partition boundaries - #so we need to keep track of the offset - self.first_offset = [] - - #number of elements per partition in each group - self.partition_size = [] - - partition_id = dist.get_rank(group=self.dp_process_group) - - self.all_reduce_print = False - - # padding on each partition for alignment purposes - self.groups_padding = [] - # loop to deal with groups - for i, param_group in enumerate(self.optimizer.param_groups): - # push this group to list before modify - self.fp16_groups.append(param_group['params']) - # Record padding required to align group to world size - if partition_id == dist.get_world_size(group=self.dp_process_group) - 1: - padding = get_alignment_padding(self.fp16_groups[i], - self.partition_count) - else: - padding = 0 - self.groups_padding.append(padding) - - #not sure why apex was cloning the weights before flattening - #removing cloning here - - see_memory_usage(f"Before moving param group {i} to CPU") - #move all the parameters to cpu to free up GPU space for creating flat buffer - move_to_cpu(self.fp16_groups[i]) - see_memory_usage(f"After moving param group {i} to CPU") - - #create flat buffer in CPU and move to GPU - self.fp16_groups_flat.append( - flatten_dense_tensors_aligned( - self.fp16_groups[i], - 
dist.get_world_size(group=self.dp_process_group)).cuda( - torch.cuda.current_device())) - see_memory_usage(f"After flattening and moving param group {i} to GPU") - - if dist.get_rank(group=self.dp_process_group) == 0: - see_memory_usage( - f"After Flattening and after emptying param group {i} cache") - - # set model fp16 weight to slices of flattened buffer - updated_params = _unflatten_dense_tensors(self.fp16_groups_flat[i], - self.fp16_groups[i]) - for p, q in zip(self.fp16_groups[i], updated_params): - p.data = q.data - - #divide the flat weights into near equal partition equal to the data parallel degree - #each process will compute on a different part of the partition - data_parallel_partitions = self.get_data_parallel_partitions( - self.fp16_groups_flat[i]) - self.parallel_partitioned_fp16_groups.append(data_parallel_partitions) - - # a partition of the fp32 master weights that will be updated by this process - self.single_partition_of_fp32_groups.append( - self.parallel_partitioned_fp16_groups[i][partition_id].to( - self.device).clone().float().detach()) - - # modify optimizer of have flat master weight - self.single_partition_of_fp32_groups[ - i].requires_grad = True # keep this in case internal optimizer uses it - param_group['params'] = [self.single_partition_of_fp32_groups[i]] - - partition_size = len(self.fp16_groups_flat[i]) / dist.get_world_size( - group=self.dp_process_group) - params_in_partition, params_not_in_partition, first_offset = self.get_partition_info(self.fp16_groups[i], partition_size, partition_id) - - self.partition_size.append(partition_size) - self.params_in_partition.append(params_in_partition) - self.params_not_in_partition.append(params_not_in_partition) - self.first_offset.append(first_offset) - - self.reduce_bucket_size = int(reduce_bucket_size) - self.allgather_bucket_size = int(allgather_bucket_size) - - self.reduction_event = torch.cuda.Event(enable_timing=False, blocking=False) - self.reduction_stream = torch.cuda.Stream() - self.cpu_computation_stream = torch.cuda.Stream() - self.migration_stream = torch.cuda.Stream() - self.callback_queued = False - - self.param_dict = {} - - #map between param_id and bool to specify if a param is in this partition - self.is_param_in_current_partition = {} - - # CPU-Offload requires contiguous gradients - self.contiguous_gradients = contiguous_gradients or cpu_offload - self.grads_in_ipg_bucket = [] - self.params_in_ipg_bucket = [] - self.elements_in_ipg_bucket = 0 - self.params_already_reduced = [] - self._release_ipg_buffers() - self.previous_reduced_grads = None - - #simplified param id - self.param_id = {} - - largest_param_numel = 0 - count = 0 - for i, params_group in enumerate(self.fp16_groups): - for param in params_group: - unique_id = id(param) - self.param_id[unique_id] = count - self.param_dict[count] = param - self.params_already_reduced.append(False) - if param.numel() > largest_param_numel: - largest_param_numel = param.numel() - count = count + 1 - - for param_group in self.params_in_partition: - for param in param_group: - self.is_param_in_current_partition[self.get_param_id(param)] = True - - for param_group in self.params_not_in_partition: - for param in param_group: - self.is_param_in_current_partition[self.get_param_id(param)] = False - - if self.cpu_offload: - self.accumulated_grads_in_cpu = {} - self.norm_for_param_grads = {} - self.local_overflow = False - self.grad_position = {} - self.temp_grad_buffer_for_cpu_offload = torch.zeros( - largest_param_numel, - 
device=self.device).half().pin_memory() - self.temp_grad_buffer_for_gpu_offload = torch.zeros( - largest_param_numel, - device=torch.cuda.current_device()).half() - - for i, params_group in enumerate(self.fp16_groups): - self.get_grad_position(i, - self.params_in_partition[i], - self.first_offset[i], - self.partition_size[i]) - - #mapping from parameter to partition that it belongs to - self.param_to_partition_ids = {} - - #stores if a partition has been reduced in this step - self.is_partition_reduced = {} - - #number of grads in partition that still need to be computed - self.remaining_grads_in_partition = {} - - #total number of grads in partition - self.total_grads_in_partition = {} - - #stores if a grad in a partition has been computed or not - self.is_grad_computed = {} - - #stores the offset at which a parameter gradient needs to be inserted in a partition - self.grad_partition_insertion_offset = {} - - #the offset in the gradient at which it must be inserted at the beginning of the partition - self.grad_start_offset = {} - - #will store the averaged gradients required by this partition - self.averaged_gradients = {} - - # store index of first parameter in each partition - self.first_param_index_in_partition = {} - - #initializes all data structures for implementing gradient partitioning - self.initialize_gradient_partitioning_data_structures() - - #resets the data structure value for the next backward propagation - self.reset_partition_gradient_structures() - - #creates backward hooks for gradient partitioning - self.create_reduce_and_remove_grad_hooks() - - # we may have a way of fusing dynamic scale. Do not support for now - if dynamic_loss_scale: - if dynamic_loss_args is None: - self.loss_scaler = DynamicLossScaler() - else: - self.loss_scaler = DynamicLossScaler(**dynamic_loss_args) - - self.dynamic_loss_scale = True - - else: - self.dynamic_loss_scale = False - self.loss_scaler = LossScaler(scale=static_loss_scale) - self.cur_iter = 0 - - see_memory_usage("Before initializing optimizer states") - self.initialize_optimizer_states() - see_memory_usage("After initializing optimizer states") - - if dist.get_rank() == 0: - logger.info(f"optimizer state initialized") - - if dist.get_rank(group=self.dp_process_group) == 0: - see_memory_usage(f"After initializing ZeRO optimizer") - - def _release_ipg_buffers(self): - if self.contiguous_gradients: - self.ipg_buffer = None - self.grads_in_partition = None - self.grads_in_partition_offset = 0 - - def initialize_optimizer_states(self): - - for i, group in enumerate(self.fp16_groups): - single_grad_partition = torch.zeros( - int(self.partition_size[i]), - dtype=self.single_partition_of_fp32_groups[i].dtype, - device=self.device) - self.single_partition_of_fp32_groups[ - i].grad = single_grad_partition.pin_memory( - ) if self.cpu_offload else single_grad_partition - - self.optimizer.step() - - if not self.cpu_offload: - for group in self.single_partition_of_fp32_groups: - group.grad = None - - return - - ######################################################################### - #########################ZeRO Partition Gradients######################## - ######################################################################### - - def get_first_param_index(self, group_id, param_group, partition_id): - for index, param in enumerate(param_group): - param_id = self.get_param_id(param) - if partition_id in self.param_to_partition_ids[group_id][param_id]: - return index - return None - - def 
initialize_gradient_partitioning_data_structures(self): - - total_partitions = dist.get_world_size(group=self.dp_process_group) - - for i, param_group in enumerate(self.fp16_groups): - - self.param_to_partition_ids[i] = {} - self.is_partition_reduced[i] = {} - self.total_grads_in_partition[i] = {} - self.remaining_grads_in_partition[i] = {} - self.is_grad_computed[i] = {} - self.grad_partition_insertion_offset[i] = {} - self.grad_start_offset[i] = {} - self.first_param_index_in_partition[i] = {} - - for partition_id in range(total_partitions): - self.is_grad_computed[i][partition_id] = {} - self.grad_partition_insertion_offset[i][partition_id] = {} - self.grad_start_offset[i][partition_id] = {} - self.total_grads_in_partition[i][partition_id] = 0 - self.initialize_gradient_partition(i, param_group, partition_id) - self.is_partition_reduced[i][partition_id] = False - self.first_param_index_in_partition[i][ - partition_id] = self.get_first_param_index( - i, - param_group, - partition_id) - - def independent_gradient_partition_epilogue(self): - self.report_ipg_memory_usage(f"In ipg_epilogue before reduce_ipg_grads", 0) - self.reduce_ipg_grads() - self.report_ipg_memory_usage(f"In ipg_epilogue after reduce_ipg_grads", 0) - - #if dist.get_rank() == 0: - # logger.info("Params already reduced %s", self.params_already_reduced) - for i in range(len(self.params_already_reduced)): - self.params_already_reduced[i] = False - - if self.overlap_comm: - torch.cuda.synchronize() - # It is safe to clear previously reduced grads of other partitions - self._clear_previous_reduced_grads() - - if self.cpu_offload is False: - for i, _ in enumerate(self.fp16_groups): - - if not i in self.averaged_gradients or self.averaged_gradients[i] is None: - self.averaged_gradients[i] = self.get_flat_partition( - self.params_in_partition[i], - self.first_offset[i], - self.partition_size[i], - dtype=torch.half, - device=torch.cuda.current_device(), - return_tensor_list=True) - else: - avg_new = self.get_flat_partition(self.params_in_partition[i], - self.first_offset[i], - self.partition_size[i], - dtype=torch.half, - device=torch.cuda.current_device(), - return_tensor_list=True) - - for accumulated_grad, new_avg_grad in zip(self.averaged_gradients[i],avg_new): - accumulated_grad.add_(new_avg_grad) - - self._release_ipg_buffers() - - # No need to keep the gradients anymore. 
- # All gradients required by the step - # are in self.averaged_gradients - self.zero_grad() - see_memory_usage(f"End ipg_epilogue") - - # resets all partition to no reduced - # sets remaining grads to the total number of grads in each partition - # set is grad computed to false for all grads in partition - def reset_partition_gradient_structures(self): - total_partitions = dist.get_world_size(group=self.dp_process_group) - for i, _ in enumerate(self.fp16_groups): - for partition_id in range(total_partitions): - self.is_partition_reduced[i][partition_id] = False - self.remaining_grads_in_partition[i][ - partition_id] = self.total_grads_in_partition[i][partition_id] - - for param_id in self.is_grad_computed[i][partition_id]: - self.is_grad_computed[i][partition_id][param_id] = False - - def initialize_gradient_partition(self, i, param_group, partition_id): - def set_key_value_list(dictionary, key, value): - if key in dictionary: - dictionary[key].append(value) - else: - dictionary[key] = [value] - - def increment_value(dictionary, key): - if key in dictionary: - dictionary[key] += 1 - else: - dictionary[key] = 1 - - partition_size = self.partition_size[i] - - start_index = partition_size * partition_id - end_index = partition_size * (partition_id + 1) - - current_index = 0 - first_offset = 0 - - for param in param_group: - - param_size = param.numel() - param_id = self.get_param_id(param) - - if (current_index >= start_index and current_index < end_index): - set_key_value_list(self.param_to_partition_ids[i], - param_id, - partition_id) - increment_value(self.total_grads_in_partition[i], partition_id) - - self.is_grad_computed[i][partition_id][param_id] = False - - self.grad_partition_insertion_offset[i][partition_id][ - param_id] = current_index - start_index - self.grad_start_offset[i][partition_id][param_id] = 0 - - elif start_index > current_index and start_index < (current_index + - param_size): - assert (first_offset==0), "This can happen either zero or only once as this must be the first tensor in the partition" - first_offset = start_index - current_index - - set_key_value_list(self.param_to_partition_ids[i], - param_id, - partition_id) - increment_value(self.total_grads_in_partition[i], partition_id) - - self.is_grad_computed[i][partition_id][param_id] = False - - self.grad_partition_insertion_offset[i][partition_id][param_id] = 0 - self.grad_start_offset[i][partition_id][param_id] = first_offset - - current_index = current_index + param_size - - def overlapping_partition_gradients_reduce_epilogue(self): - self.independent_gradient_partition_epilogue() - - def create_reduce_and_remove_grad_hooks(self): - self.grad_accs = [] - for i, param_group in enumerate(self.fp16_groups): - for param in param_group: - if param.requires_grad: - - def wrapper(param, i): - param_tmp = param.expand_as(param) - grad_acc = param_tmp.grad_fn.next_functions[0][0] - - def reduce_partition_and_remove_grads(*notneeded): - self.reduce_ready_partitions_and_remove_grads(param, i) - - grad_acc.register_hook(reduce_partition_and_remove_grads) - self.grad_accs.append(grad_acc) - - wrapper(param, i) - - def get_param_id(self, param): - unique_id = id(param) - return self.param_id[unique_id] - - def report_ipg_memory_usage(self, tag, param_elems): - elem_count = self.elements_in_ipg_bucket + param_elems - percent_of_bucket_size = (100.0 * elem_count) // self.reduce_bucket_size - see_memory_usage( - f"{tag}: elems in_bucket {self.elements_in_ipg_bucket} param {param_elems} max_percent {percent_of_bucket_size}" - ) 
- - ############### Independent Partition Gradient ######################## - def reduce_independent_p_g_buckets_and_remove_grads(self, param, i): - if self.elements_in_ipg_bucket + param.numel() > self.reduce_bucket_size: - self.report_ipg_memory_usage("In ipg_remove_grads before reduce_ipg_grads", - param.numel()) - self.reduce_ipg_grads() - if self.contiguous_gradients and self.overlap_comm: - # Swap ipg_index between 0 and 1 - self.ipg_index = 1 - self.ipg_index - self.report_ipg_memory_usage("In ipg_remove_grads after reduce_ipg_grads", - param.numel()) - - param_id = self.get_param_id(param) - - assert self.params_already_reduced[param_id] == False, \ - f"The parameter {param_id} has already been reduced. \ - Gradient computed twice for this partition. \ - Multiple gradient reduction is currently not supported" - - #keeping the gradients contiguous to prevent memory fragmentation, and avoid flattening - if self.contiguous_gradients: - new_grad_tensor = self.ipg_buffer[self.ipg_index].narrow( - 0, - self.elements_in_ipg_bucket, - param.numel()) - new_grad_tensor.copy_(param.grad.view(-1)) - param.grad.data = new_grad_tensor.data.view_as(param.grad) - - self.elements_in_ipg_bucket += param.numel() - - assert param.grad is not None, f"rank {dist.get_rank()} - Invalid to reduce Param {param_id} with None gradient" - - self.grads_in_ipg_bucket.append(param.grad) - self.params_in_ipg_bucket.append((i, param, param_id)) - - self.report_ipg_memory_usage("End ipg_remove_grads", 0) - - def print_rank_0(self, message): - if dist.get_rank() == 0: - logger.info(message) - - def gradient_reduction_w_predivide(self, tensor): - dp_world_size = dist.get_world_size(group=self.dp_process_group) - - tensor_to_allreduce = tensor - - if self.allreduce_always_fp32: - tensor_to_allreduce = tensor.float() - - if self.postscale_gradients: - if self.gradient_predivide_factor != 1.0: - tensor_to_allreduce.mul_(1. / self.gradient_predivide_factor) - - dist.all_reduce(tensor_to_allreduce, group=self.dp_process_group) - - if self.gradient_predivide_factor != dp_world_size: - tensor_to_allreduce.mul_(self.gradient_predivide_factor / dp_world_size) - else: - tensor_to_allreduce.div_(dp_world_size) - dist.all_reduce(tensor_to_allreduce, group=self.dp_process_group) - - if self.allreduce_always_fp32 and tensor is not tensor_to_allreduce: - tensor.copy_(tensor_to_allreduce) - - return tensor - - def average_tensor(self, tensor): - if self.overlap_comm: - torch.cuda.synchronize() - stream = self.reduction_stream - else: - stream = torch.cuda.current_stream() - - with torch.cuda.stream(stream): - if not self.reduce_scatter: - self.gradient_reduction_w_predivide(tensor) - return - - # Accumulate destination ranks and bucket offsets for each gradient slice. - # Note: potential future optimization, record access pattern of parameters - # in backward pass and partition gradients w.r.t. access pattern so that our - # bucket is guaranteed to be contiguous w.r.t. 
ranks - rank_and_offsets = [] - curr_size = 0 - prev_id = -1 - for i, param, param_id in self.params_in_ipg_bucket: - partition_ids = self.param_to_partition_ids[i][param_id] - partition_size = self.partition_size[i] - # Get all partition ids + their offsets - partition_ids_w_offsets = [] - for partition_id in partition_ids: - offset = self.grad_start_offset[i][partition_id][param_id] - partition_ids_w_offsets.append((partition_id, offset)) - partition_ids_w_offsets.sort(key=lambda t: t[1]) - - # Calculate rank and offsets for grad slices - for idx in range(len(partition_ids_w_offsets)): - partition_id, offset = partition_ids_w_offsets[idx] - - # Calculate numel for grad slice depending on partition location - if idx == len(partition_ids_w_offsets) - 1: - # Last partition_id uses its own offset - numel = param.numel() - offset - else: - # Set numel to next partition's offset - numel = partition_ids_w_offsets[idx + 1][1] - offset - - # Merge bucket ranges if they belong to the same rank - if partition_id == prev_id: - prev_pid, prev_size, prev_numel = rank_and_offsets[-1] - rank_and_offsets[-1] = (prev_pid, prev_size, prev_numel + numel) - else: - rank_and_offsets.append((partition_id, curr_size, numel)) - - curr_size += numel - prev_id = partition_id - tensor.div_(dist.get_world_size(group=self.dp_process_group)) - - async_handles = [] - for dst, bucket_offset, numel in rank_and_offsets: - grad_slice = tensor.narrow(0, int(bucket_offset), int(numel)) - dst_rank = _get_global_rank(self.dp_process_group, dst) - async_handle = dist.reduce(grad_slice, - dst=dst_rank, - group=self.dp_process_group, - async_op=True) - async_handles.append(async_handle) - - for handle in async_handles: - handle.wait() - - ############################################################################## - ############################# CPU Offload Methods############################# - ############################################################################## - def get_grad_position(self, group_id, tensor_list, first_offset, partition_size): - current_offset = 0 - - for i, tensor in enumerate(tensor_list): - param_id = self.get_param_id(tensor) - param_start_offset = 0 - - num_elements = tensor.numel() - tensor_offset = 0 - - #we need to offset to get to the right element - if i == 0 and first_offset > 0: - tensor_offset = first_offset - num_elements = num_elements - tensor_offset - param_start_offset = first_offset - - #we dont need all elements of the tensor - if num_elements > (partition_size - current_offset): - num_elements = partition_size - current_offset - - self.grad_position[param_id] = [ - int(group_id), - int(param_start_offset), - int(current_offset), - int(num_elements) - ] - current_offset += num_elements - - def update_overflow_tracker_for_param_grad(self, param): - if param.grad is not None and self._has_inf_or_nan(param.grad.data): - self.local_overflow = True - - def async_accumulate_grad_in_cpu(self, param): - param_id = self.get_param_id(param) - - #copy to a preexisiting buffer to avoid memory allocation penalty - dest_buffer = self.temp_grad_buffer_for_cpu_offload.view(-1).narrow( - 0, - 0, - param.numel()) - dest_buffer.copy_(param.grad.view(-1), non_blocking=True) - - if param_id not in self.accumulated_grads_in_cpu: - self.accumulated_grads_in_cpu[param_id] = torch.zeros( - param.numel(), - dtype=param.dtype, - device=self.device).pin_memory() - - self.accumulated_grads_in_cpu[param_id].add_(dest_buffer) - - def async_accumulate_grad_in_cpu_via_gpu(self, param): - param_id = 
self.get_param_id(param) - - #copy to a preexisiting buffer to avoid memory allocation penalty - dest_buffer = self.temp_grad_buffer_for_gpu_offload.view(-1).narrow( - 0, - 0, - param.numel()) - - if param_id not in self.accumulated_grads_in_cpu: - self.accumulated_grads_in_cpu[param_id] = torch.zeros( - param.numel(), - dtype=param.dtype, - device=self.device).pin_memory() - - if self.micro_step_id > 0: - dest_buffer.copy_(self.accumulated_grads_in_cpu[param_id].view(-1), - non_blocking=True) - param.grad.data.view(-1).add_(dest_buffer) - - #at the boundary we will send 32bit directly - if not self.is_gradient_accumulation_boundary: - self.accumulated_grads_in_cpu[param_id].data.copy_(param.grad.data.view(-1), - non_blocking=True) - - def set_norm_for_param_grad(self, param): - param_id = self.get_param_id(param) - accumulated_grad = self.accumulated_grads_in_cpu[ - param_id] if self.gradient_accumulation_steps > 1 else param.grad - - [i, source_offset, dest_offset, num_elements] = self.grad_position[param_id] - - start = source_offset - accumulated_grad = accumulated_grad.view(-1).narrow(0, start, num_elements) - - self.norm_for_param_grads[param_id] = accumulated_grad.data.double().norm(2) - - def set_norm_for_param_grad_in_gpu(self, param): - param_id = self.get_param_id(param) - accumulated_grad = param.grad - - [i, source_offset, dest_offset, num_elements] = self.grad_position[param_id] - - start = source_offset - accumulated_grad = accumulated_grad.view(-1).narrow(0, start, num_elements) - - self.norm_for_param_grads[param_id] = accumulated_grad.data.double().norm(2) - - def async_inplace_copy_grad_to_fp32_buffer(self, param): - param_id = self.get_param_id(param) - - [i, source_offset, dest_offset, num_elements] = self.grad_position[param_id] - - dest_tensor = self.single_partition_of_fp32_groups[i].grad.view(-1).narrow( - 0, - dest_offset, - num_elements) - if self.gradient_accumulation_steps > 1: - src_tensor = self.accumulated_grads_in_cpu[param_id].view(-1).narrow( - 0, - source_offset, - num_elements) - else: - src_tensor = param.grad.view(-1).narrow(0, - source_offset, - num_elements).float() - dest_tensor.copy_(src_tensor, non_blocking=True) - - def async_inplace_copy_grad_to_fp32_buffer_from_gpu(self, param): - param_id = self.get_param_id(param) - - [i, source_offset, dest_offset, num_elements] = self.grad_position[param_id] - - dest_tensor = self.single_partition_of_fp32_groups[i].grad.view(-1).narrow( - 0, - dest_offset, - num_elements) - - src_tensor = param.grad.view(-1).narrow(0, source_offset, num_elements).float() - dest_tensor.copy_(src_tensor, non_blocking=True) - param.grad = None - - def complete_grad_norm_calculation_for_cpu_offload(self, params): - total_norm = 0.0 - norm_type = 2.0 - for p in params: - if is_model_parallel_parameter(p) or (self.model_parallel_rank == 0): - param_id = self.get_param_id(p) - # as some model have trainable parameters but skipped in training, - # their backward hooks in self.create_reduce_and_remove_grad_hooks() will not run, - # so they have no norm_for_param_grads - if param_id in self.norm_for_param_grads: - param_norm = self.norm_for_param_grads[param_id] - total_norm += param_norm.item()**2 - - # Sum across all model parallel GPUs. 
- total_norm_cuda = torch.cuda.FloatTensor([float(total_norm)]) - - torch.distributed.all_reduce(total_norm_cuda, - op=torch.distributed.ReduceOp.SUM, - group=self.dp_process_group) - - self._model_parallel_all_reduce(tensor=total_norm_cuda, - op=torch.distributed.ReduceOp.SUM) - - total_norm = total_norm_cuda[0].item()**(1. / norm_type) - - if total_norm == float( - 'inf') or total_norm == -float('inf') or total_norm != total_norm: - total_norm = -1 - - return total_norm - - ############################################################################################ - - def copy_grads_in_partition(self, param): - if self.cpu_offload: - - if self.gradient_accumulation_steps > 1: - self.async_accumulate_grad_in_cpu_via_gpu(param) - - if self.is_gradient_accumulation_boundary: - self.set_norm_for_param_grad_in_gpu(param) - - self.update_overflow_tracker_for_param_grad(param) - - self.async_inplace_copy_grad_to_fp32_buffer_from_gpu(param) - - return - #print(f"ID {self.get_param_id(param)} grad norm {param.grad.norm()}") - if self.grads_in_partition is None: - self.grads_in_partition_offset = 0 - total_size = 0 - for group in self.params_in_partition: - for param_in_partition in group: - total_size += param_in_partition.numel() - - see_memory_usage(f"before copying {total_size} gradients into partition") - self.grads_in_partition = torch.empty(int(total_size), - dtype=torch.half, - device=torch.cuda.current_device()) - see_memory_usage(f"after copying {total_size} gradients into partition") - - #The allreduce buffer will be rewritted. Copy the gradients in partition to a new buffer - new_grad_tensor = self.grads_in_partition.view(-1).narrow( - 0, - self.grads_in_partition_offset, - param.numel()) - new_grad_tensor.copy_(param.grad.view(-1)) - param.grad.data = new_grad_tensor.data.view_as(param.grad) - #print(f"Grad norm after copy to contiguous_buffer {param.grad.data.norm()}") - self.grads_in_partition_offset += param.numel() - - def reduce_ipg_grads(self): - if self.overlap_comm: - stream = self.reduction_stream - else: - stream = torch.cuda.current_stream() - - if self.contiguous_gradients: - self.average_tensor(self.ipg_buffer[self.ipg_index]) - else: - self.buffered_reduce_fallback( - None, - self.grads_in_ipg_bucket, - elements_per_buffer=self.elements_in_ipg_bucket) - - with torch.cuda.stream(stream): - for _, param, param_id in self.params_in_ipg_bucket: - - assert self.params_already_reduced[param_id] == False, \ - f"The parameter {param_id} has already been reduced. \ - Gradient computed twice for this partition. \ - Multiple gradient reduction is currently not supported" - - self.params_already_reduced[param_id] = True - - if not self.is_param_in_current_partition[param_id]: - if self.overlap_comm and self.contiguous_gradients is False: - # Clear grads of other partitions during the next reduction - # to avoid clearing them before the reduction is complete. 
- if self.previous_reduced_grads is None: - self.previous_reduced_grads = [] - self.previous_reduced_grads.append(param) - else: - param.grad = None - elif self.contiguous_gradients: - self.copy_grads_in_partition(param) - - self.grads_in_ipg_bucket = [] - self.params_in_ipg_bucket = [] - self.elements_in_ipg_bucket = 0 - ##################################################################### - - def reduce_ready_partitions_and_remove_grads(self, param, i): - self.reduce_independent_p_g_buckets_and_remove_grads(param, i) - - def zero_reduced_gradients(self, partition_id, i): - def are_all_related_partitions_reduced(params_id): - for partition_id in self.param_to_partition_ids[i][params_id]: - if not self.is_partition_reduced[i][partition_id]: - return False - return True - - for params_id in self.is_grad_computed[i][partition_id]: - if are_all_related_partitions_reduced(params_id): - self.param_dict[params_id].grad = None - - def flatten_and_print(self, message, tensors, start=0, n=5): - flatten_tensor = _flatten_dense_tensors(tensors) - - def print_func(): - logger.info(flatten_tensor.contiguous().view(-1).narrow(0, start, n)) - - self.sequential_execution(print_func, message) - - def get_grads_to_reduce(self, i, partition_id): - def get_reducable_portion(key): - grad = self.param_dict[key].grad - total_elements = grad.numel() - start = self.grad_start_offset[i][partition_id][key] - num_elements = min( - total_elements - start, - self.partition_size[i] - - self.grad_partition_insertion_offset[i][partition_id][key]) - if not pg_correctness_test: - if num_elements == total_elements: - return grad - else: - return grad.contiguous().view(-1).narrow(0, - int(start), - int(num_elements)) - else: - if num_elements == total_elements: - return grad.clone() - else: - return grad.clone().contiguous().view(-1).narrow( - 0, - int(start), - int(num_elements)) - - grads_to_reduce = [] - for key in self.is_grad_computed[i][partition_id]: - grad = get_reducable_portion(key) - grads_to_reduce.append(grad) - return grads_to_reduce - - def sequential_execution(self, function, message, group=None): - if group is None: - group = self.dp_process_group - if dist.get_rank(group=group) == 0: - logger.info(message) - for id in range(dist.get_world_size(group=group)): - if id == dist.get_rank(group=group): - function() - dist.barrier(group=group) - - def set_none_gradients_to_zero(self, i, partition_id): - for param_id in self.is_grad_computed[i][partition_id]: - param = self.param_dict[param_id] - if param.grad is None: - param.grad = torch.zero_like(param) - - ######################Reduction Related Methods############################## - - def allreduce_bucket(self, bucket, allreduce_always_fp32=False, rank=None, log=None): - rank = None - tensor = self.flatten(bucket) - - tensor_to_allreduce = tensor - - if pg_correctness_test: - allreduce_always_fp32 = True - - if allreduce_always_fp32: - tensor_to_allreduce = tensor.float() - - tensor_to_allreduce.div_(dist.get_world_size(group=self.dp_process_group)) - - if rank is None: - # "All Reducing" - dist.all_reduce(tensor_to_allreduce, group=self.dp_process_group) - else: - global_rank = _get_global_rank(self.dp_process_group, rank) - dist.reduce(tensor_to_allreduce, global_rank, group=self.dp_process_group) - - if allreduce_always_fp32 and tensor is not tensor_to_allreduce: - if rank is None or rank == dist.get_rank(group=self.dp_process_group): - tensor.copy_(tensor_to_allreduce) - - return tensor - - def _clear_previous_reduced_grads(self): - if 
self.previous_reduced_grads is not None: - for param in self.previous_reduced_grads: - param.grad = None - self.previous_reduced_grads = None - - #if rank is specified do a reduction instead of an allreduce - def allreduce_and_copy(self, small_bucket, rank=None, log=None): - if self.overlap_comm: - torch.cuda.synchronize() - # It is safe to clear the previously reduced grads of other partitions - self._clear_previous_reduced_grads() - stream = self.reduction_stream - else: - stream = torch.cuda.current_stream() - - with torch.cuda.stream(stream): - allreduced = self.allreduce_bucket(small_bucket, rank=rank, log=log) - if rank is None or rank == dist.get_rank(group=self.dp_process_group): - for buf, synced in zip(small_bucket, self.unflatten(allreduced, small_bucket)): - buf.copy_(synced) - - def allreduce_no_retain(self, - bucket, - numel_per_bucket=500000000, - rank=None, - log=None): - small_bucket = [] - numel = 0 - for tensor in bucket: - small_bucket.append(tensor) - numel = numel + tensor.numel() - if numel > numel_per_bucket: - self.allreduce_and_copy(small_bucket, rank=rank, log=None) - small_bucket = [] - if len(small_bucket) > 0: - self.allreduce_and_copy(small_bucket, rank=rank, log=log) - - #allows using reduction of gradients instead of using all_reduce - def buffered_reduce_fallback(self, - rank, - grads, - elements_per_buffer=500000000, - log=None): - split_buckets = split_half_float_double(grads) - - for i, bucket in enumerate(split_buckets): - self.allreduce_no_retain(bucket, - numel_per_bucket=elements_per_buffer, - rank=rank, - log=log) - - ############################################################################# - ############################################################################# - ############################################################################# - - #views the tensor as multiple partitions and returns - #those partitions - def get_data_parallel_partitions(self, tensor): - partitions = [] - - dp = dist.get_world_size(group=self.dp_process_group) - dp_id = dist.get_rank(group=self.dp_process_group) - - total_num_elements = tensor.numel() - - base_size = total_num_elements // dp - remaining = total_num_elements % dp - - start = 0 - for id in range(dp): - partition_size = base_size - if id < remaining: - partition_size = partition_size + 1 - partitions.append(tensor.narrow(0, start, partition_size)) - start = start + partition_size - return partitions - - def get_partition_info(self, tensor_list, partition_size, partition_id): - params_in_partition = [] - params_not_in_partition = [] - - start_index = partition_size * partition_id - end_index = partition_size * (partition_id + 1) - - current_index = 0 - first_offset = 0 - - for tensor in tensor_list: - - tensor_size = tensor.numel() - - if (current_index >= start_index and current_index < end_index): - params_in_partition.append(tensor) - - elif start_index > current_index and start_index < (current_index + - tensor_size): - params_in_partition.append(tensor) - - assert (first_offset==0), "This can happen either zero or only once as this must be the first tensor in the partition" - first_offset = start_index - current_index - - else: - params_not_in_partition.append(tensor) - - current_index = current_index + tensor_size - - return params_in_partition, params_not_in_partition, first_offset - - def zero_grad(self, set_grads_to_None=True): - """ - Zero FP16 parameter grads. - """ - # FP32 grad should never exist. 
- # For speed, set model fp16 grad to None by default - for group in self.fp16_groups: - for p in group: - if set_grads_to_None: - p.grad = None - else: - if p.grad is not None: - p.grad.detach_() - p.grad.zero_() - - def _model_parallel_all_reduce(self, tensor, op): - """ Perform all reduce within model parallel group, if any. - """ - if self.model_parallel_group is None: - pass - else: - torch.distributed.all_reduce(tensor=tensor, - op=op, - group=self.model_parallel_group) - - def get_grad_norm_direct(self, gradients, params, norm_type=2): - """Clips gradient norm of an iterable of parameters. - - This is adapted from torch.nn.utils.clip_grad.clip_grad_norm_ and - added functionality to handle model parallel parameters. Note that - the gradients are modified in place. - - Arguments: - parameters (Iterable[Tensor] or Tensor): an iterable of Tensors or a - single Tensor that will have gradients normalized - max_norm (float or int): max norm of the gradients - norm_type (float or int): type of the used p-norm. Can be ``'inf'`` for - infinity norm. - - Returns: - Total norm of the parameters (viewed as a single vector). - """ - norm_type = float(norm_type) - if norm_type == inf: - total_norm = max(g.data.abs().max() for g in gradients) - total_norm_cuda = torch.cuda.FloatTensor([float(total_norm)]) - torch.distributed.all_reduce(total_norm_cuda, - op=torch.distributed.ReduceOp.MAX, - group=self.dp_process_group) - - # Take max across all GPUs. - self._model_parallel_all_reduce(tensor=total_norm_cuda, - op=torch.distributed.ReduceOp.MAX) - total_norm = total_norm_cuda[0].item() - else: - total_norm = 0.0 - #if dist.get_rank() == 0: - # logger.info(f"Total Norm begining {total_norm}") - for g, p in zip(gradients, params): - if is_model_parallel_parameter(p) or (self.model_parallel_rank == 0): - param_norm = g.data.double().norm(2) - total_norm += param_norm.item()**2 - # Sum across all model parallel GPUs. - total_norm_cuda = torch.cuda.FloatTensor([float(total_norm)]) - - torch.distributed.all_reduce(total_norm_cuda, - op=torch.distributed.ReduceOp.SUM, - group=self.dp_process_group) - - self._model_parallel_all_reduce(tensor=total_norm_cuda, - op=torch.distributed.ReduceOp.SUM) - - total_norm = total_norm_cuda[0].item()**(1. / norm_type) - - if total_norm == float( - 'inf') or total_norm == -float('inf') or total_norm != total_norm: - total_norm = -1 - - return total_norm - - #creates a flat fused tensor from the tensor list starting at the first_offset - #in the first tensor of the list. 
If there are not enough elements in the tensor - #list then the flat tensor will be padded with zeros - def get_flat_partition(self, - tensor_list, - first_offset, - partition_size, - dtype, - device, - return_tensor_list=False): - flat_tensor_list = [] - current_size = 0 - for i, tensor in enumerate(tensor_list): - if tensor.grad is None: - tensor.grad = torch.zeros_like(tensor) - - tensor = tensor.grad - num_elements = tensor.numel() - tensor_offset = 0 - - #we need to offset to get to the right element - if i == 0 and first_offset > 0: - tensor_offset = first_offset - num_elements = num_elements - tensor_offset - - #we dont need all elements of the tensor - if num_elements > (partition_size - current_size): - num_elements = partition_size - current_size - - #we need a narrow view of the tensor based on the tensor offset and number of elements that - #we need from this tensor - if tensor_offset > 0 or num_elements < tensor.numel(): - flat_tensor_list.append(tensor.contiguous().view(-1).narrow( - 0, - int(tensor_offset), - int(num_elements))) - else: - flat_tensor_list.append(tensor) - - current_size = current_size + num_elements - - #this means its the last partition and does not align with the dp boundary. We need to pad before flattening - if current_size < partition_size: - flat_tensor_list.append( - torch.zeros(int(partition_size - current_size), - dtype=dtype, - device=device)) - - if return_tensor_list: - return flat_tensor_list - - return _flatten_dense_tensors(flat_tensor_list) - - def free_grad_in_param_list(self, param_list): - for p in param_list: - p.grad = None - - def reset_cpu_buffers(self): - self.norm_for_param_grads = {} - self.local_overflow = False - - def log_timers(self, timer_names): - if self.timers is None: - return - - self.timers.log(names=list(timer_names)) - - def start_timers(self, timer_names): - if self.timers is None: - return - - for name in timer_names: - self.timers(name).start() - - def stop_timers(self, timer_names): - if self.timers is None: - return - - for name in timer_names: - self.timers(name).stop() - - def step(self, closure=None): - """ - Not supporting closure. - """ - self.micro_step_id = -1 - - if self.cpu_offload: - torch.cuda.current_stream().wait_stream(self.migration_stream) - - see_memory_usage(f"In step before checking overflow") - - # First compute norm for all group so we know if there is overflow - self.check_overflow() - - OPTIMIZER_ALLGATHER = 'optimizer_allgather' - OPTIMIZER_GRADIENTS = 'optimizer_gradients' - OPTIMIZER_STEP = 'optimizer_step' - timer_names = [OPTIMIZER_ALLGATHER, OPTIMIZER_GRADIENTS, OPTIMIZER_STEP] - - prev_scale = self.loss_scale - self._update_scale(self.overflow) - if self.overflow: - see_memory_usage('After overflow before clearing gradients') - self.zero_grad() - if self.cpu_offload: - self.reset_cpu_buffers() - else: - self.averaged_gradients = {} - - see_memory_usage('After overflow after clearing gradients') - - logger.info( - "[deepspeed] fp16 dynamic loss scale overflow! Rank {} Skipping step. 
Attempted loss scale: {}, " - "reducing to {}".format(dist.get_rank(), - prev_scale, - self.loss_scale)) - self.start_timers(timer_names) - self.stop_timers(timer_names) - return - - self.start_timers([OPTIMIZER_GRADIENTS]) - norm_groups = [] - single_partition_grad_groups = [] - skip = False - partition_id = dist.get_rank(group=self.dp_process_group) - for i, group in enumerate(self.fp16_groups): - if self.cpu_offload: - norm_groups.append( - self.complete_grad_norm_calculation_for_cpu_offload( - self.params_in_partition[i])) - single_grad_partition = self.single_partition_of_fp32_groups[i].grad - else: - norm_groups.append( - self.get_grad_norm_direct(self.averaged_gradients[i], - self.params_in_partition[i])) - - #free gradients for all the prameters that are not updated by this process - self.free_grad_in_param_list(self.params_not_in_partition[i]) - - #create a flat gradients for parameters updated by this process - # If we are last partition, ensure we have same size grads and partition size, if not pad with zero tensors - if partition_id == dist.get_world_size(group=self.dp_process_group) - 1: - single_grad_partition = flatten_dense_tensors_aligned( - self.averaged_gradients[i], - int(self.partition_size[i])).to( - self.single_partition_of_fp32_groups[i].dtype) - else: - single_grad_partition = _flatten_dense_tensors( - self.averaged_gradients[i]).to( - self.single_partition_of_fp32_groups[i].dtype) - assert single_grad_partition.numel() == self.partition_size[i], \ - "averaged gradients have different number of elements that partition size {} {} {} {}".format(single_grad_partition.numel(), self.partition_size[i], i, partition_id) - - self.single_partition_of_fp32_groups[i].grad = single_grad_partition - #release all the gradient since we have already created a necessary copy in dp_grad_partition - self.free_grad_in_param_list(self.params_in_partition[i]) - - self.averaged_gradients[i] = None - - single_partition_grad_groups.append(single_grad_partition) - - self.unscale_and_clip_grads(single_partition_grad_groups, norm_groups) - self.stop_timers([OPTIMIZER_GRADIENTS]) - - self.start_timers([OPTIMIZER_STEP]) - if self.deepspeed_adam_offload: - from deepspeed.ops.adam import DeepSpeedCPUAdam - if type(self.optimizer) == DeepSpeedCPUAdam: - fp16_param_groups = [ - fp16_partitions[partition_id] - for fp16_partitions in self.parallel_partitioned_fp16_groups - ] - self.optimizer.step(fp16_param_groups=fp16_param_groups) - else: - self.optimizer.step() - for fp16_partitions, fp32_partition in zip(self.parallel_partitioned_fp16_groups, self.single_partition_of_fp32_groups): - fp16_partitions[partition_id].data.copy_(fp32_partition.data) - else: - self.optimizer.step() - - #get rid of the fp32 gradients. 
Not needed anymore - if not self.cpu_offload: - for group in self.single_partition_of_fp32_groups: - group.grad = None - - for fp16_partitions, fp32_partition in zip(self.parallel_partitioned_fp16_groups, self.single_partition_of_fp32_groups): - fp16_partitions[partition_id].data.copy_(fp32_partition.data) - - self.stop_timers([OPTIMIZER_STEP]) - - if self.cpu_offload: - self.reset_cpu_buffers() - - self.start_timers([OPTIMIZER_ALLGATHER]) - #gather the updated weights from everyone - for group_id, partitioned_params in enumerate(self.parallel_partitioned_fp16_groups): - - #Sequential AllGather Best of both worlds - dp_world_size = dist.get_world_size(group=self.dp_process_group) - num_shards = max( - 1, - partitioned_params[partition_id].numel() * dp_world_size // - self.allgather_bucket_size) - - shard_size = partitioned_params[partition_id].numel() // num_shards - num_elements = shard_size - - assert shard_size * num_shards <= partitioned_params[partition_id].numel() - - for shard_id in range(num_shards): - - if shard_id == (num_shards - 1): - num_elements = partitioned_params[partition_id].numel( - ) - shard_id * shard_size - - shard_list = [] - for dp_id in range(dp_world_size): - curr_shard = partitioned_params[dp_id].narrow( - 0, - shard_id * shard_size, - num_elements).detach() - shard_list.append(curr_shard) - - dist.all_gather(shard_list, - shard_list[partition_id], - group=self.dp_process_group) - self.stop_timers([OPTIMIZER_ALLGATHER]) - - # TODO: we probably don't need this? just to be safe - for i in range(len(norm_groups)): - updated_params = _unflatten_dense_tensors(self.fp16_groups_flat[i], - self.fp16_groups[i]) - for p, q in zip(self.fp16_groups[i], updated_params): - p.data = q.data - - self.log_timers(timer_names) - see_memory_usage('After zero_optimizer step') - - return - - def unscale_and_clip_grads(self, grad_groups_flat, norm_groups): - total_norm = 0.0 - for norm in norm_groups: - total_norm += norm**2.0 - total_norm = math.sqrt(total_norm) - - # compute combined scale factor for this group - combined_scale = self.loss_scale - if self.clip_grad > 0.: - # norm is in fact norm*scale - clip = ((total_norm / self.loss_scale) + 1e-6) / self.clip_grad - if clip > 1: - combined_scale = clip * self.loss_scale - - for grad in grad_groups_flat: - if isinstance(grad, list): - sub_partitions = grad - for g in sub_partitions: - g.data.mul_(1. / combined_scale) - else: - grad.data.mul_(1. 
/ combined_scale) - - def _check_overflow(self, partition_gradients=True): - self.overflow = self.has_overflow(partition_gradients) - - # `params` is a list / generator of torch.Variable - def has_overflow_serial(self, params, is_grad_list=False): - for p in params: - if p.grad is not None and self._has_inf_or_nan(p.grad.data): - return True - - return False - - def has_overflow_partitioned_grads_serial(self): - for i in range(len(self.fp16_groups)): - for j, grad in enumerate(self.averaged_gradients[i]): - if grad is not None and self._has_inf_or_nan(grad.data, j): - return True - return False - - def has_overflow(self, partition_gradients=True): - if partition_gradients: - overflow = self.local_overflow if self.cpu_offload else self.has_overflow_partitioned_grads_serial( - ) - overflow_gpu = torch.cuda.ByteTensor([overflow]) - torch.distributed.all_reduce(overflow_gpu, - op=torch.distributed.ReduceOp.MAX, - group=self.dp_process_group) - - else: - params = [] - for group in self.fp16_groups: - for param in group: - params.append(param) - - overflow = self.has_overflow_serial(params, is_grad_list=partition_gradients) - overflow_gpu = torch.cuda.ByteTensor([overflow]) - - # Since each model parallel GPU carries only part of the model, - # make sure overflow flag is synced across all the model parallel GPUs - self._model_parallel_all_reduce(tensor=overflow_gpu, - op=torch.distributed.ReduceOp.MAX) - - overflow = overflow_gpu[0].item() - return bool(overflow) - - # `x` is a torch.Tensor - @staticmethod - def _has_inf_or_nan(x, j=None): - try: - # if x is half, the .float() incurs an additional deep copy, but it's necessary if - # Pytorch's .sum() creates a one-element tensor of the same type as x - # (which is true for some recent version of pytorch). - cpu_sum = float(x.float().sum()) - # More efficient version that can be used if .sum() returns a Python scalar - # cpu_sum = float(x.sum()) - except RuntimeError as instance: - # We want to check if inst is actually an overflow exception. - # RuntimeError could come from a different error. - # If so, we still want the exception to propagate. - if "value cannot be converted" not in instance.args[0]: - raise - return True - else: - if cpu_sum == float('inf') or cpu_sum == -float('inf') or cpu_sum != cpu_sum: - return True - return False - - def backward(self, loss, retain_graph=False): - """ - :attr:`backward` performs the following steps: - - 1. fp32_loss = loss.float() - 2. scaled_loss = fp32_loss*loss_scale - 3. scaled_loss.backward(), which accumulates scaled gradients into the ``.grad`` attributes of the model's fp16 leaves - """ - self.micro_step_id += 1 - if self.cpu_offload: - torch.cuda.current_stream().wait_stream(self.migration_stream) - - #TODO: we need to revist this and remove the magic 4.5x multiplier here - if self.contiguous_gradients: - self.ipg_buffer = [] - buf_0 = torch.empty(int(self.reduce_bucket_size * 4.5), - dtype=torch.half, - device=torch.cuda.current_device()) - self.ipg_buffer.append(buf_0) - - # Use double buffers to avoid data access conflict when overlap_comm is enabled. 
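# When overlap_comm is enabled, the contiguous IPG buffer above is double buffered:
# the backward pass fills one flat half-precision buffer while the reduction stream is
# still draining the other, and ipg_index flips between them. A minimal sketch of the
# ping-pong idea (hypothetical names, assumes a CUDA device is available):
import torch

class PingPongBuffers:
    def __init__(self, numel, dtype=torch.half, device="cuda"):
        self.buffers = [torch.empty(int(numel), dtype=dtype, device=device) for _ in range(2)]
        self.index = 0

    def current(self):
        return self.buffers[self.index]

    def flip(self):
        # switch buffers so freshly computed gradients never overwrite data
        # that an in-flight reduction is still reading
        self.index ^= 1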
- if self.overlap_comm: - buf_1 = torch.empty(int(self.reduce_bucket_size * 4.5), - dtype=torch.half, - device=torch.cuda.current_device()) - self.ipg_buffer.append(buf_1) - self.ipg_index = 0 - - self.loss_scaler.backward(loss.float(), retain_graph=retain_graph) - - def check_overflow(self, partition_gradients=True): - self._check_overflow(partition_gradients) - - def _update_scale(self, has_overflow=False): - self.loss_scaler.update_scale(has_overflow) - - # Promote state so it can be retrieved or set via "fp16_optimizer_instance.state" - def _get_state(self): - return self.optimizer.state - - def _set_state(self, value): - self.optimizer.state = value - - state = property(_get_state, _set_state) - - # Promote param_groups so it can be retrieved or set via "fp16_optimizer_instance.param_groups" - # (for example, to adjust the learning rate) - def _get_param_groups(self): - return self.optimizer.param_groups - - def _set_param_groups(self, value): - self.optimizer.param_groups = value - - param_groups = property(_get_param_groups, _set_param_groups) - - # Promote loss scale so it can be retrieved or set via "fp16_optimizer_instance.loss_scale" - def _get_loss_scale(self): - return self.loss_scaler.loss_scale - - def _set_loss_scale(self, value): - self.loss_scaler.cur_scale = value - - loss_scale = property(_get_loss_scale, _set_loss_scale) - cur_scale = property(_get_loss_scale, _set_loss_scale) - - # Return group tensor after removing paddings that are added for alignment to DP world size. - # This method works on the assumption that each group contains a single flattened tensor. - def _get_groups_without_padding(self, groups_with_padding): - groups_without_padding = [] - for i, group in enumerate(groups_with_padding): - lean_length = group.numel() - self.groups_padding[i] - groups_without_padding.append(group[:lean_length]) - - return groups_without_padding - - # Return optimizer state after removing paddings that are added for alignment. - def _get_state_without_padding(self, state_with_padding, padding): - lean_state = {} - for key, value in state_with_padding.items(): - if torch.is_tensor(value): - lean_length = value.numel() - padding - lean_state[key] = value[:lean_length] - else: - lean_state[key] = value - - return lean_state - - # Return base optimizer states. - # This method assumes that each param group contains a single flattened tensor. - def _get_base_optimizer_state(self): - optimizer_groups_state = [] - for i, group in enumerate(self.optimizer.param_groups): - p = group['params'][0] - lean_optimizer_state = self._get_state_without_padding( - self.optimizer.state[p], - self.groups_padding[i]) - optimizer_groups_state.append(lean_optimizer_state) - - return optimizer_groups_state - - def state_dict(self): - """ - Returns a dict containing the current state of this :class:`FP16_Optimizer` instance. - This dict contains attributes of :class:`FP16_Optimizer`, as well as the state_dict - of the contained Pytorch optimizer. 
- Example:: - checkpoint = {} - checkpoint['model'] = model.state_dict() - checkpoint['optimizer'] = optimizer.state_dict() - torch.save(checkpoint, "saved.pth") - """ - state_dict = {} - state_dict['loss_scaler'] = self.loss_scaler - state_dict['dynamic_loss_scale'] = self.dynamic_loss_scale - state_dict['overflow'] = self.overflow - state_dict['base_optimizer_state'] = self._get_base_optimizer_state() - - state_dict['zero_stage'] = ZERO_OPTIMIZATION_GRADIENTS - state_dict['partition_count'] = self.partition_count - - # Remove paddings for DP alignment to enable loading for other alignment values - fp32_groups_without_padding = self._get_groups_without_padding( - self.single_partition_of_fp32_groups) - state_dict['single_partition_of_fp32_groups'] = fp32_groups_without_padding - - # if self.cpu_offload: - # state_dict_tmp = async_copy_to(state_dict, - # 'cpu', - # torch.cuda.current_stream()) - # state_dict = state_dict_tmp - - return state_dict - - # Restore base optimizer fp32 weights from checkpoint by: - # 1) Merging fp32 weights from checkpoints of all partitions - # 2) Extracting fp32 weights for current partition from merged weights - # 3) Using extracted weights to update base optimizer weights directly. - def _restore_from_fp32_weights(self, all_state_dict): - partition_id = dist.get_rank(group=self.dp_process_group) - merged_single_partition_of_fp32_groups = [] - for i in range(len(self.single_partition_of_fp32_groups)): - merged_partitions = [ - sd['single_partition_of_fp32_groups'][i] for sd in all_state_dict - ] - flat_merged_partitions = flatten_dense_tensors_aligned( - merged_partitions, - dist.get_world_size(group=self.dp_process_group)) - dp_partitions = self.get_data_parallel_partitions(flat_merged_partitions) - merged_single_partition_of_fp32_groups.append(dp_partitions[partition_id]) - - for current, saved in zip(self.single_partition_of_fp32_groups, merged_single_partition_of_fp32_groups): - current.data.copy_(saved.data) - - # Restore base optimizer fp32 weights from ZeRO fp16 weights - def _restore_from_fp16_weights(self): - partition_id = dist.get_rank(group=self.dp_process_group) - for fp16_partitions, fp32_partition in zip(self.parallel_partitioned_fp16_groups, self.single_partition_of_fp32_groups): - fp32_partition.data.copy_(fp16_partitions[partition_id].data) - - # Refresh the fp32 master params from the fp16 copies. - def refresh_fp32_params(self): - self._restore_from_fp16_weights() - - # Extract optimizer state for current partition from merged states of all partitions - def _partition_base_optimizer_state(self, state_key, all_partition_states): - partition_id = dist.get_rank(group=self.dp_process_group) - alignment = dist.get_world_size(group=self.dp_process_group) - if torch.is_tensor(all_partition_states[0]): - flat_merged_partitions = flatten_dense_tensors_aligned( - all_partition_states, - alignment) - dp_partitions = self.get_data_parallel_partitions(flat_merged_partitions) - return dp_partitions[partition_id] - else: - # Assume non-tensor states are not partitioned and equal across ranks, so return first one - return all_partition_states[0] - - # Restore base optimizer state from checkpoint by - # 1) Merging optimizer state from checkpoints of all partitions - # 2) Extracting optimizer state for current partition from the merged state - # 3) Using the extracted value to directly update the base optimizer. 
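# The three steps listed above boil down to: concatenate each saved state tensor from
# all checkpoint partitions, then re-slice the merged tensor by the *current*
# data-parallel world size and keep only this rank's slice. A minimal sketch under the
# simplifying assumption of equal, unpadded slices (hypothetical names):
import torch

def repartition_saved_state(saved_partitions, dp_world_size, dp_rank):
    merged = torch.cat([t.view(-1) for t in saved_partitions])
    slice_size = merged.numel() // dp_world_size
    return merged.narrow(0, dp_rank * slice_size, slice_size)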
- def _restore_base_optimizer_state(self, all_state_dict): - base_optimizer_group_states = [] - for i in range(len(self.optimizer.param_groups)): - partition_states = {} - all_partition_group_states = [ - sd['base_optimizer_state'][i] for sd in all_state_dict - ] - for key in all_partition_group_states[0].keys(): - all_partition_states = [ - all_states[key] for all_states in all_partition_group_states - ] - partition_states[key] = self._partition_base_optimizer_state( - key, - all_partition_states) - base_optimizer_group_states.append(partition_states) - - for i, group in enumerate(self.optimizer.param_groups): - p = group['params'][0] - for key, saved in base_optimizer_group_states[i].items(): - if torch.is_tensor(self.optimizer.state[p][key]): - self.optimizer.state[p][key].data.copy_(saved.data) - else: - self.optimizer.state[p][key] = saved - - def load_state_dict(self, - state_dict_list, - load_optimizer_states=True, - load_from_fp32_weights=False): - r"""Loading ZeRO checkpoint - - Arguments: - state_dict_list: List of all saved ZeRO checkpoints, one for each saved partition. - Note that the number of saved partitions may differ from number of loading partitions to support - changing GPU count, specifically DP world size, between saving and loading checkpoints. - load_optimizer_states: Boolean indicating whether or not to load base optimizer states - load_from_fp32_weights: Boolean indicating whether to initialize fp32 master weights from fp32 - copies in checkpoints (no precision loss) or from model's fp16 copies (with precision loss). - """ - """ - Loads a state_dict created by an earlier call to state_dict(). - If ``fp16_optimizer_instance`` was constructed from some ``init_optimizer``, - whose parameters in turn came from ``model``, it is expected that the user - will call ``model.load_state_dict()`` before - ``fp16_optimizer_instance.load_state_dict()`` is called. - Example:: - model = torch.nn.Linear(D_in, D_out).cuda().half() - optimizer = torch.optim.SGD(model.parameters(), lr=1e-3) - optimizer = FP16_Optimizer(optimizer, static_loss_scale = 128.0) - ... - checkpoint = torch.load("saved.pth") - model.load_state_dict(checkpoint['model']) - optimizer.load_state_dict(checkpoint['optimizer']) - """ - # I think it should actually be ok to reload the optimizer before the model. - self.loss_scaler = state_dict_list[0]['loss_scaler'] - self.dynamic_loss_scale = state_dict_list[0]['dynamic_loss_scale'] - self.overflow = state_dict_list[0]['overflow'] - - if load_optimizer_states: - self._restore_base_optimizer_state(state_dict_list) - - # At this point, the optimizer's references to the model's fp32 parameters are up to date. - # The optimizer's hyperparameters and internal buffers are also up to date. - # However, the fp32 master copies of the model's fp16 params stored by the optimizer are still - # out of date. There are two options. - # 1: Refresh the master params from the model's fp16 params. - # This requires less storage but incurs precision loss. - # 2: Save and restore the fp32 master copies separately. - # We choose option 1 if changing DP degree and option 2 otherwise. - # - # Pytorch Optimizer.load_state_dict casts saved buffers (e.g. momentum) to the type and device - # of their associated parameters, because it's possible those buffers might not exist yet in - # the current optimizer instance. 
In our case, as long as the current FP16_Optimizer has been - # constructed in the same way as the one whose state_dict we are loading, the same master params - # are guaranteed to exist, so we can just copy_() from the saved master params. - - if load_from_fp32_weights: - self._restore_from_fp32_weights(state_dict_list) - else: - self._restore_from_fp16_weights() - - -def _handle_overflow(cpu_sum, x, i): - import math - rank = torch.distributed.get_rank() - if rank == 0: - t_i = -1 - for v_i, v in enumerate(x.data.contiguous().view(-1)): - if not math.isfinite(float(v)): - t_i = v_i - break - logger.info( - f"rank {rank} detected overflow {cpu_sum} in tensor {i}:{t_i} shape {x.shape}" - ) diff --git a/deepspeed/runtime/zero/stage3.py b/deepspeed/runtime/zero/stage3.py index 9f7cb384a74afad70a2d5088e4368fe673ecdf78..4f873327b9dcdf8f11756559dff33cb800347be7 100644 --- a/deepspeed/runtime/zero/stage3.py +++ b/deepspeed/runtime/zero/stage3.py @@ -1,7 +1,7 @@ -""" -"Copyright 2020 The Microsoft DeepSpeed Team. -Licensed under the MIT license. -""" +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team import sys import gc @@ -33,7 +33,7 @@ pg_correctness_test = False def print_rank_0(message, debug=False, force=False): rank = dist.get_rank() if rank == 0 and (debug or force): - print(message) + logger.info(message) # other variations # - print for all ranks w/o interleaving # printflock(f"[{rank}] {message}") @@ -73,6 +73,7 @@ class DeepSpeedZeroOptimizer_Stage3(ZeROOptimizer): For usage examples, refer to TODO: DeepSpeed Tutorial """ + def __init__(self, module, init_optimizer, @@ -106,8 +107,7 @@ class DeepSpeedZeroOptimizer_Stage3(ZeROOptimizer): see_memory_usage("Stage 3 initialize beginning", force=True) - print_rank_0(f"initialized {__class__.__name__} with args: {locals()}", - force=False) + print_rank_0(f"initialized {__class__.__name__} with args: {locals()}", force=False) if dist.get_rank() == 0: logger.info(f"Reduce bucket size {reduce_bucket_size}") @@ -147,18 +147,17 @@ class DeepSpeedZeroOptimizer_Stage3(ZeROOptimizer): self.params_in_nvme_and_cpu = False self.max_params_in_cpu = 0 - self.parameter_offload = DeepSpeedZeRoOffload( - module=module, - timers=timers, - ds_config=ds_config, - overlap_comm=overlap_comm, - prefetch_bucket_size=prefetch_bucket_size, - max_reuse_distance=max_reuse_distance, - max_live_parameters=max_live_parameters, - param_persistence_threshold=param_persistence_threshold, - model_persistence_threshold=model_persistence_threshold, - offload_param_config=offload_optimizer_config, - mpu=mpu) + self.parameter_offload = self.initialize_ds_offload(module=module, + timers=timers, + ds_config=ds_config, + overlap_comm=overlap_comm, + prefetch_bucket_size=prefetch_bucket_size, + max_reuse_distance=max_reuse_distance, + max_live_parameters=max_live_parameters, + param_persistence_threshold=param_persistence_threshold, + model_persistence_threshold=model_persistence_threshold, + offload_optimizer_config=offload_optimizer_config, + mpu=mpu) self.persistent_parameters = self.parameter_offload.persistent_parameters self._configure_offloading(offload_optimizer_config, offload_param_config) @@ -166,24 +165,21 @@ class DeepSpeedZeroOptimizer_Stage3(ZeROOptimizer): self.module = module self.elastic_checkpoint = elastic_checkpoint - self.__inf_or_nan_tracker: Tensor = torch.zeros( - 1, - dtype=torch.bool, - device=get_accelerator().current_device_name(), - requires_grad=False) + self.inf_or_nan_tracker: Tensor = torch.zeros(1, + 
dtype=torch.bool, + device=get_accelerator().current_device_name(), + requires_grad=False) - self.deepspeed_adam_offload = (self.offload_optimizer - and type(init_optimizer) == DeepSpeedCPUAdam) + self.deepspeed_adam_offload = (self.offload_optimizer and type(init_optimizer) == DeepSpeedCPUAdam) - self.device = get_accelerator().current_device_name( - ) if not self.offload_optimizer else OffloadDeviceEnum.cpu + self.device = get_accelerator().current_device_name() if not self.offload_optimizer else OffloadDeviceEnum.cpu ### streams used for overlapping computation with communication - self.__reduce_and_partition_stream = get_accelerator().Stream( - ) if overlap_comm else get_accelerator().default_stream() + self.reduce_and_partition_stream = get_accelerator().Stream() if overlap_comm else get_accelerator( + ).default_stream() ############################################################################ - self.__n_caching_allocator_flushes = 0 + self.n_caching_allocator_flushes = 0 #-------------Stage 3 Setup-------------------# @@ -212,7 +208,8 @@ class DeepSpeedZeroOptimizer_Stage3(ZeROOptimizer): self.reduce_bucket_size = int(reduce_bucket_size) if self.reduce_scatter: - assert self.communication_data_type in (torch.float16, torch.bfloat16, torch.float32), f"ZeRO-3 supports only float16 or bfloat16 communication_data_type with reduce scatter enabled. Got: '{self.communication_data_type}'" + valid_reduce_scatter_dtypes = (torch.float16, torch.bfloat16, torch.float32) + assert self.communication_data_type in valid_reduce_scatter_dtypes, f"ZeRO-3 supports {valid_reduce_scatter_dtypes} communication_data_type with reduce scatter enabled. Got: '{self.communication_data_type}'" assert self.gradient_predivide_factor == 1.0, "gradient_predivide_factor != 1.0 is not yet supported with ZeRO-3 with reduce scatter enabled" assert self.postscale_gradients, "pre-scale gradients is not yet supported with ZeRO-3 with reduce scatter enabled" @@ -258,19 +255,18 @@ class DeepSpeedZeroOptimizer_Stage3(ZeROOptimizer): see_memory_usage("Before creating fp16 partitions", force=True) self._create_fp16_partitions_with_defragmentation(self.trainable_param_groups) num_fp16_subgroups = len(self.fp16_partitioned_groups_flat) - see_memory_usage(f"After creating fp16 partitions: {num_fp16_subgroups}", - force=True) + see_memory_usage(f"After creating fp16 partitions: {num_fp16_subgroups}", force=True) # Optimizer tensor swapping if self.swap_optimizer: self._configure_tensor_swapping(offload_optimizer_config, aio_config) - self.__params_in_ipg_bucket: List[Parameter] = [] + self.params_in_ipg_bucket = [] self.is_gradient_accumulation_boundary: bool = True - self.__param_reduce_events: Deque[get_accelerator().Event] = collections.deque() + self.param_reduce_events: Deque[get_accelerator().Event] = collections.deque() # TODO. 
make this configurable via JSON - self.__max_param_reduce_events: int = 2 + self.max_param_reduce_events: int = 2 self.param_dict = {} @@ -300,14 +296,10 @@ class DeepSpeedZeroOptimizer_Stage3(ZeROOptimizer): #Largest partitioned param largest_partitioned_param_numel = max([ - max([ - max(tensor.numel(), - tensor.ds_numel) for tensor in fp16_partitioned_group - ]) for fp16_partitioned_group in self.fp16_partitioned_groups + max([max(tensor.numel(), tensor.ds_numel) for tensor in fp16_partitioned_group]) + for fp16_partitioned_group in self.fp16_partitioned_groups ]) - print_rank_0( - f'Largest partitioned param numel = {largest_partitioned_param_numel}', - force=False) + print_rank_0(f'Largest partitioned param numel = {largest_partitioned_param_numel}', force=False) self._setup_for_real_optimizer() self.grad_position = {} @@ -348,12 +340,36 @@ class DeepSpeedZeroOptimizer_Stage3(ZeROOptimizer): def destroy(self): self.parameter_offload.destroy() + def initialize_ds_offload( + self, + module, + timers, + ds_config, + overlap_comm, + prefetch_bucket_size, + max_reuse_distance, + max_live_parameters, + param_persistence_threshold, + model_persistence_threshold, + offload_optimizer_config, + mpu, + ): + return DeepSpeedZeRoOffload(module=module, + timers=timers, + ds_config=ds_config, + overlap_comm=overlap_comm, + prefetch_bucket_size=prefetch_bucket_size, + max_reuse_distance=max_reuse_distance, + max_live_parameters=max_live_parameters, + param_persistence_threshold=param_persistence_threshold, + model_persistence_threshold=model_persistence_threshold, + offload_param_config=offload_optimizer_config, + mpu=mpu) + def _get_trainable_parameter_groups(self): param_groups = [] for param_group in self.optimizer.param_groups: - trainable_params = { - "params": [p for p in param_group["params"] if p.requires_grad] - } + trainable_params = {"params": [p for p in param_group["params"] if p.requires_grad]} param_groups.append(trainable_params) return param_groups @@ -377,31 +393,25 @@ class DeepSpeedZeroOptimizer_Stage3(ZeROOptimizer): # IPG if self.contiguous_gradients: - self.__ipg_bucket_flat_buffer: Tensor = torch.empty( - self.reduce_bucket_size, - dtype=self.dtype, - device=get_accelerator().current_device_name()) + self.__ipg_bucket_flat_buffer: Tensor = torch.empty(self.reduce_bucket_size, + dtype=self.dtype, + device=get_accelerator().current_device_name()) grad_partitions_flat_buffer = None self.__param_id_to_grad_partition: Dict[int, Tensor] = {} all_params = list(itertools.chain.from_iterable(self.fp16_groups)) - grad_partitions_flat_buffer: Tensor = torch.zeros(sum(p.partition_numel() - for p in all_params), + grad_partitions_flat_buffer: Tensor = torch.zeros(sum(p.partition_numel() for p in all_params), dtype=self.dtype, device=self.device) if self.offload_optimizer_pin_memory: - grad_partitions_flat_buffer = get_accelerator().pin_memory( - grad_partitions_flat_buffer) + grad_partitions_flat_buffer = get_accelerator().pin_memory(grad_partitions_flat_buffer) offset = 0 for param in all_params: - self.__param_id_to_grad_partition[ - param.ds_id] = grad_partitions_flat_buffer.narrow( - 0, - offset, - param.partition_numel()) + self.__param_id_to_grad_partition[param.ds_id] = grad_partitions_flat_buffer.narrow( + 0, offset, param.partition_numel()) offset += param.partition_numel() def _link_all_hp_params(self): @@ -477,27 +487,25 @@ class DeepSpeedZeroOptimizer_Stage3(ZeROOptimizer): force=False) def _configure_tensor_swapping(self, offload_optimizer_config, aio_config): - 
nvme_swap_folder = os.path.join(offload_optimizer_config.nvme_path, - 'zero_stage_3') + nvme_swap_folder = os.path.join(offload_optimizer_config.nvme_path, 'zero_stage_3') os.makedirs(nvme_swap_folder, exist_ok=True) if dist.get_rank() == 0: logger.info(f'Tensor Swapping: Adding optimizer tensors') swapper_type = PipelinedOptimizerSwapper if offload_optimizer_config.pipeline else PartitionedOptimizerSwapper - self.optimizer_swapper = swapper_type( - swap_config=offload_optimizer_config, - aio_config=aio_config, - base_folder=nvme_swap_folder, - optimizer=self.optimizer, - largest_numel=max(self.fp16_partitioned_groups_flat_numel), - device=self.device, - dtype=torch.float32, - timers=self.timers) + self.optimizer_swapper = swapper_type(swap_config=offload_optimizer_config, + aio_config=aio_config, + base_folder=nvme_swap_folder, + optimizer=self.optimizer, + largest_numel=max(self.fp16_partitioned_groups_flat_numel), + device=self.device, + dtype=torch.float32, + timers=self.timers) @property def elements_in_ipg_bucket(self): - return sum(p.ds_numel for p in self.__params_in_ipg_bucket) + return sum(p.ds_numel for p in self.params_in_ipg_bucket) def _move_to_flat_buffer(self, param_list, flat_buffer, avoid_copy=False): '''If flat buffer is None then the parameters in the param_list are @@ -518,8 +526,7 @@ class DeepSpeedZeroOptimizer_Stage3(ZeROOptimizer): '''if the parameter was initialized in nvme then bring it to the destination buffer directly''' if src.status == PartitionedParamStatus.NOT_AVAILABLE: print_rank_0( - f"Swapping in {param.ds_id} with partition size {param.partition_numel()} permanently to CPU" - ) + f"Swapping in {param.ds_id} with partition size {param.partition_numel()} permanently to CPU") param.nvme_swapper.swap_into_buffer(param, dest) src.data = dest.data src.status = PartitionedParamStatus.AVAILABLE @@ -544,33 +551,24 @@ class DeepSpeedZeroOptimizer_Stage3(ZeROOptimizer): if self.params_in_nvme_and_cpu and \ aggregate_params_count + params_in_group > self.max_params_in_cpu: - flat_buffer_size = max(0, - self.max_params_in_cpu - aggregate_params_count) + flat_buffer_size = max(0, self.max_params_in_cpu - aggregate_params_count) aggregate_params_count += params_in_group if flat_buffer_size > 0: - print_rank_0(f"group {j} flat buffer size {flat_buffer_size}", - force=False) - self.param_groups_fp16_flat_cpu_memory.append( - get_accelerator().pin_memory( - torch.empty(int(flat_buffer_size), - dtype=self.dtype))) + print_rank_0(f"group {j} flat buffer size {flat_buffer_size}", force=False) + self.param_groups_fp16_flat_cpu_memory.append(get_accelerator().pin_memory( + torch.empty(int(flat_buffer_size), dtype=self.dtype))) else: - print_rank_0( - f"No flat buffer size. Param group size was {params_in_group}", - force=False) + print_rank_0(f"No flat buffer size. 
Param group size was {params_in_group}", force=False) - self.param_groups_fp16_flat_cpu_memory.append( - torch.empty(1, - dtype=self.dtype)) + self.param_groups_fp16_flat_cpu_memory.append(torch.empty(1, dtype=self.dtype)) def _create_fp16_partitions_with_defragmentation(self, fp16_param_groups): dist.barrier() param_groups: List[List[Parameter]] = tuple( - self._create_fp16_sub_groups(param_group["params"]) - for param_group in fp16_param_groups) + self._create_fp16_sub_groups(param_group["params"]) for param_group in fp16_param_groups) # bookkeeping related to param groups for param_group_idx, param_group in enumerate(param_groups): @@ -579,23 +577,18 @@ class DeepSpeedZeroOptimizer_Stage3(ZeROOptimizer): # record sub group and partitions self.fp16_groups.append(sub_group) - self.fp16_partitioned_groups.append( - [param.ds_tensor for param in sub_group]) + self.fp16_partitioned_groups.append([param.ds_tensor for param in sub_group]) # record sub group -> group mapping self.sub_group_to_group_id[sub_group_idx] = param_group_idx # record total elements of parameter partitions in sub group - self.fp16_partitioned_groups_flat_numel.append( - sum(p.partition_numel() for p in sub_group)) + self.fp16_partitioned_groups_flat_numel.append(sum(p.partition_numel() for p in sub_group)) # record padding required to align group to world size (only applies to last rank) rank_requires_padding = dist.get_rank( - self.dp_process_group) == dist.get_world_size( - self.dp_process_group) - 1 - self.groups_padding.append([ - p.padding_size() if rank_requires_padding else 0 for p in sub_group - ]) + self.dp_process_group) == dist.get_world_size(self.dp_process_group) - 1 + self.groups_padding.append([p.padding_size() if rank_requires_padding else 0 for p in sub_group]) # move parameters to flattened buffer if not self.offload_param: # partitioned params remain in GPU during training @@ -611,10 +604,7 @@ class DeepSpeedZeroOptimizer_Stage3(ZeROOptimizer): offset = 0 for sub_group in self.fp16_groups: sub_group_numel = sum(param.partition_numel() for param in sub_group) - self.fp16_partitioned_groups_flat.append( - device_buffer.narrow(0, - offset, - sub_group_numel)) + self.fp16_partitioned_groups_flat.append(device_buffer.narrow(0, offset, sub_group_numel)) offset += sub_group_numel else: # partitioned params offloaded to CPU when not in use # create a flat CPU memory allocation for each param group @@ -627,19 +617,15 @@ class DeepSpeedZeroOptimizer_Stage3(ZeROOptimizer): #Flat buffer may not be available for parameters that reside in NVME if not self.params_in_nvme_and_cpu or flat_offset + total_elements <= self.param_groups_fp16_flat_cpu_memory[ param_group_idx].numel(): - fp16_partitioned_group_flat = self.param_groups_fp16_flat_cpu_memory[ - param_group_idx].narrow(0, - flat_offset, - total_elements) + fp16_partitioned_group_flat = self.param_groups_fp16_flat_cpu_memory[param_group_idx].narrow( + 0, flat_offset, total_elements) print_rank_0( f"Creating a flat buffer for subgroup {i} requiring {total_elements} elements, and cumulative CPU elements {flat_offset + total_elements}", force=False) elif self.params_in_nvme_and_cpu: fp16_partitioned_group_flat = None - print_rank_0( - f"No flat buffer for sub group {i} of {total_elements} elements", - force=False) + print_rank_0(f"No flat buffer for sub group {i} of {total_elements} elements", force=False) else: assert False, "Either params are in nvme, or they are in CPU memory. This code path should not be triggered. 
Please see you max_params_in_cpu and params_in_nvme configs" @@ -652,9 +638,8 @@ class DeepSpeedZeroOptimizer_Stage3(ZeROOptimizer): # if necessary, create a pinned memory buffer to be used for swapping out # params to NVME after optimizer step - should_create_fp16_flat_reuse_buffer = any( - flattened_partition_group is None - for flattened_partition_group in self.fp16_partitioned_groups_flat) + should_create_fp16_flat_reuse_buffer = any(flattened_partition_group is None + for flattened_partition_group in self.fp16_partitioned_groups_flat) if should_create_fp16_flat_reuse_buffer: max_partition_numel, largest_partition_numel = 0, None for sub_group in self.fp16_groups: @@ -664,15 +649,14 @@ class DeepSpeedZeroOptimizer_Stage3(ZeROOptimizer): max_partition_numel = total_elements assert len(largest_partition_numel) > 0, f'Unexpected that largest partition is empty' - self.fp16_groups[0][0].nvme_swapper.reserve_partitioned_swap_space( - largest_partition_numel) + self.fp16_groups[0][0].nvme_swapper.reserve_partitioned_swap_space(largest_partition_numel) def _swap_in_sub_group_to_flat_buffer(self, flat_buffer, sub_group_id): offset = 0 - elements_in_sub_group = sum( - [t.ds_numel for t in self.fp16_partitioned_groups[sub_group_id]]) + elements_in_sub_group = sum([t.ds_numel for t in self.fp16_partitioned_groups[sub_group_id]]) assert (flat_buffer.numel() == elements_in_sub_group) - for param, partitioned_param in zip(self.fp16_groups[sub_group_id], self.fp16_partitioned_groups[sub_group_id]): + for param, partitioned_param in zip(self.fp16_groups[sub_group_id], + self.fp16_partitioned_groups[sub_group_id]): dest = flat_buffer.narrow(0, offset, partitioned_param.ds_numel) if partitioned_param.status == PartitionedParamStatus.NOT_AVAILABLE: print_rank_0( @@ -687,9 +671,7 @@ class DeepSpeedZeroOptimizer_Stage3(ZeROOptimizer): offset += partitioned_param.ds_numel def _create_next_swappable_fp32_groups(self): - reverse_order_indices = [ - i for i in range(len(self.fp32_partitioned_groups_flat)) - ] + reverse_order_indices = [i for i in range(len(self.fp32_partitioned_groups_flat))] reverse_order_indices.reverse() next_group = None @@ -702,16 +684,13 @@ class DeepSpeedZeroOptimizer_Stage3(ZeROOptimizer): def _get_sub_group_partitions(self, sub_group_id): sub_group_partitions = [] - for param, partitioned_param in zip(self.fp16_groups[sub_group_id], self.fp16_partitioned_groups[sub_group_id]): + for param, partitioned_param in zip(self.fp16_groups[sub_group_id], + self.fp16_partitioned_groups[sub_group_id]): if partitioned_param.status == PartitionedParamStatus.NOT_AVAILABLE: swap_path = param.nvme_swapper.get_path(param, True) - sub_group_partitions.append((partitioned_param, - param.partition_numel(), - swap_path)) + sub_group_partitions.append((partitioned_param, param.partition_numel(), swap_path)) else: - sub_group_partitions.append((partitioned_param, - partitioned_param.ds_numel, - None)) + sub_group_partitions.append((partitioned_param, partitioned_param.ds_numel, None)) return sub_group_partitions @@ -749,60 +728,47 @@ class DeepSpeedZeroOptimizer_Stage3(ZeROOptimizer): sub_group_partitions = self._get_sub_group_partitions(i) nvme_fp16_partitions_info.append(sub_group_partitions) nvme_fp16_num_elems.append(num_elements) - nvme_fp32_dest_tensors.append( - self.fp32_partitioned_groups_flat[i]) + nvme_fp32_dest_tensors.append(self.fp32_partitioned_groups_flat[i]) else: - unpinned_fp32_buffer = torch.empty(num_elements, - device=self.device, - dtype=torch.float) + unpinned_fp32_buffer = 
torch.empty(num_elements, device=self.device, dtype=torch.float) self._swap_in_sub_group_to_flat_buffer(unpinned_fp32_buffer, i) - self.optimizer_swapper.initialize_parameters( - parameters=[self.fp32_partitioned_groups_flat[i]], - src_tensors=[unpinned_fp32_buffer]) + self.optimizer_swapper.initialize_parameters(parameters=[self.fp32_partitioned_groups_flat[i]], + src_tensors=[unpinned_fp32_buffer]) else: num_swap_from_cpu_partitions += 1 swap_from_cpu_memory_usage += (fp32_element_size * num_elements) swappable_fp32_tensors.append(self.fp32_partitioned_groups_flat[i]) - swappable_fp16_src_tensors.append( - self.fp16_partitioned_groups_flat[i]) + swappable_fp16_src_tensors.append(self.fp16_partitioned_groups_flat[i]) else: cpu_memory_usage += (fp32_element_size * num_elements) cpu_memory_sub_groups += 1 if self.params_in_nvme_and_cpu and tensor is None: - unpinned_fp32_buffer = torch.empty(num_elements, - device=self.device, - dtype=torch.float) + unpinned_fp32_buffer = torch.empty(num_elements, device=self.device, dtype=torch.float) self._swap_in_sub_group_to_flat_buffer(unpinned_fp32_buffer, i) self.fp32_partitioned_groups_flat.append(unpinned_fp32_buffer) else: - self.fp32_partitioned_groups_flat.append( - self.fp16_partitioned_groups_flat[i].to( - self.device).clone().float().detach()) + self.fp32_partitioned_groups_flat.append(self.fp16_partitioned_groups_flat[i].to( + self.device).clone().float().detach()) - self.fp32_partitioned_groups_flat[ - i].requires_grad = True # keep this in case internal optimizer uses it + self.fp32_partitioned_groups_flat[i].requires_grad = True # keep this in case internal optimizer uses it if len(swappable_fp32_tensors) > 0: - self.optimizer_swapper.initialize_parameters( - parameters=swappable_fp32_tensors, - src_tensors=swappable_fp16_src_tensors) + self.optimizer_swapper.initialize_parameters(parameters=swappable_fp32_tensors, + src_tensors=swappable_fp16_src_tensors) if len(nvme_fp32_dest_tensors) > 0: - fp16_pinned_buffers = self.fp16_groups[0][ - 0].nvme_swapper.reserve_available_buffers() + fp16_pinned_buffers = self.fp16_groups[0][0].nvme_swapper.reserve_available_buffers() assert len(fp16_pinned_buffers) > 0 - self.optimizer_swapper.initialize_from_swapped_fp16_params( - fp16_partitions_info=nvme_fp16_partitions_info, - fp16_num_elems=nvme_fp16_num_elems, - fp16_pinned_buffers=fp16_pinned_buffers, - fp32_parameters=nvme_fp32_dest_tensors) + self.optimizer_swapper.initialize_from_swapped_fp16_params(fp16_partitions_info=nvme_fp16_partitions_info, + fp16_num_elems=nvme_fp16_num_elems, + fp16_pinned_buffers=fp16_pinned_buffers, + fp32_parameters=nvme_fp32_dest_tensors) self.fp16_groups[0][0].nvme_swapper.release_reserved_buffers() nvme_gigabytes = nvme_memory_usage / GIGA_BYTES - print_rank_0( - f'Swappable FP32 Partitions: count={num_swappable_partitions} size={nvme_gigabytes:5.2f} GB', - force=False) + print_rank_0(f'Swappable FP32 Partitions: count={num_swappable_partitions} size={nvme_gigabytes:5.2f} GB', + force=False) if self.params_in_nvme_and_cpu: print_rank_0( f'Swap from NVMe Partitions: count = {num_swap_from_nvme_partitions}, size = {swap_from_nvme_memory_usage/GIGA_BYTES:5.2f}GB', @@ -812,9 +778,8 @@ class DeepSpeedZeroOptimizer_Stage3(ZeROOptimizer): force=False) cpu_memory_gigabytes = cpu_memory_usage / GIGA_BYTES - print_rank_0( - f'In-Memory FP32 Partitions: count={cpu_memory_sub_groups} size={cpu_memory_gigabytes:5.2f} GB', - force=False) + print_rank_0(f'In-Memory FP32 Partitions: count={cpu_memory_sub_groups} 
size={cpu_memory_gigabytes:5.2f} GB', + force=False) # Clear for on-the-fly population before the optimizer step for param_group in self.optimizer.param_groups: @@ -836,8 +801,7 @@ class DeepSpeedZeroOptimizer_Stage3(ZeROOptimizer): sub_group.append(param) local_sub_group_size += param.partition_numel() - if local_sub_group_size >= sub_group_size or id(param) == id( - params_group[-1]): + if local_sub_group_size >= sub_group_size or id(param) == id(params_group[-1]): sub_groups.append(sub_group) @@ -862,9 +826,8 @@ class DeepSpeedZeroOptimizer_Stage3(ZeROOptimizer): if not self.swap_optimizer: return False - return self.optimizer_swapper.swappable_tensor( - None, - numel=self.fp16_partitioned_groups_flat_numel[sub_group_id]) + return self.optimizer_swapper.swappable_tensor(None, + numel=self.fp16_partitioned_groups_flat_numel[sub_group_id]) def _partitioned_params_swap_out(self, i): offset = 0 @@ -884,22 +847,22 @@ class DeepSpeedZeroOptimizer_Stage3(ZeROOptimizer): offset += partitioned_param.ds_numel if len(swap_fp16_params): - swap_fp16_params[0].nvme_swapper.swap_out_partitioned_params( - dst_fp16_params=swap_fp16_params, - src_fp32_params=swap_fp32_params) + swap_fp16_params[0].nvme_swapper.swap_out_partitioned_params(dst_fp16_params=swap_fp16_params, + src_fp32_params=swap_fp32_params) def initialize_optimizer_states(self): num_subgroups = len(self.fp16_groups) - largest_numel = max( - [sum([p.ds_numel for p in psg]) for psg in self.fp16_partitioned_groups]) + largest_numel = max([sum([p.ds_numel for p in psg]) for psg in self.fp16_partitioned_groups]) gradient_dtype = self.fp32_partitioned_groups_flat[0].dtype - gradient_buffer = torch.zeros(int(largest_numel), - dtype=gradient_dtype, - device=self.device) + gradient_buffer = torch.zeros(int(largest_numel), dtype=gradient_dtype, device=self.device) timer_names = set() + # State initialization for the Adagrad optimizer occurs at construction as opposed to other optimizers + # which do lazy initialization of the state at the first call to step. + is_adagrad = isinstance(self.optimizer, torch.optim.Adagrad) + if self.swap_optimizer: self.optimizer_swapper.init_timers() @@ -921,21 +884,17 @@ class DeepSpeedZeroOptimizer_Stage3(ZeROOptimizer): self._optimizer_states_and_gradient_swap_in(i, timer_names) if self.offload_optimizer and not swappable_optimizer_subgroup: - subgroup_gradient_buffer = torch.zeros(num_elements, - dtype=gradient_dtype, - device=self.device) + subgroup_gradient_buffer = torch.zeros(num_elements, dtype=gradient_dtype, device=self.device) if self.offload_optimizer_pin_memory: - subgroup_gradient_buffer = get_accelerator().pin_memory( - subgroup_gradient_buffer) + subgroup_gradient_buffer = get_accelerator().pin_memory(subgroup_gradient_buffer) self.fp32_partitioned_groups_flat[i].grad = subgroup_gradient_buffer else: - self.fp32_partitioned_groups_flat[i].grad = gradient_buffer.narrow( - 0, - 0, - num_elements) + self.fp32_partitioned_groups_flat[i].grad = gradient_buffer.narrow(0, 0, num_elements) - self._optimizer_step(i) + # Initialize the optimizer states with the flattended fp32 partition. 
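# The new Adagrad handling relies on torch.optim.Adagrad allocating its per-parameter
# state in __init__, whereas optimizers such as Adam create state lazily on the first
# step(); hence the Adagrad instance is rebuilt only after the flat fp32 partitions
# have been set up. A quick CPU-only illustration:
import torch

p = torch.nn.Parameter(torch.zeros(4))
print(len(torch.optim.Adagrad([p], lr=0.1).state))  # 1: state exists at construction
print(len(torch.optim.Adam([p], lr=0.1).state))     # 0: state appears at first step()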
+ if not is_adagrad: + self._optimizer_step(i) if swappable_param_subgroup: self._partitioned_params_swap_out(i) @@ -947,6 +906,10 @@ class DeepSpeedZeroOptimizer_Stage3(ZeROOptimizer): f'[End] Initialize optimizer states {i} / {num_subgroups} subgroups, num_elems: {num_elements}, swappable opt/param:{swappable_optimizer_subgroup}/{swappable_param_subgroup}', force=False) + # Initialize the optimizer states with the flattended fp32 partition. + if is_adagrad: + self.optimizer = torch.optim.Adagrad(self.fp32_partitioned_groups_flat, **self.optimizer.defaults) + self.stop_timers([INIT_OPTIMIZER_TIMER]) self.log_timers(timer_names) @@ -992,11 +955,8 @@ class DeepSpeedZeroOptimizer_Stage3(ZeROOptimizer): self.grad_start_offset[i][partition_id] = {} self.initialize_gradient_partition(i, param_group, partition_id) self.is_partition_reduced[i][partition_id] = False - self.first_param_index_in_partition[i][ - partition_id] = self.get_first_param_index( - i, - param_group, - partition_id) + self.first_param_index_in_partition[i][partition_id] = self.get_first_param_index( + i, param_group, partition_id) @instrument_w_nvtx def independent_gradient_partition_epilogue(self): @@ -1004,7 +964,7 @@ class DeepSpeedZeroOptimizer_Stage3(ZeROOptimizer): self.__reduce_and_partition_ipg_grads() self.report_ipg_memory_usage(f"In ipg_epilogue after reduce_ipg_grads", 0) - self.__reduce_and_partition_stream.synchronize() + self.reduce_and_partition_stream.synchronize() # if dist.get_rank() == 0: # logger.info("Params already reduced %s", self.params_already_reduced) @@ -1017,8 +977,7 @@ class DeepSpeedZeroOptimizer_Stage3(ZeROOptimizer): for i, sub_group in enumerate(self.fp16_groups): self.averaged_gradients[i] = [ self.__param_id_to_grad_partition[param.ds_id] - if param.requires_grad else torch.zeros_like(param.ds_tensor) - for param in sub_group + if param.requires_grad else torch.zeros_like(param.ds_tensor) for param in sub_group ] # self.averaged_gradients[i] = self.get_flat_partition( # self.fp16_groups[i], @@ -1087,12 +1046,12 @@ class DeepSpeedZeroOptimizer_Stage3(ZeROOptimizer): # garbage data and `self.average_tensor()` will crash because its params_to_reduce will be # empty, while reduction_list will have that garbage data. if self.elements_in_ipg_bucket > 0 and self.elements_in_ipg_bucket + param.ds_numel > self.reduce_bucket_size: - self.report_ipg_memory_usage("In ipg_remove_grads before reduce_ipg_grads", - param.ds_numel) + self.report_ipg_memory_usage("In ipg_remove_grads before reduce_ipg_grads", param.ds_numel) self.__reduce_and_partition_ipg_grads() param_id = self.get_param_id(param) + assert self.params_already_reduced[param_id] == False, \ f"The parameter {param_id} has already been reduced. \ Gradient computed twice for this partition. 
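# A minimal illustrative sketch (assuming stock PyTorch optimizers; not part of the diff above):
# the Adagrad special-casing in this hunk relies on torch.optim.Adagrad materializing its
# per-parameter state ("sum", "step") eagerly in __init__, whereas optimizers such as Adam
# only create their state lazily on the first step() call.
import torch

p = torch.nn.Parameter(torch.zeros(4))
adagrad = torch.optim.Adagrad([p], lr=0.1)
adam = torch.optim.Adam([p], lr=0.1)

print(len(adagrad.state[p]) > 0)  # True: Adagrad state exists right after construction
print(len(adam.state[p]) == 0)    # True: Adam has no state until the first step()

p.grad = torch.ones_like(p)
adam.step()
print(len(adam.state[p]) > 0)     # True: Adam populated its state lazily on step()
# Because Adagrad binds its state to the parameters passed at construction, the hunk rebuilds
# the optimizer over the flattened fp32 partitions instead of issuing a dummy _optimizer_step().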
\ @@ -1103,59 +1062,52 @@ class DeepSpeedZeroOptimizer_Stage3(ZeROOptimizer): @instrument_w_nvtx @torch.no_grad() def __add_grad_to_ipg_bucket(self, param: Parameter) -> None: - self.__reduce_and_partition_stream.wait_stream( - get_accelerator().default_stream()) + self.reduce_and_partition_stream.wait_stream(get_accelerator().default_stream()) - if self.contiguous_gradients and self.elements_in_ipg_bucket + param.grad.numel( - ) < self.reduce_bucket_size: + if self.contiguous_gradients and self.elements_in_ipg_bucket + param.grad.numel() < self.reduce_bucket_size: # move the gradient to a contiguous buffer - with get_accelerator().stream(self.__reduce_and_partition_stream): + with get_accelerator().stream(self.reduce_and_partition_stream): # move the parameter's gradient to the contiguous flat buffer - new_grad_tensor = self.__ipg_bucket_flat_buffer.narrow( - 0, - self.elements_in_ipg_bucket, - param.grad.numel()).view_as(param.grad) + new_grad_tensor = self.__ipg_bucket_flat_buffer.narrow(0, self.elements_in_ipg_bucket, + param.grad.numel()).view_as(param.grad) new_grad_tensor.copy_(param.grad, non_blocking=True) param.grad.record_stream(get_accelerator().current_stream()) param.grad.data = new_grad_tensor - self.__params_in_ipg_bucket.append(param) + self.params_in_ipg_bucket.append(param) @instrument_w_nvtx @torch.no_grad() def __reduce_and_partition_ipg_grads(self, safe_mode: bool = False) -> None: - if not self.__params_in_ipg_bucket: + if not self.params_in_ipg_bucket: return - for param in self.__params_in_ipg_bucket: + for param in self.params_in_ipg_bucket: if param.grad.numel() != param.ds_numel: - raise RuntimeError( - f"{param.grad.numel()} != {param.ds_numel} Cannot reduce scatter " - f"gradients whose size is not same as the params") + raise RuntimeError(f"{param.grad.numel()} != {param.ds_numel} Cannot reduce scatter " + f"gradients whose size is not same as the params") - self.__params_in_ipg_bucket.sort(key=lambda p: p.ds_id) + self.params_in_ipg_bucket.sort(key=lambda p: p.ds_id) - assert len(set(p.ds_id for p in self.__params_in_ipg_bucket)) == len( - self.__params_in_ipg_bucket) + assert len(set(p.ds_id for p in self.params_in_ipg_bucket)) == len(self.params_in_ipg_bucket) - while self.__param_reduce_events and self.__param_reduce_events[0].query(): - self.__param_reduce_events.popleft() - if len(self.__param_reduce_events) > self.__max_param_reduce_events: - self.__param_reduce_events.popleft().synchronize() + while self.param_reduce_events and self.param_reduce_events[0].query(): + self.param_reduce_events.popleft() + if len(self.param_reduce_events) > self.max_param_reduce_events: + self.param_reduce_events.popleft().synchronize() - with get_accelerator().stream(self.__reduce_and_partition_stream): + with get_accelerator().stream(self.reduce_and_partition_stream): if safe_mode: - assert_ints_same_as_other_ranks( - [p.ds_id for p in self.__params_in_ipg_bucket]) + assert_ints_same_as_other_ranks([p.ds_id for p in self.params_in_ipg_bucket]) - grad_partitions = self.__avg_scatter_grads(self.__params_in_ipg_bucket) - self.__partition_grads(self.__params_in_ipg_bucket, grad_partitions) + grad_partitions = self.__avg_scatter_grads(self.params_in_ipg_bucket) + self.partition_grads(self.params_in_ipg_bucket, grad_partitions) - self.__params_in_ipg_bucket.clear() + self.params_in_ipg_bucket.clear() event = get_accelerator().Event() event.record() - self.__param_reduce_events.append(event) + self.param_reduce_events.append(event) @instrument_w_nvtx def 
__avg_scatter_grads(self, params_to_reduce: List[Parameter]) -> List[Tensor]: @@ -1163,28 +1115,18 @@ class DeepSpeedZeroOptimizer_Stage3(ZeROOptimizer): full_grads_for_rank = [p.grad for p in params_to_reduce] if self.communication_data_type != self.dtype: - full_grads_for_rank = [ - g.to(self.communication_data_type) for g in full_grads_for_rank - ] + full_grads_for_rank = [g.to(self.communication_data_type) for g in full_grads_for_rank] if self.postscale_gradients and self.gradient_predivide_factor != 1.0: - full_grads_for_rank = [ - g.div(self.gradient_predivide_factor) for g in full_grads_for_rank - ] + full_grads_for_rank = [g.div(self.gradient_predivide_factor) for g in full_grads_for_rank] - grad_partitions_for_rank = reduce_scatter_coalesced(full_grads_for_rank, - self.dp_process_group) + grad_partitions_for_rank = reduce_scatter_coalesced(full_grads_for_rank, self.dp_process_group) - if self.postscale_gradients and self.gradient_predivide_factor != dist.get_world_size( - self.dp_process_group): - grad_partitions_for_rank = [ - g.mul(self.gradient_predivide_factor) for g in grad_partitions_for_rank - ] + if self.postscale_gradients and self.gradient_predivide_factor != dist.get_world_size(self.dp_process_group): + grad_partitions_for_rank = [g.mul(self.gradient_predivide_factor) for g in grad_partitions_for_rank] if self.communication_data_type != self.dtype: - grad_partitions_for_rank = [ - g.to(self.dtype) for g in grad_partitions_for_rank - ] + grad_partitions_for_rank = [g.to(self.dtype) for g in grad_partitions_for_rank] return grad_partitions_for_rank @@ -1195,11 +1137,7 @@ class DeepSpeedZeroOptimizer_Stage3(ZeROOptimizer): param_id = self.get_param_id(param) num_elements = param.partition_numel() - self.grad_position[param_id] = [ - int(i), - int(current_offset), - int(num_elements) - ] + self.grad_position[param_id] = [int(i), int(current_offset), int(num_elements)] #print(f"param id {param_id} i:{i}, ds_tensor {num_elements} numel {param.numel()}") current_offset += num_elements see_memory_usage(f"After Set Grad positions", force=False) @@ -1240,40 +1178,33 @@ class DeepSpeedZeroOptimizer_Stage3(ZeROOptimizer): # Sum across all model parallel GPUs. total_norm_cuda = get_accelerator().FloatTensor([float(total_norm)]) - dist.all_reduce(total_norm_cuda, - op=dist.ReduceOp.SUM, - group=self.dp_process_group) + dist.all_reduce(total_norm_cuda, op=dist.ReduceOp.SUM, group=self.dp_process_group) self._model_parallel_all_reduce(tensor=total_norm_cuda, op=dist.ReduceOp.SUM) total_norm = total_norm_cuda[0].item()**(1. 
/ norm_type) - if total_norm == float( - 'inf') or total_norm == -float('inf') or total_norm != total_norm: + if total_norm == float('inf') or total_norm == -float('inf') or total_norm != total_norm: total_norm = -1 return total_norm @instrument_w_nvtx - def __partition_grads(self, - params_to_release: List[Parameter], - grad_partitions: List[Tensor]) -> None: + def partition_grads(self, params_to_release: List[Parameter], grad_partitions: List[Tensor]) -> None: offload_fp32_gradients = {} offload_fp32_offsets = {} + buffers = [] for param, grad_partition in zip(params_to_release, grad_partitions): - contains_real_data = param.partition_numel() * dist.get_rank( - self.dp_process_group) < param.ds_numel + contains_real_data = param.partition_numel() * dist.get_rank(self.dp_process_group) < param.ds_numel if not contains_real_data: # this grad partition is empty - don't need to do anything param.grad = None continue # move or accumulate gradient partition to target buffer - grad_buffer = self.__param_id_to_grad_partition[param.ds_id].narrow( - 0, - 0, - grad_partition.numel()) + grad_buffer = self.__param_id_to_grad_partition[param.ds_id].narrow(0, 0, grad_partition.numel()) + buffers.append(grad_buffer) if self.micro_step_id == 0: # don't accumulate grad_buffer.copy_(grad_partition, non_blocking=True) # ensure grad buffer is a CUDA buffer to speed up the next few @@ -1284,30 +1215,28 @@ class DeepSpeedZeroOptimizer_Stage3(ZeROOptimizer): else: # if dst is CPU, copy first to src device, do the addition # there, then move back to dst. adding directly to cpu is very slow - cuda_grad_buffer = grad_buffer.to(grad_partition.device, - non_blocking=True) + cuda_grad_buffer = grad_buffer.to(grad_partition.device, non_blocking=True) cuda_grad_buffer.add_(grad_partition) grad_buffer.copy_(cuda_grad_buffer, non_blocking=True) # ensure grad buffer is a CUDA buffer to speed up the next few # operations and so it can be used asynchronously grad_buffer = cuda_grad_buffer - if hasattr(self.__inf_or_nan_tracker, "logical_or_"): - self.__inf_or_nan_tracker.logical_or_(torch.isinf(grad_buffer).any()) - self.__inf_or_nan_tracker.logical_or_(torch.isnan(grad_buffer).any()) + if hasattr(self.inf_or_nan_tracker, "logical_or_"): + self.inf_or_nan_tracker.logical_or_(torch.isinf(grad_buffer).any()) + self.inf_or_nan_tracker.logical_or_(torch.isnan(grad_buffer).any()) else: # logical_or_ not available in older versions of pytorch - self.__inf_or_nan_tracker += torch.isinf(grad_buffer).any() - self.__inf_or_nan_tracker += torch.isnan(grad_buffer).any() - self.__inf_or_nan_tracker = self.__inf_or_nan_tracker > 0 + self.inf_or_nan_tracker += torch.isinf(grad_buffer).any() + self.inf_or_nan_tracker += torch.isnan(grad_buffer).any() + self.inf_or_nan_tracker = self.inf_or_nan_tracker > 0 # offload the gradient partition if applicable if self.offload_optimizer: i, dest_offset, _ = self.grad_position[self.get_param_id(param)] if self.is_gradient_accumulation_boundary: - self.norm_for_param_grads[self.get_param_id( - param)] = self._constant_buffered_norm2(grad_buffer) + self.norm_for_param_grads[self.get_param_id(param)] = self._constant_buffered_norm2(grad_buffer) if self._swappable_optimizer_subgroup(i): if not i in offload_fp32_gradients.keys(): @@ -1317,10 +1246,8 @@ class DeepSpeedZeroOptimizer_Stage3(ZeROOptimizer): offload_fp32_gradients[i].append(grad_buffer.float()) offload_fp32_offsets[i].append(dest_offset) else: - fp32_grad_tensor = self.fp32_partitioned_groups_flat[ - i].grad.narrow(0, - dest_offset, - 
grad_buffer.numel()) + fp32_grad_tensor = self.fp32_partitioned_groups_flat[i].grad.narrow( + 0, dest_offset, grad_buffer.numel()) fp32_grad_tensor.copy_(grad_buffer) # free the gradient @@ -1329,16 +1256,17 @@ class DeepSpeedZeroOptimizer_Stage3(ZeROOptimizer): if self.offload_optimizer and self.swap_optimizer: for i in offload_fp32_gradients.keys(): - self.optimizer_swapper.swap_out_gradients( - parameter=self.fp32_partitioned_groups_flat[i], - gradient_offsets=offload_fp32_offsets[i], - gradient_tensors=offload_fp32_gradients[i]) + self.optimizer_swapper.swap_out_gradients(parameter=self.fp32_partitioned_groups_flat[i], + gradient_offsets=offload_fp32_offsets[i], + gradient_tensors=offload_fp32_gradients[i]) + return buffers def reduce_ready_partitions_and_remove_grads(self, param, i): #print_rank_0(f"Backward {debug_param2name_id_shape(param)}", force=True) self.reduce_independent_p_g_buckets_and_remove_grads(param, i) def zero_reduced_gradients(self, partition_id, i): + def are_all_related_partitions_reduced(params_id): for partition_id in self.param_to_partition_ids[i][params_id]: if not self.is_partition_reduced[i][partition_id]: @@ -1358,29 +1286,23 @@ class DeepSpeedZeroOptimizer_Stage3(ZeROOptimizer): self.sequential_execution(print_func, message) def get_grads_to_reduce(self, i, partition_id): + def get_reducible_portion(key): grad = self.param_dict[key].grad total_elements = grad.numel() start = self.grad_start_offset[i][partition_id][key] - num_elements = min( - total_elements - start, - self.partition_size[i] - - self.grad_partition_insertion_offset[i][partition_id][key]) + num_elements = min(total_elements - start, + self.partition_size[i] - self.grad_partition_insertion_offset[i][partition_id][key]) if not pg_correctness_test: if num_elements == total_elements: return grad else: - return grad.contiguous().view(-1).narrow(0, - int(start), - int(num_elements)) + return grad.contiguous().view(-1).narrow(0, int(start), int(num_elements)) else: if num_elements == total_elements: return grad.clone() else: - return grad.clone().contiguous().view(-1).narrow( - 0, - int(start), - int(num_elements)) + return grad.clone().contiguous().view(-1).narrow(0, int(start), int(num_elements)) grads_to_reduce = [] for key in self.is_grad_computed[i][partition_id]: @@ -1443,11 +1365,7 @@ class DeepSpeedZeroOptimizer_Stage3(ZeROOptimizer): for buf, synced in zip(small_bucket, self.unflatten(allreduced, small_bucket)): buf.copy_(synced) - def allreduce_no_retain(self, - bucket, - numel_per_bucket=500000000, - rank=None, - log=None): + def allreduce_no_retain(self, bucket, numel_per_bucket=500000000, rank=None, log=None): small_bucket = [] numel = 0 for tensor in bucket: @@ -1502,11 +1420,11 @@ class DeepSpeedZeroOptimizer_Stage3(ZeROOptimizer): if (current_index >= start_index and current_index < end_index): params_in_partition.append(tensor) - elif start_index > current_index and start_index < (current_index + - tensor_size): + elif start_index > current_index and start_index < (current_index + tensor_size): params_in_partition.append(tensor) - assert (first_offset == 0), "This can happen either zero or only once as this must be the first tensor in the partition" + assert (first_offset == 0 + ), "This can happen either zero or only once as this must be the first tensor in the partition" first_offset = start_index - current_index else: @@ -1566,9 +1484,7 @@ class DeepSpeedZeroOptimizer_Stage3(ZeROOptimizer): if norm_type == inf: total_norm = max(g.data.abs().max() for g in gradients) 
total_norm_cuda = get_accelerator().FloatTensor([float(total_norm)]) - dist.all_reduce(total_norm_cuda, - op=dist.ReduceOp.MAX, - group=self.dp_process_group) + dist.all_reduce(total_norm_cuda, op=dist.ReduceOp.MAX, group=self.dp_process_group) # Take max across all GPUs. self._model_parallel_all_reduce(tensor=total_norm_cuda, op=dist.ReduceOp.MAX) @@ -1579,23 +1495,18 @@ class DeepSpeedZeroOptimizer_Stage3(ZeROOptimizer): grad_norms = [] for g, p in zip(gradients, params): if is_model_parallel_parameter(p) or (self.model_parallel_rank == 0): - grad_norms.append( - g.to(get_accelerator().device_name(), - non_blocking=True).double().norm(2)) + grad_norms.append(g.to(get_accelerator().device_name(), non_blocking=True).double().norm(2)) # Sum across all model parallel GPUs. total_norm_cuda = torch.sum(torch.pow(torch.stack(grad_norms), 2)) - dist.all_reduce(total_norm_cuda, - op=dist.ReduceOp.SUM, - group=self.dp_process_group) + dist.all_reduce(total_norm_cuda, op=dist.ReduceOp.SUM, group=self.dp_process_group) self._model_parallel_all_reduce(tensor=total_norm_cuda, op=dist.ReduceOp.SUM) total_norm = total_norm_cuda.item()**(1. / norm_type) - if total_norm == float( - 'inf') or total_norm == -float('inf') or total_norm != total_norm: + if total_norm == float('inf') or total_norm == -float('inf') or total_norm != total_norm: total_norm = -1 return total_norm @@ -1603,11 +1514,7 @@ class DeepSpeedZeroOptimizer_Stage3(ZeROOptimizer): # creates a flat fused tensor from the tensor list starting at the first_offset # in the first tensor of the list. If there are not enough elements in the tensor # list then the flat tensor will be padded with zeros - def get_flat_partition(self, - tensor_list, - first_offset, - partition_size, - return_tensor_list=False): + def get_flat_partition(self, tensor_list, first_offset, partition_size, return_tensor_list=False): flat_tensor_list = [] current_size = 0 for i, tensor in enumerate(tensor_list): @@ -1630,10 +1537,7 @@ class DeepSpeedZeroOptimizer_Stage3(ZeROOptimizer): # we need a narrow view of the tensor based on the tensor offset and number of elements that # we need from this tensor if tensor_offset > 0 or num_elements < tensor.numel(): - flat_tensor_list.append(tensor.contiguous().view(-1).narrow( - 0, - int(tensor_offset), - int(num_elements))) + flat_tensor_list.append(tensor.contiguous().view(-1).narrow(0, int(tensor_offset), int(num_elements))) else: flat_tensor_list.append(tensor) @@ -1695,13 +1599,9 @@ class DeepSpeedZeroOptimizer_Stage3(ZeROOptimizer): norm_groups = [] for i, group in enumerate(self.fp16_groups): if self.offload_optimizer: - norm_groups.append( - self.complete_grad_norm_calculation_for_cpu_offload( - self.fp16_groups[i])) + norm_groups.append(self.complete_grad_norm_calculation_for_cpu_offload(self.fp16_groups[i])) else: - norm_groups.append( - self.get_grad_norm_direct(self.averaged_gradients[i], - self.fp16_groups[i])) + norm_groups.append(self.get_grad_norm_direct(self.averaged_gradients[i], self.fp16_groups[i])) return norm_groups @instrument_w_nvtx @@ -1720,22 +1620,19 @@ class DeepSpeedZeroOptimizer_Stage3(ZeROOptimizer): # release all the gradient since we have already created a necessary copy in dp_grad_partition self.zero_grad(set_to_none=True) - for grad in filter(lambda g: get_accelerator().on_accelerator(g), - self.averaged_gradients[sub_group_id]): + for grad in filter(lambda g: get_accelerator().on_accelerator(g), self.averaged_gradients[sub_group_id]): grad.record_stream(get_accelerator().current_stream()) 
self.averaged_gradients[sub_group_id] = None @instrument_w_nvtx def _prepare_sub_group(self, sub_group_id, timer_names=set()): - see_memory_usage(f'Before prepare optimizer sub group {sub_group_id}', - force=False) + see_memory_usage(f'Before prepare optimizer sub group {sub_group_id}', force=False) if self._swappable_optimizer_subgroup(sub_group_id): self._optimizer_states_and_gradient_swap_in(sub_group_id, timer_names) elif not self.offload_optimizer: self._prepare_fp32_grad_for_sub_group(sub_group_id) - see_memory_usage(f'After prepare optimizer sub group {sub_group_id}', - force=False) + see_memory_usage(f'After prepare optimizer sub group {sub_group_id}', force=False) def _optimizer_states_and_gradient_swap_in(self, sub_group_id, timer_names=set()): param_length = self.fp16_partitioned_groups_flat_numel[sub_group_id] @@ -1744,8 +1641,7 @@ class DeepSpeedZeroOptimizer_Stage3(ZeROOptimizer): f'Parameter {fp32_param_id} of numel={param_length} is not swappable' OPTIMIZER_SWAP_IN_STATE = 'optimizer_swap_in_state' - see_memory_usage(f'pre-step Before swapping in optimizer tensors {sub_group_id}', - force=False) + see_memory_usage(f'pre-step Before swapping in optimizer tensors {sub_group_id}', force=False) self.start_timers([OPTIMIZER_SWAP_IN_STATE]) self.optimizer_swapper.swap_in_optimizer_state( @@ -1754,21 +1650,18 @@ class DeepSpeedZeroOptimizer_Stage3(ZeROOptimizer): self.stop_timers([OPTIMIZER_SWAP_IN_STATE]) timer_names.add(OPTIMIZER_SWAP_IN_STATE) - see_memory_usage(f'pre-step After swapping in optimizer tensors {sub_group_id}', - force=False) + see_memory_usage(f'pre-step After swapping in optimizer tensors {sub_group_id}', force=False) @instrument_w_nvtx def _release_sub_group(self, sub_group_id, timer_names=set()): - see_memory_usage(f'Before release optimizer sub group {sub_group_id}', - force=False) + see_memory_usage(f'Before release optimizer sub group {sub_group_id}', force=False) # get rid of the fp32 gradients. 
Not needed anymore if not self.offload_optimizer: self.fp32_partitioned_groups_flat[sub_group_id].grad = None if self._swappable_optimizer_subgroup(sub_group_id): self._optimizer_states_and_gradient_swap_out(sub_group_id, timer_names) - see_memory_usage(f'After release optimizer sub group {sub_group_id}', - force=False) + see_memory_usage(f'After release optimizer sub group {sub_group_id}', force=False) # create a flat tensor aligned at the alignment boundary @instrument_w_nvtx @@ -1781,9 +1674,7 @@ class DeepSpeedZeroOptimizer_Stage3(ZeROOptimizer): if remaining: elements_to_add = alignment - remaining - pad_tensor = torch.zeros(elements_to_add, - device=tensor_list[0].device, - dtype=tensor_list[0].dtype) + pad_tensor = torch.zeros(elements_to_add, device=tensor_list[0].device, dtype=tensor_list[0].dtype) padded_tensor_list = tensor_list + [pad_tensor] num_elements = num_elements + elements_to_add @@ -1799,20 +1690,15 @@ class DeepSpeedZeroOptimizer_Stage3(ZeROOptimizer): f'Parameter {fp32_param_id} of numel={param_length} is not swappable' OPTIMIZER_SWAP_OUT_STATE = 'optimizer_swap_out_state' - see_memory_usage( - f'post-step Before swapping out optimizer tensors {sub_group_id}', - force=False) + see_memory_usage(f'post-step Before swapping out optimizer tensors {sub_group_id}', force=False) self.start_timers([OPTIMIZER_SWAP_OUT_STATE]) self.optimizer_swapper.swap_out_optimizer_state( parameter=self.fp32_partitioned_groups_flat[sub_group_id], - async_swap=self.next_swappable_fp32_partitioned_groups[sub_group_id] - is not None) + async_swap=self.next_swappable_fp32_partitioned_groups[sub_group_id] is not None) self.stop_timers([OPTIMIZER_SWAP_OUT_STATE]) - see_memory_usage( - f'post-step After swapping out optimizer tensors {sub_group_id}', - force=False) + see_memory_usage(f'post-step After swapping out optimizer tensors {sub_group_id}', force=False) timer_names.add(OPTIMIZER_SWAP_OUT_STATE) # get rid of the fp32 gradients. Not needed anymore @@ -1836,12 +1722,6 @@ class DeepSpeedZeroOptimizer_Stage3(ZeROOptimizer): see_memory_usage('After overflow after clearing gradients', force=False) - if dist.get_rank() == 0: - overflow_msg = f"[deepspeed] OVERFLOW! Rank {dist.get_rank()} Skipping step." - if self.dtype == torch.half: - overflow_msg += f" Attempted loss scale: {prev_scale}, reducing to {self.loss_scale}" - logger.info(overflow_msg) - @instrument_w_nvtx def _overflow_check_and_loss_scale_update(self): @@ -1887,9 +1767,7 @@ class DeepSpeedZeroOptimizer_Stage3(ZeROOptimizer): def override_loss_scale(self, loss_scale): if loss_scale != self.external_loss_scale: - logger.info( - f'[deepspeed] setting loss scale from {self.external_loss_scale} -> {loss_scale}' - ) + logger.info(f'[deepspeed] setting loss scale from {self.external_loss_scale} -> {loss_scale}') self.custom_loss_scaler = True self.external_loss_scale = loss_scale @@ -1943,7 +1821,7 @@ class DeepSpeedZeroOptimizer_Stage3(ZeROOptimizer): # warn user about caching allocator flushes memory_stats = get_accelerator().memory_stats() alloc_retries = memory_stats["num_alloc_retries"] if memory_stats != None else 0 - if alloc_retries > self.__n_caching_allocator_flushes: + if alloc_retries > self.n_caching_allocator_flushes: if dist.get_rank() == 0: logger.warning( "%d pytorch allocator cache flushes since last step. 
this happens " @@ -1953,8 +1831,8 @@ class DeepSpeedZeroOptimizer_Stage3(ZeROOptimizer): "make the cache flushes go away consider adding " "get_accelerator().empty_cache() calls in your training loop to ensure " "that all ranks flush their caches at the same time", - alloc_retries - self.__n_caching_allocator_flushes) - self.__n_caching_allocator_flushes = alloc_retries + alloc_retries - self.n_caching_allocator_flushes) + self.n_caching_allocator_flushes = alloc_retries def dump_pre_step_gradients(self, debug_fp32_grads): # Dump gradient norms for debugging @@ -1971,21 +1849,15 @@ class DeepSpeedZeroOptimizer_Stage3(ZeROOptimizer): def dump_post_step_gradients(self): # Dump gradient norms for debugging for i, group in enumerate(self.fp16_groups): - print( - f'Post-Step Dump Norms for Group {i} FP16P, FP16DS, FP16FLAT, FP32FLAT') + print(f'Post-Step Dump Norms for Group {i} FP16P, FP16DS, FP16FLAT, FP32FLAT') unflat_fp16 = self.unflatten(self.fp16_groups_flat[i], self.fp16_groups[i]) - unflat_fp32 = self.unflatten(self.fp32_partitioned_groups_flat[i], - self.fp16_groups[i]) + unflat_fp32 = self.unflatten(self.fp32_partitioned_groups_flat[i], self.fp16_groups[i]) for j, p in enumerate(self.fp16_groups[i]): param_id = self.get_param_id(p) param_norm = float(p.data.float().norm(2)) ds_norm = float(p.ds_tensor.data.float().norm(2)) - unflat_norm = [ - float(t.data.float().norm(2)) - for t in [unflat_fp16[j], - unflat_fp32[j]] - ] + unflat_norm = [float(t.data.float().norm(2)) for t in [unflat_fp16[j], unflat_fp32[j]]] norm_list = [param_norm, ds_norm] + unflat_norm print(f'Post-Step Norms {i} {param_id} = {norm_list}') @@ -2022,16 +1894,14 @@ class DeepSpeedZeroOptimizer_Stage3(ZeROOptimizer): @instrument_w_nvtx def has_overflow(self, partition_gradients=True): if partition_gradients: - with get_accelerator().stream(self.__reduce_and_partition_stream): - self.local_overflow = bool(self.__inf_or_nan_tracker.item()) - self.__inf_or_nan_tracker.zero_() + with get_accelerator().stream(self.reduce_and_partition_stream): + self.local_overflow = bool(self.inf_or_nan_tracker.item()) + self.inf_or_nan_tracker.zero_() overflow = self.local_overflow #overflow = self.has_overflow_partitioned_grads_serial() overflow_gpu = get_accelerator().ByteTensor([overflow]) - dist.all_reduce(overflow_gpu, - op=dist.ReduceOp.MAX, - group=self.dp_process_group) + dist.all_reduce(overflow_gpu, op=dist.ReduceOp.MAX, group=self.dp_process_group) else: params = [] @@ -2100,16 +1970,13 @@ class DeepSpeedZeroOptimizer_Stage3(ZeROOptimizer): """get fp32 gradient partition dictionary accessed as grad_dict[parameter_group_index][parameter_index] """ - self.__reduce_and_partition_stream.synchronize() + self.reduce_and_partition_stream.synchronize() grad_dict = collections.defaultdict(dict) if self.offload_optimizer: for group in self.fp16_groups: for param_idx, param in enumerate(group): group_idx, dest_offset, num_elements = self.grad_position[self.get_param_id(param)] - fp32_grad = self.fp32_partitioned_groups_flat[group_idx].grad.narrow( - 0, - dest_offset, - num_elements) + fp32_grad = self.fp32_partitioned_groups_flat[group_idx].grad.narrow(0, dest_offset, num_elements) grad_dict[group_idx][param_idx] = fp32_grad else: for group_idx, group in self.averaged_gradients.items(): @@ -2125,8 +1992,7 @@ class DeepSpeedZeroOptimizer_Stage3(ZeROOptimizer): my_rank = dist.get_rank(group=self.dp_process_group) partitions = [ reduce_buffer.narrow(0, - fp32_state.numel() * i, - fp32_state.numel()) for i in range(self.partition_count) + 
fp32_state.numel() * i, fp32_state.numel()) for i in range(self.partition_count) ] partitions[my_rank].data.copy_(fp32_state.data, non_blocking=False) @@ -2138,14 +2004,12 @@ class DeepSpeedZeroOptimizer_Stage3(ZeROOptimizer): if not param.requires_grad: return None - self.__reduce_and_partition_stream.synchronize() + self.reduce_and_partition_stream.synchronize() if self.offload_optimizer: group_idx, dest_offset, num_elements = self.grad_position[self.get_param_id(param)] - fp32_grad = self.fp32_partitioned_groups_flat[group_idx].grad.narrow( - 0, - dest_offset, - num_elements).to(device=param.device) + fp32_grad = self.fp32_partitioned_groups_flat[group_idx].grad.narrow(0, dest_offset, + num_elements).to(device=param.device) else: fp32_grad = self.__param_id_to_grad_partition[param.ds_id].float() @@ -2155,7 +2019,7 @@ class DeepSpeedZeroOptimizer_Stage3(ZeROOptimizer): if not param.requires_grad: return None - self.__reduce_and_partition_stream.synchronize() + self.reduce_and_partition_stream.synchronize() group_idx, dest_offset, num_elements = self.grad_position[self.get_param_id(param)] if self._swappable_optimizer_subgroup(group_idx): @@ -2163,14 +2027,10 @@ class DeepSpeedZeroOptimizer_Stage3(ZeROOptimizer): fp32_param = self.fp32_partitioned_groups_flat[group_idx] if optim_state_key is None: - fp32_opt_state = fp32_param.narrow(0, - dest_offset, - num_elements).to(device=param.device) + fp32_opt_state = fp32_param.narrow(0, dest_offset, num_elements).to(device=param.device) else: fp32_opt_state = self.optimizer.state[fp32_param][optim_state_key].narrow( - 0, - dest_offset, - num_elements).to(device=param.device) + 0, dest_offset, num_elements).to(device=param.device) hp_param = self._fp32_state_allgather(param, fp32_opt_state) if self._swappable_optimizer_subgroup(group_idx): @@ -2240,10 +2100,8 @@ class DeepSpeedZeroOptimizer_Stage3(ZeROOptimizer): for key, value in self.optimizer.state[p].items(): if torch.is_tensor(value): padded_lens = [t.numel() for t in self.fp16_partitioned_groups[i]] - lean_state[key] = self._get_lean_tensors( - value, - self.fp16_partitioned_groups[i], - self.groups_padding[i]) + lean_state[key] = self._get_lean_tensors(value, self.fp16_partitioned_groups[i], + self.groups_padding[i]) lean_flat_len = sum([t.numel() for t in lean_state[key]]) else: lean_state[key] = value @@ -2256,9 +2114,7 @@ class DeepSpeedZeroOptimizer_Stage3(ZeROOptimizer): # Return group tensor after removing paddings added for alignment to DP world size. groups_without_padding = [] for i, group in enumerate(groups_with_padding): - lean_group = self._get_lean_tensors(group, - self.fp16_partitioned_groups[i], - self.groups_padding[i]) + lean_group = self._get_lean_tensors(group, self.fp16_partitioned_groups[i], self.groups_padding[i]) groups_without_padding.append(lean_group) return groups_without_padding @@ -2300,14 +2156,11 @@ class DeepSpeedZeroOptimizer_Stage3(ZeROOptimizer): torch.save(checkpoint, "saved.pth") """ if self.elastic_checkpoint: - raise NotImplementedError( - "ZeRO-3 does not yet support elastic checkpointing, please disable for now." - ) + raise NotImplementedError("ZeRO-3 does not yet support elastic checkpointing, please disable for now.") if self.swap_optimizer or self.params_in_nvme_and_cpu: raise NotImplementedError( - "ZeRO-3 does not yet support checkpointing with NVMe offloading, please disable for now." 
- ) + "ZeRO-3 does not yet support checkpointing with NVMe offloading, please disable for now.") return self._rigid_state_dict() @@ -2329,7 +2182,8 @@ class DeepSpeedZeroOptimizer_Stage3(ZeROOptimizer): # Restore base optimizer fp32 weights from ZeRO fp16 weights def _restore_from_bit16_weights(self): - for fp16_partitions, fp32_partition in zip(self.fp16_partitioned_groups_flat, self.fp32_partitioned_groups_flat): + for fp16_partitions, fp32_partition in zip(self.fp16_partitioned_groups_flat, + self.fp32_partitioned_groups_flat): fp32_partition.data.copy_(fp16_partitions.data) # Refresh the fp32 master params from the fp16 copies. @@ -2348,9 +2202,7 @@ class DeepSpeedZeroOptimizer_Stage3(ZeROOptimizer): local_state_partitions = [] for param_index, param_slices in enumerate(param_partitions): - flattened_merged_tensor = self.flatten_dense_tensors_aligned( - param_slices, - alignment) + flattened_merged_tensor = self.flatten_dense_tensors_aligned(param_slices, alignment) new_partitions = self.get_data_parallel_partitions(flattened_merged_tensor) local_state_partitions.append(new_partitions[partition_id]) @@ -2368,15 +2220,10 @@ class DeepSpeedZeroOptimizer_Stage3(ZeROOptimizer): base_optimizer_group_states = [] for i in range(len(self.optimizer.param_groups)): partition_states = {} - all_partition_group_states = [ - sd['base_optimizer_state'][i] for sd in all_state_dict - ] + all_partition_group_states = [sd['base_optimizer_state'][i] for sd in all_state_dict] for key in all_partition_group_states[0].keys(): - all_partition_states = [ - all_states[key] for all_states in all_partition_group_states - ] - partition_states[key] = self._get_flattened_partition( - all_partition_states) + all_partition_states = [all_states[key] for all_states in all_partition_group_states] + partition_states[key] = self._get_flattened_partition(all_partition_states) base_optimizer_group_states.append(partition_states) for i, group in enumerate(self.optimizer.param_groups): @@ -2410,9 +2257,8 @@ class DeepSpeedZeroOptimizer_Stage3(ZeROOptimizer): # update fp16 unflattened params for sub_group_id in range(len(self.fp16_partitioned_groups_flat)): - updated_params = self.unflatten( - self.fp16_partitioned_groups_flat[sub_group_id], - self.fp16_partitioned_groups[sub_group_id]) + updated_params = self.unflatten(self.fp16_partitioned_groups_flat[sub_group_id], + self.fp16_partitioned_groups[sub_group_id]) for partitioned_param, q in zip(self.fp16_partitioned_groups[sub_group_id], updated_params): partitioned_param.data = q.data @@ -2449,18 +2295,14 @@ class DeepSpeedZeroOptimizer_Stage3(ZeROOptimizer): """ if self.elastic_checkpoint: - raise NotImplementedError( - "ZeRO-3 does not yet support elastic checkpointing, please disable for now." - ) + raise NotImplementedError("ZeRO-3 does not yet support elastic checkpointing, please disable for now.") if self.swap_optimizer or self.params_in_nvme_and_cpu: raise NotImplementedError( - "ZeRO-3 does not yet support checkpointing with NVMe offloading, please disable for now." 
- ) + "ZeRO-3 does not yet support checkpointing with NVMe offloading, please disable for now.") - self._rigid_load_state_dict( - state_dict_list[dist.get_rank(group=self.dp_process_group)], - load_optimizer_states=load_optimizer_states) + self._rigid_load_state_dict(state_dict_list[dist.get_rank(group=self.dp_process_group)], + load_optimizer_states=load_optimizer_states) if len(self.persistent_parameters) > 0: self.persistent_parameters[0].partition(self.persistent_parameters) @@ -2473,6 +2315,9 @@ class DeepSpeedZeroOptimizer_Stage3(ZeROOptimizer): if len(self.persistent_parameters) > 0: self.persistent_parameters[0].all_gather(self.persistent_parameters) + def empty_partition_cache(self): + self.parameter_offload.empty_partition_cache() + def _handle_overflow(cpu_sum, x, i): import math @@ -2483,9 +2328,7 @@ def _handle_overflow(cpu_sum, x, i): if not math.isfinite(float(v)): t_i = v_i break - logger.info( - f"rank {rank} detected overflow {cpu_sum} in tensor {i}:{t_i} shape {x.shape}" - ) + logger.info(f"rank {rank} detected overflow {cpu_sum} in tensor {i}:{t_i} shape {x.shape}") def estimate_zero3_model_states_mem_needs(total_params, @@ -2508,8 +2351,7 @@ def estimate_zero3_model_states_mem_needs(total_params, if zero_init: cpu_mem = total_params * 18 * gpus_factor * additional_buffer_factor else: - cpu_mem = total_params * max(4 * num_gpus_per_node, - 18 * gpus_factor) * additional_buffer_factor + cpu_mem = total_params * max(4 * num_gpus_per_node, 18 * gpus_factor) * additional_buffer_factor else: gpu_mem = largest_layer_memory + int(2 * total_params / total_gpus) @@ -2517,8 +2359,7 @@ def estimate_zero3_model_states_mem_needs(total_params, if zero_init: cpu_mem = total_params * 16 * gpus_factor * additional_buffer_factor else: - cpu_mem = total_params * max(4 * num_gpus_per_node, - 16 * gpus_factor) * additional_buffer_factor + cpu_mem = total_params * max(4 * num_gpus_per_node, 16 * gpus_factor) * additional_buffer_factor else: gpu_mem = largest_layer_memory + int(18 * total_params / total_gpus) if zero_init: @@ -2531,9 +2372,7 @@ def estimate_zero3_model_states_mem_needs(total_params, def model_to_params(model): # shared params calculated only once - total_params = sum( - dict((p.data_ptr(), - p.numel()) for p in model.parameters()).values()) + total_params = sum(dict((p.data_ptr(), p.numel()) for p in model.parameters()).values()) largest_layer_params = 0 for m in model.modules(): @@ -2568,12 +2407,11 @@ def estimate_zero3_model_states_mem_needs_all_live(model, total_params, largest_layer_params = model_to_params(model) - estimate_zero3_model_states_mem_needs_all_cold( - total_params=total_params, - largest_layer_params=largest_layer_params, - num_gpus_per_node=num_gpus_per_node, - num_nodes=num_nodes, - additional_buffer_factor=additional_buffer_factor) + estimate_zero3_model_states_mem_needs_all_cold(total_params=total_params, + largest_layer_params=largest_layer_params, + num_gpus_per_node=num_gpus_per_node, + num_nodes=num_nodes, + additional_buffer_factor=additional_buffer_factor) def estimate_zero3_model_states_mem_needs_all_cold(total_params, @@ -2599,6 +2437,7 @@ def estimate_zero3_model_states_mem_needs_all_cold(total_params, - ``additional_buffer_factor``: estimation factor (defaults to 1.5): """ + def format_options(cpu_offload, cpu_offload_params, zero_init): enabled = [] padded_cpu_str = f'{OffloadDeviceEnum.cpu:4}' @@ -2630,11 +2469,9 @@ def estimate_zero3_model_states_mem_needs_all_cold(total_params, cpu_offload=cpu_offload, 
cpu_offload_params=cpu_offload_params, zero_init=zero_init, - additional_buffer_factor=additional_buffer_factor - ) + additional_buffer_factor=additional_buffer_factor) options_str = format_options(cpu_offload=cpu_offload, cpu_offload_params=cpu_offload_params, zero_init=zero_init) - print( - f" {cpu_mem/2**30:7.2f}GB | {gpu_mem/2**30:6.2f}GB | {options_str}") + print(f" {cpu_mem/2**30:7.2f}GB | {gpu_mem/2**30:6.2f}GB | {options_str}") diff --git a/deepspeed/runtime/zero/stage_1_and_2.py b/deepspeed/runtime/zero/stage_1_and_2.py index aef25135320c7402f9f82bcb85508facb396fc97..f5ab3982b4d6096905cc5a5ee358f46a5dbd2cb9 100755 --- a/deepspeed/runtime/zero/stage_1_and_2.py +++ b/deepspeed/runtime/zero/stage_1_and_2.py @@ -1,6 +1,7 @@ -''' -Copyright 2019 The Microsoft DeepSpeed Team -''' +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team import torch import os @@ -10,14 +11,8 @@ from collections import OrderedDict from deepspeed.runtime import ZeROOptimizer from deepspeed.runtime.fp16.loss_scaler import CreateLossScaler -from deepspeed.runtime.utils import (bwc_tensor_model_parallel_rank, - get_global_norm, - empty_cache, - see_memory_usage, - inf, - is_model_parallel_parameter, - align_dense_tensors, - all_gather_dp_groups) +from deepspeed.runtime.utils import (bwc_tensor_model_parallel_rank, get_global_norm, empty_cache, see_memory_usage, + inf, is_model_parallel_parameter, align_dense_tensors, all_gather_dp_groups) from deepspeed.runtime.zero.config import ZeroStageEnum from deepspeed.runtime.zero.offload_config import OffloadDeviceEnum @@ -30,14 +25,9 @@ from deepspeed.runtime.constants import PIPE_REPLICATED from deepspeed.accelerator import get_accelerator from deepspeed.ops.op_builder import UtilsBuilder -from deepspeed.checkpoint.constants import (DS_VERSION, - GROUP_PADDINGS, - PARTITION_COUNT, - SINGLE_PARTITION_OF_FP32_GROUPS, - BASE_OPTIMIZER_STATE, - CLIP_GRAD, - ZERO_STAGE, - PARAM_SLICE_MAPPINGS) +from deepspeed.checkpoint.constants import (DS_VERSION, GROUP_PADDINGS, PARTITION_COUNT, + SINGLE_PARTITION_OF_FP32_GROUPS, BASE_OPTIMIZER_STATE, CLIP_GRAD, + ZERO_STAGE, PARAM_SLICE_MAPPINGS) from deepspeed.utils import link_hp_params from deepspeed.checkpoint import enable_universal_checkpoint @@ -53,10 +43,8 @@ def input(msg): def split_half_float_double(tensors): device_type = get_accelerator().device_name() dtypes = [ - "torch.{}.HalfTensor".format(device_type), - "torch.{}.FloatTensor".format(device_type), - "torch.{}.DoubleTensor".format(device_type), - "torch.{}.BFloat16Tensor".format(device_type) + "torch.{}.HalfTensor".format(device_type), "torch.{}.FloatTensor".format(device_type), + "torch.{}.DoubleTensor".format(device_type), "torch.{}.BFloat16Tensor".format(device_type) ] buckets = [] for i, dtype in enumerate(dtypes): @@ -110,6 +98,7 @@ class DeepSpeedZeroOptimizer(ZeROOptimizer): For usage examples, refer to TODO: DeepSpeed Tutorial """ + def __init__(self, init_optimizer, param_names, @@ -168,6 +157,7 @@ class DeepSpeedZeroOptimizer(ZeROOptimizer): # ZeRO stage 1 (False) or 2 (True) self.partition_gradients = partition_grads + self.zero_stage_string = "ZeRO-2" if partition_grads else "ZeRO-1" self.timers = timers @@ -179,8 +169,7 @@ class DeepSpeedZeroOptimizer(ZeROOptimizer): self.deepspeed_adam_offload = cpu_offload - self.device = get_accelerator().current_device_name( - ) if not self.cpu_offload else 'cpu' + self.device = get_accelerator().current_device_name() if not self.cpu_offload else 'cpu' self.dp_process_group = 
dp_process_group @@ -195,9 +184,7 @@ class DeepSpeedZeroOptimizer(ZeROOptimizer): #For MoE models this maybe different for different param group #It will be modified during MoE setup later in the init - self.real_dp_process_group = [ - dp_process_group for i in range(len(self.optimizer.param_groups)) - ] + self.real_dp_process_group = [dp_process_group for i in range(len(self.optimizer.param_groups))] self.partition_count = [dp_size for i in range(len(self.optimizer.param_groups))] self.is_gradient_accumulation_boundary = True @@ -233,12 +220,16 @@ class DeepSpeedZeroOptimizer(ZeROOptimizer): self.fp16_master_weights_and_gradients = fp16_master_weights_and_gradients if self.fp16_master_weights_and_gradients: - assert self.cpu_offload and type(self.optimizer) in [DeepSpeedCPUAdam], f"fp16_master_and_gradients requires optimizer to support keeping fp16 master and gradients while keeping the optimizer states in fp32. Currently only supported using ZeRO-Offload with DeepSpeedCPUAdam. But current setting is ZeRO-Offload:{self.cpu_offload} and optimizer type {type(self.optimizer)}. Either disable fp16_master_weights_and_gradients or enable ZeRO-2 Offload with DeepSpeedCPUAdam" + assert self.cpu_offload and type(self.optimizer) in [DeepSpeedCPUAdam], \ + f"fp16_master_and_gradients requires optimizer to support keeping fp16 master and gradients while keeping the optimizer states in fp32."\ + f"Currently only supported using ZeRO-Offload with DeepSpeedCPUAdam. But current setting is ZeRO-Offload:{self.cpu_offload} and optimizer type {type(self.optimizer)}." \ + f"Either disable fp16_master_weights_and_gradients or enable {self.zero_stage_string} Offload with DeepSpeedCPUAdam." if self.reduce_scatter: - assert self.communication_data_type in (torch.float16, torch.bfloat16), f"ZeRO-2 supports only float16 or bfloat16 communication_data_type with reduce scatter enabled. Got: '{self.communication_data_type}'" - assert self.gradient_predivide_factor == 1.0, "gradient_predivide_factor != 1.0 is not yet supported with ZeRO-2 with reduce scatter enabled" - assert self.postscale_gradients, "pre-scale gradients is not yet supported with ZeRO-2 with reduce scatter enabled" + valid_reduce_scatter_dtypes = (torch.float16, torch.bfloat16, torch.float32) + assert self.communication_data_type in valid_reduce_scatter_dtypes, f"{self.zero_stage_string} supports {valid_reduce_scatter_dtypes} communication_data_type with reduce scatter enabled. 
Got: '{self.communication_data_type}'" + assert self.gradient_predivide_factor == 1.0, "gradient_predivide_factor != 1.0 is not yet supported with {self.zero_stage_string} with reduce scatter enabled" + assert self.postscale_gradients, "pre-scale gradients is not yet supported with {self.zero_stage_string} with reduce scatter enabled" # param flattened by groups self.bit16_groups = [] @@ -272,7 +263,9 @@ class DeepSpeedZeroOptimizer(ZeROOptimizer): # align nccl all-gather send buffers to 4-byte boundary self.nccl_start_alignment_factor = 2 # 4-byte alignment/sizeof(fp16) = 2 - assert (allgather_bucket_size % self.nccl_start_alignment_factor == 0), f"allgather_bucket_size must be a multiple of nccl_start_alignment_factor, {self.nccl_start_alignment_factor} " + assert ( + allgather_bucket_size % self.nccl_start_alignment_factor == 0 + ), f"allgather_bucket_size must be a multiple of nccl_start_alignment_factor, {self.nccl_start_alignment_factor} " self.all_reduce_print = False self.dtype = self.optimizer.param_groups[0]['params'][0].dtype @@ -289,9 +282,7 @@ class DeepSpeedZeroOptimizer(ZeROOptimizer): # push this group to list before modify # TODO: Explore simplification that avoids the extra book-keeping by pushing the reordered group - trainable_parameters = [ - param for param in param_group['params'] if param.requires_grad - ] + trainable_parameters = [param for param in param_group['params'] if param.requires_grad] self.bit16_groups.append(trainable_parameters) # not sure why apex was cloning the weights before flattening @@ -309,9 +300,7 @@ class DeepSpeedZeroOptimizer(ZeROOptimizer): # to the same rank, instead they will belong to 3 ranks (r_m+2, r_m+1, r_m). if self.round_robin_gradients: round_robin_tensors, round_robin_indices = self._round_robin_reorder( - self.bit16_groups[i], - dist.get_world_size(group=self.real_dp_process_group[i]) - ) + self.bit16_groups[i], dist.get_world_size(group=self.real_dp_process_group[i])) else: round_robin_tensors = self.bit16_groups[i] round_robin_indices = list(range(len(self.bit16_groups[i]))) @@ -323,15 +312,12 @@ class DeepSpeedZeroOptimizer(ZeROOptimizer): self.bit16_groups_flat.append( self.flatten_dense_tensors_aligned( self.round_robin_bit16_groups[i], - self.nccl_start_alignment_factor * - dist.get_world_size(group=self.real_dp_process_group[i])).to( + self.nccl_start_alignment_factor * dist.get_world_size(group=self.real_dp_process_group[i])).to( get_accelerator().current_device_name())) - see_memory_usage(f"After flattening and moving param group {i} to GPU", - force=False) + see_memory_usage(f"After flattening and moving param group {i} to GPU", force=False) # Record padding required for alignment - if partition_id == dist.get_world_size( - group=self.real_dp_process_group[i]) - 1: + if partition_id == dist.get_world_size(group=self.real_dp_process_group[i]) - 1: padding = self.bit16_groups_flat[i].numel() - sum( [t.numel() for t in self.round_robin_bit16_groups[i]]) else: @@ -339,36 +325,29 @@ class DeepSpeedZeroOptimizer(ZeROOptimizer): self.groups_padding.append(padding) if dist.get_rank(group=self.real_dp_process_group[i]) == 0: - see_memory_usage( - f"After Flattening and after emptying param group {i} cache", - force=False) + see_memory_usage(f"After Flattening and after emptying param group {i} cache", force=False) # set model bit16 weight to slices of flattened buffer self._update_model_bit16_weights(i) # divide the flat weights into near equal partition equal to the data parallel degree # each process will compute on a 
different part of the partition - data_parallel_partitions = self.get_data_parallel_partitions( - self.bit16_groups_flat[i], - i) + data_parallel_partitions = self.get_data_parallel_partitions(self.bit16_groups_flat[i], i) self.parallel_partitioned_bit16_groups.append(data_parallel_partitions) # verify that data partition start locations are 4-byte aligned for partitioned_data in data_parallel_partitions: - assert (partitioned_data.data_ptr() % - (2 * self.nccl_start_alignment_factor) == 0) + assert (partitioned_data.data_ptr() % (2 * self.nccl_start_alignment_factor) == 0) # A partition of the fp32 master weights that will be updated by this process. # Note that the params in single_partition_of_fp32_groups is cloned and detached # from the origin params of the model. if not fp16_master_weights_and_gradients: - self.single_partition_of_fp32_groups.append( - self.parallel_partitioned_bit16_groups[i][partition_id].to( - self.device).clone().float().detach()) + self.single_partition_of_fp32_groups.append(self.parallel_partitioned_bit16_groups[i][partition_id].to( + self.device).clone().float().detach()) else: - self.single_partition_of_fp32_groups.append( - self.parallel_partitioned_bit16_groups[i][partition_id].to( - self.device).clone().half().detach()) + self.single_partition_of_fp32_groups.append(self.parallel_partitioned_bit16_groups[i][partition_id].to( + self.device).clone().half().detach()) # Set local optimizer to have flat params of its own partition. # After this, the local optimizer will only contain its own partition of params. @@ -377,12 +356,9 @@ class DeepSpeedZeroOptimizer(ZeROOptimizer): i].requires_grad = True # keep this in case internal optimizer uses it param_group['params'] = [self.single_partition_of_fp32_groups[i]] - partition_size = len(self.bit16_groups_flat[i]) / dist.get_world_size( - group=self.real_dp_process_group[i]) + partition_size = len(self.bit16_groups_flat[i]) / dist.get_world_size(group=self.real_dp_process_group[i]) params_in_partition, params_not_in_partition, first_offset = self.get_partition_info( - self.round_robin_bit16_groups[i], - partition_size, - partition_id) + self.round_robin_bit16_groups[i], partition_size, partition_id) self.partition_size.append(partition_size) self.params_in_partition.append(params_in_partition) @@ -399,8 +375,7 @@ class DeepSpeedZeroOptimizer(ZeROOptimizer): self.reduce_bucket_size = int(reduce_bucket_size) self.allgather_bucket_size = int(allgather_bucket_size) - self.reduction_event = get_accelerator().Event(enable_timing=False, - blocking=False) + self.reduction_event = get_accelerator().Event(enable_timing=False, blocking=False) self.reduction_stream = get_accelerator().Stream() self.cpu_computation_stream = get_accelerator().Stream() self.copy_grad_stream = get_accelerator().Stream() @@ -449,18 +424,12 @@ class DeepSpeedZeroOptimizer(ZeROOptimizer): self.local_overflow = False self.grad_position = {} self.temp_grad_buffer_for_cpu_offload = get_accelerator().pin_memory( - torch.zeros(largest_param_numel, - device=self.device, - dtype=self.dtype)) - self.temp_grad_buffer_for_gpu_offload = torch.zeros( - largest_param_numel, - device=get_accelerator().current_device_name(), - dtype=self.dtype) + torch.zeros(largest_param_numel, device=self.device, dtype=self.dtype)) + self.temp_grad_buffer_for_gpu_offload = torch.zeros(largest_param_numel, + device=get_accelerator().current_device_name(), + dtype=self.dtype) for i, params_group in enumerate(self.bit16_groups): - self.get_grad_position(i, - 
self.params_in_partition[i], - self.first_offset[i], - self.partition_size[i]) + self.get_grad_position(i, self.params_in_partition[i], self.first_offset[i], self.partition_size[i]) # mapping from parameter to partition that it belongs to self.param_to_partition_ids = {} @@ -537,8 +506,7 @@ class DeepSpeedZeroOptimizer(ZeROOptimizer): for lp in self.bit16_groups[i]: if lp._hp_mapping is not None: lp_name = self.param_names[lp] - param_mapping_per_group[ - lp_name] = lp._hp_mapping.get_hp_fragment_address() + param_mapping_per_group[lp_name] = lp._hp_mapping.get_hp_fragment_address() param_mapping.append(param_mapping_per_group) return param_mapping @@ -553,17 +521,16 @@ class DeepSpeedZeroOptimizer(ZeROOptimizer): partition_id = dist.get_rank(group=self.real_dp_process_group[i]) partition_size = self.bit16_groups_flat[i].numel() // dp_world_size flat_hp_partition = self.single_partition_of_fp32_groups[i] - link_hp_params( - lp_param_list=self.bit16_groups[i], - flat_hp_partition=flat_hp_partition, - gradient_dict=self.averaged_gradients, - offload_gradient_dict=self.offload_gradient_dict, - use_offload=self.cpu_offload, - param_group_index=i, - partition_start=partition_id * partition_size, - partition_size=partition_size, - partition_optimizer_state=self.optimizer.state[flat_hp_partition], - dp_group=self.real_dp_process_group[i]) + link_hp_params(lp_param_list=self.bit16_groups[i], + flat_hp_partition=flat_hp_partition, + gradient_dict=self.averaged_gradients, + offload_gradient_dict=self.offload_gradient_dict, + use_offload=self.cpu_offload, + param_group_index=i, + partition_start=partition_id * partition_size, + partition_size=partition_size, + partition_optimizer_state=self.optimizer.state[flat_hp_partition], + dp_group=self.real_dp_process_group[i]) def is_moe_group(self, group): return 'moe' in group and group['moe'] @@ -575,19 +542,19 @@ class DeepSpeedZeroOptimizer(ZeROOptimizer): # NOTE: To run ZeRO stage 1 with MoE, we need to set self.contiguous_gradients to True or ignore the assertion if not self.partition_gradients and not self.contiguous_gradients: logger.warn( - "ZeRO Stage 1 has not been thoroughly tested with MoE. This configuration is still experimental." - ) + "ZeRO Stage 1 has not been thoroughly tested with MoE. This configuration is still experimental.") assert self.reduce_scatter, "Reduce Scatter in ZeRO Stage 2 must be set to True for MoE. Other code paths are not tested with MoE" - assert any([self.is_moe_group(group) for group in self.optimizer.param_groups]), "The model has moe layers, but None of the param groups are marked as MoE. Create a param group with 'moe' key set to True before creating optimizer" + assert any( + [self.is_moe_group(group) for group in self.optimizer.param_groups] + ), "The model has moe layers, but None of the param groups are marked as MoE. 
Create a param group with 'moe' key set to True before creating optimizer" self.is_moe_param_group = [] for i, group in enumerate(self.optimizer.param_groups): if self.is_moe_group(group): - assert all([is_moe_param(param) for param in group['params']]), "All params in MoE group must be MoE params" - self.real_dp_process_group[i] = self.expert_dp_process_group[ - group['name']] - self.partition_count[i] = dist.get_world_size( - group=self.expert_dp_process_group[group['name']]) + assert all([is_moe_param(param) + for param in group['params']]), "All params in MoE group must be MoE params" + self.real_dp_process_group[i] = self.expert_dp_process_group[group['name']] + self.partition_count[i] = dist.get_world_size(group=self.expert_dp_process_group[group['name']]) self.is_moe_param_group.append(True) else: self.is_moe_param_group.append(False) @@ -638,14 +605,19 @@ class DeepSpeedZeroOptimizer(ZeROOptimizer): def initialize_optimizer_states(self): for i, group in enumerate(self.bit16_groups): - single_grad_partition = torch.zeros( - int(self.partition_size[i]), - dtype=self.single_partition_of_fp32_groups[i].dtype, - device=self.device) + single_grad_partition = torch.zeros(int(self.partition_size[i]), + dtype=self.single_partition_of_fp32_groups[i].dtype, + device=self.device) self.single_partition_of_fp32_groups[i].grad = get_accelerator().pin_memory( single_grad_partition) if self.cpu_offload else single_grad_partition - self.optimizer.step() + # Initialize the optimizer states with the flattended fp32 partition. + # State initialization for the Adagrad optimizer occurs at construction as opposed to other optimizers + # which do lazy initialization of the state at the first call to step. + if isinstance(self.optimizer, torch.optim.Adagrad): + self.optimizer = torch.optim.Adagrad(self.single_partition_of_fp32_groups, **self.optimizer.defaults) + else: + self.optimizer.step() if not self.cpu_offload: for group in self.single_partition_of_fp32_groups: @@ -709,11 +681,8 @@ class DeepSpeedZeroOptimizer(ZeROOptimizer): self.total_grads_in_partition[i][partition_id] = 0 self.initialize_gradient_partition(i, param_group, partition_id) self.is_partition_reduced[i][partition_id] = False - self.first_param_index_in_partition[i][ - partition_id] = self.get_first_param_index( - i, - param_group, - partition_id) + self.first_param_index_in_partition[i][partition_id] = self.get_first_param_index( + i, param_group, partition_id) def independent_gradient_partition_epilogue(self): self.report_ipg_memory_usage(f"In ipg_epilogue before reduce_ipg_grads", 0) @@ -742,13 +711,12 @@ class DeepSpeedZeroOptimizer(ZeROOptimizer): device=get_accelerator().current_device_name(), return_tensor_list=True) else: - avg_new = self.get_flat_partition( - self.params_in_partition[i], - self.first_offset[i], - self.partition_size[i], - dtype=self.dtype, - device=get_accelerator().current_device_name(), - return_tensor_list=True) + avg_new = self.get_flat_partition(self.params_in_partition[i], + self.first_offset[i], + self.partition_size[i], + dtype=self.dtype, + device=get_accelerator().current_device_name(), + return_tensor_list=True) for accumulated_grad, new_avg_grad in zip(self.averaged_gradients[i], avg_new): accumulated_grad.add_(new_avg_grad) @@ -769,13 +737,13 @@ class DeepSpeedZeroOptimizer(ZeROOptimizer): total_partitions = dist.get_world_size(group=self.real_dp_process_group[i]) for partition_id in range(total_partitions): self.is_partition_reduced[i][partition_id] = False - self.remaining_grads_in_partition[i][ - 
partition_id] = self.total_grads_in_partition[i][partition_id] + self.remaining_grads_in_partition[i][partition_id] = self.total_grads_in_partition[i][partition_id] for param_id in self.is_grad_computed[i][partition_id]: self.is_grad_computed[i][partition_id][param_id] = False def initialize_gradient_partition(self, i, param_group, partition_id): + def set_key_value_list(dictionary, key, value): if key in dictionary: dictionary[key].append(value) @@ -802,25 +770,20 @@ class DeepSpeedZeroOptimizer(ZeROOptimizer): param_id = self.get_param_id(param) if (current_index >= start_index and current_index < end_index): - set_key_value_list(self.param_to_partition_ids[i], - param_id, - partition_id) + set_key_value_list(self.param_to_partition_ids[i], param_id, partition_id) increment_value(self.total_grads_in_partition[i], partition_id) self.is_grad_computed[i][partition_id][param_id] = False - self.grad_partition_insertion_offset[i][partition_id][ - param_id] = current_index - start_index + self.grad_partition_insertion_offset[i][partition_id][param_id] = current_index - start_index self.grad_start_offset[i][partition_id][param_id] = 0 - elif start_index > current_index and start_index < (current_index + - param_size): - assert (first_offset == 0), "This can happen either zero or only once as this must be the first tensor in the partition" + elif start_index > current_index and start_index < (current_index + param_size): + assert (first_offset == 0 + ), "This can happen either zero or only once as this must be the first tensor in the partition" first_offset = start_index - current_index - set_key_value_list(self.param_to_partition_ids[i], - param_id, - partition_id) + set_key_value_list(self.param_to_partition_ids[i], param_id, partition_id) increment_value(self.total_grads_in_partition[i], partition_id) self.is_grad_computed[i][partition_id][param_id] = False @@ -869,14 +832,12 @@ class DeepSpeedZeroOptimizer(ZeROOptimizer): ############### Independent Partition Gradient ######################## def reduce_independent_p_g_buckets_and_remove_grads(self, param, i): if self.elements_in_ipg_bucket + param.numel() > self.reduce_bucket_size: - self.report_ipg_memory_usage("In ipg_remove_grads before reduce_ipg_grads", - param.numel()) + self.report_ipg_memory_usage("In ipg_remove_grads before reduce_ipg_grads", param.numel()) self.reduce_ipg_grads() if self.contiguous_gradients and self.overlap_comm: # Swap ipg_index between 0 and 1 self.ipg_index = 1 - self.ipg_index - self.report_ipg_memory_usage("In ipg_remove_grads after reduce_ipg_grads", - param.numel()) + self.report_ipg_memory_usage("In ipg_remove_grads after reduce_ipg_grads", param.numel()) param_id = self.get_param_id(param) assert self.params_already_reduced[param_id] == False, \ @@ -884,17 +845,14 @@ class DeepSpeedZeroOptimizer(ZeROOptimizer): Gradient computed twice for this partition. 
\ Multiple gradient reduction is currently not supported" - if param.numel() > self.reduce_bucket_size: - self.extra_large_param_to_reduce = param - - elif self.contiguous_gradients: - # keeping the gradients contiguous to prevent memory fragmentation, and avoid flattening - new_grad_tensor = self.ipg_buffer[self.ipg_index].narrow( - 0, - self.elements_in_ipg_bucket, - param.numel()) - new_grad_tensor.copy_(param.grad.view(-1)) - param.grad.data = new_grad_tensor.data.view_as(param.grad) + if self.contiguous_gradients: + if param.numel() > self.reduce_bucket_size: + self.extra_large_param_to_reduce = param + else: + # keeping the gradients contiguous to prevent memory fragmentation, and avoid flattening + new_grad_tensor = self.ipg_buffer[self.ipg_index].narrow(0, self.elements_in_ipg_bucket, param.numel()) + new_grad_tensor.copy_(param.grad.view(-1)) + param.grad.data = new_grad_tensor.data.view_as(param.grad) self.elements_in_ipg_bucket += param.numel() @@ -969,13 +927,13 @@ class DeepSpeedZeroOptimizer(ZeROOptimizer): #Otherwise averaging is done at the entire buffer level at the end of the loop # MoE param have different groups if self.ipg_bucket_has_moe_params: - process_group = self.expert_dp_process_group[ - param.group_name] if is_moe_param( - param) else self.dp_process_group + process_group = self.expert_dp_process_group[param.group_name] if is_moe_param( + param) else self.dp_process_group param.grad.data.div_(dist.get_world_size(group=process_group)) partition_ids = self.param_to_partition_ids[i][param_id] - assert all([p_id < dist.get_world_size(group=process_group) for p_id in partition_ids]), f"world size {dist.get_world_size(group=process_group)} and p_ids: {partition_ids}" + assert all([p_id < dist.get_world_size(group=process_group) for p_id in partition_ids + ]), f"world size {dist.get_world_size(group=process_group)} and p_ids: {partition_ids}" partition_size = self.partition_size[i] # Get all partition ids + their offsets partition_ids_w_offsets = [] @@ -1025,10 +983,7 @@ class DeepSpeedZeroOptimizer(ZeROOptimizer): # dist.barrier() #dist.barrier() dst_rank = dist.get_global_rank(real_dp_process_group[i], dst) - async_handle = dist.reduce(grad_slice, - dst=dst_rank, - group=real_dp_process_group[i], - async_op=True) + async_handle = dist.reduce(grad_slice, dst=dst_rank, group=real_dp_process_group[i], async_op=True) async_handles.append(async_handle) for handle in async_handles: @@ -1060,10 +1015,8 @@ class DeepSpeedZeroOptimizer(ZeROOptimizer): num_elements = partition_size - current_offset self.grad_position[param_id] = [ - int(group_id), - int(param_start_offset), - int(current_offset), - int(num_elements) + int(group_id), int(param_start_offset), + int(current_offset), int(num_elements) ] current_offset += num_elements @@ -1077,10 +1030,8 @@ class DeepSpeedZeroOptimizer(ZeROOptimizer): for lp_param in self.params_in_partition[param_group_index]: param_id = self.get_param_id(lp_param) [_, _, dest_offset, num_elements] = self.grad_position[param_id] - dest_tensor = self.single_partition_of_fp32_groups[ - param_group_index].grad.view(-1).narrow(0, - dest_offset, - num_elements) + dest_tensor = self.single_partition_of_fp32_groups[param_group_index].grad.view(-1).narrow( + 0, dest_offset, num_elements) self.offload_gradient_dict[param_group_index].append(dest_tensor) def async_accumulate_grad_in_cpu_via_gpu(self, param): @@ -1089,55 +1040,35 @@ class DeepSpeedZeroOptimizer(ZeROOptimizer): [i, source_offset, dest_offset, num_elements] = self.grad_position[param_id] # 
copy to a preexisiting buffer to avoid memory allocation penalty - dest_buffer = self.temp_grad_buffer_for_gpu_offload.view(-1).narrow( - 0, - 0, - param.numel()) + dest_buffer = self.temp_grad_buffer_for_gpu_offload.view(-1).narrow(0, 0, param.numel()) #buffer for storing gradients for this parameter in CPU def buffer_to_accumulate_to_in_cpu(): if not self.fp16_master_weights_and_gradients: - return get_accelerator().pin_memory( - torch.zeros(param.numel(), - dtype=param.dtype, - device=self.device)) + return get_accelerator().pin_memory(torch.zeros(param.numel(), dtype=param.dtype, device=self.device)) else: - return self.single_partition_of_fp32_groups[i].grad.view(-1).narrow( - 0, - dest_offset, - num_elements) + return self.single_partition_of_fp32_groups[i].grad.view(-1).narrow(0, dest_offset, num_elements) #accumulate gradients into param.grad or parts of it that belongs to this partition def accumulate_gradients(): if not self.fp16_master_weights_and_gradients: - dest_buffer.copy_(self.accumulated_grads_in_cpu[param_id].view(-1), - non_blocking=True) + dest_buffer.copy_(self.accumulated_grads_in_cpu[param_id].view(-1), non_blocking=True) param.grad.data.view(-1).add_(dest_buffer) else: - dest_buffer.narrow(0, - source_offset, - num_elements).copy_( - self.accumulated_grads_in_cpu[param_id].view(-1), - non_blocking=True) - param.grad.data.view(-1).narrow( - 0, - source_offset, - num_elements).add_(dest_buffer.narrow(0, - source_offset, - num_elements)) + dest_buffer.narrow(0, source_offset, + num_elements).copy_(self.accumulated_grads_in_cpu[param_id].view(-1), + non_blocking=True) + param.grad.data.view(-1).narrow(0, source_offset, + num_elements).add_(dest_buffer.narrow(0, source_offset, num_elements)) #move accumulated gradients back to CPU def copy_gradients_to_cpu(): if not self.fp16_master_weights_and_gradients: - self.accumulated_grads_in_cpu[param_id].data.copy_( - param.grad.data.view(-1), - non_blocking=True) + self.accumulated_grads_in_cpu[param_id].data.copy_(param.grad.data.view(-1), non_blocking=True) else: - self.accumulated_grads_in_cpu[param_id].data.copy_( - param.grad.data.view(-1).narrow(0, - source_offset, - num_elements), - non_blocking=True) + self.accumulated_grads_in_cpu[param_id].data.copy_(param.grad.data.view(-1).narrow( + 0, source_offset, num_elements), + non_blocking=True) if param_id not in self.accumulated_grads_in_cpu: self.accumulated_grads_in_cpu[param_id] = buffer_to_accumulate_to_in_cpu() @@ -1177,10 +1108,7 @@ class DeepSpeedZeroOptimizer(ZeROOptimizer): [i, source_offset, dest_offset, num_elements] = self.grad_position[param_id] - dest_tensor = self.single_partition_of_fp32_groups[i].grad.view(-1).narrow( - 0, - dest_offset, - num_elements) + dest_tensor = self.single_partition_of_fp32_groups[i].grad.view(-1).narrow(0, dest_offset, num_elements) src_tensor = param.grad.view(-1).narrow(0, source_offset, num_elements) if not self.fp16_master_weights_and_gradients: @@ -1220,16 +1148,13 @@ class DeepSpeedZeroOptimizer(ZeROOptimizer): # Sum across all model parallel GPUs. total_norm_cuda = get_accelerator().FloatTensor([float(total_norm)]) - dist.all_reduce(total_norm_cuda, - op=dist.ReduceOp.SUM, - group=self.dp_process_group) + dist.all_reduce(total_norm_cuda, op=dist.ReduceOp.SUM, group=self.dp_process_group) self._model_parallel_all_reduce(tensor=total_norm_cuda, op=dist.ReduceOp.SUM) total_norm = total_norm_cuda[0].item()**(1. 
/ norm_type) - if total_norm == float( - 'inf') or total_norm == -float('inf') or total_norm != total_norm: + if total_norm == float('inf') or total_norm == -float('inf') or total_norm != total_norm: total_norm = -1 return total_norm @@ -1258,17 +1183,13 @@ class DeepSpeedZeroOptimizer(ZeROOptimizer): total_size += param_in_partition.numel() see_memory_usage(f"before copying {total_size} gradients into partition") - self.grads_in_partition = torch.empty( - int(total_size), - dtype=self.dtype, - device=get_accelerator().current_device_name()) + self.grads_in_partition = torch.empty(int(total_size), + dtype=self.dtype, + device=get_accelerator().current_device_name()) see_memory_usage(f"after copying {total_size} gradients into partition") # The allreduce buffer will be rewritten. Copy the gradients in partition to a new buffer - new_grad_tensor = self.grads_in_partition.view(-1).narrow( - 0, - self.grads_in_partition_offset, - param.numel()) + new_grad_tensor = self.grads_in_partition.view(-1).narrow(0, self.grads_in_partition_offset, param.numel()) new_grad_tensor.copy_(param.grad.view(-1)) param.grad.data = new_grad_tensor.data.view_as(param.grad) #print(f"Grad norm after copy to contiguous_buffer {param.grad.data.norm()}") @@ -1279,17 +1200,16 @@ class DeepSpeedZeroOptimizer(ZeROOptimizer): if self.extra_large_param_to_reduce is not None: assert len(self.params_in_ipg_bucket) == 1, "more than 1 param in ipg bucket, this shouldn't happen" _, _, param_id = self.params_in_ipg_bucket[0] - assert self.get_param_id( - self.extra_large_param_to_reduce) == param_id, "param in ipg bucket does not match extra-large param" + assert self.get_param_id(self.extra_large_param_to_reduce + ) == param_id, "param in ipg bucket does not match extra-large param" self.average_tensor(self.extra_large_param_to_reduce.grad.view(-1)) self.extra_large_param_to_reduce = None else: self.average_tensor(self.ipg_buffer[self.ipg_index]) else: - self.buffered_reduce_fallback( - None, - self.grads_in_ipg_bucket, - elements_per_buffer=self.elements_in_ipg_bucket) + self.buffered_reduce_fallback(None, + self.grads_in_ipg_bucket, + elements_per_buffer=self.elements_in_ipg_bucket) if self.overlap_comm: stream = self.reduction_stream @@ -1324,8 +1244,7 @@ class DeepSpeedZeroOptimizer(ZeROOptimizer): elif self.contiguous_gradients: self.copy_grads_in_partition(param) else: # zero stage 1 - partition only optimizer state - if self.contiguous_gradients and self.is_param_in_current_partition[ - param_id]: + if self.contiguous_gradients and self.is_param_in_current_partition[param_id]: self.copy_grads_in_partition(param) self.grads_in_ipg_bucket = [] @@ -1339,6 +1258,7 @@ class DeepSpeedZeroOptimizer(ZeROOptimizer): self.reduce_independent_p_g_buckets_and_remove_grads(param, i) def zero_reduced_gradients(self, partition_id, i): + def are_all_related_partitions_reduced(params_id): for partition_id in self.param_to_partition_ids[i][params_id]: if not self.is_partition_reduced[i][partition_id]: @@ -1358,29 +1278,23 @@ class DeepSpeedZeroOptimizer(ZeROOptimizer): self.sequential_execution(print_func, message) def get_grads_to_reduce(self, i, partition_id): + def get_reducible_portion(key): grad = self.param_dict[key].grad total_elements = grad.numel() start = self.grad_start_offset[i][partition_id][key] - num_elements = min( - total_elements - start, - self.partition_size[i] - - self.grad_partition_insertion_offset[i][partition_id][key]) + num_elements = min(total_elements - start, + self.partition_size[i] - 
self.grad_partition_insertion_offset[i][partition_id][key]) if not pg_correctness_test: if num_elements == total_elements: return grad else: - return grad.contiguous().view(-1).narrow(0, - int(start), - int(num_elements)) + return grad.contiguous().view(-1).narrow(0, int(start), int(num_elements)) else: if num_elements == total_elements: return grad.clone() else: - return grad.clone().contiguous().view(-1).narrow( - 0, - int(start), - int(num_elements)) + return grad.clone().contiguous().view(-1).narrow(0, int(start), int(num_elements)) grads_to_reduce = [] for key in self.is_grad_computed[i][partition_id]: @@ -1456,11 +1370,7 @@ class DeepSpeedZeroOptimizer(ZeROOptimizer): for buf, synced in zip(small_bucket, self.unflatten(allreduced, small_bucket)): buf.copy_(synced) - def allreduce_no_retain(self, - bucket, - numel_per_bucket=500000000, - rank=None, - log=None): + def allreduce_no_retain(self, bucket, numel_per_bucket=500000000, rank=None, log=None): small_bucket = [] numel = 0 for tensor in bucket: @@ -1475,18 +1385,11 @@ class DeepSpeedZeroOptimizer(ZeROOptimizer): # allows using reduction of gradients instead of using all_reduce - def buffered_reduce_fallback(self, - rank, - grads, - elements_per_buffer=500000000, - log=None): + def buffered_reduce_fallback(self, rank, grads, elements_per_buffer=500000000, log=None): split_buckets = split_half_float_double(grads) for i, bucket in enumerate(split_buckets): - self.allreduce_no_retain(bucket, - numel_per_bucket=elements_per_buffer, - rank=rank, - log=log) + self.allreduce_no_retain(bucket, numel_per_bucket=elements_per_buffer, rank=rank, log=log) ############################################################################# ############################################################################# @@ -1531,11 +1434,11 @@ class DeepSpeedZeroOptimizer(ZeROOptimizer): if (current_index >= start_index and current_index < end_index): params_in_partition.append(tensor) - elif start_index > current_index and start_index < (current_index + - tensor_size): + elif start_index > current_index and start_index < (current_index + tensor_size): params_in_partition.append(tensor) - assert (first_offset == 0), "This can happen either zero or only once as this must be the first tensor in the partition" + assert (first_offset == 0 + ), "This can happen either zero or only once as this must be the first tensor in the partition" first_offset = start_index - current_index else: @@ -1589,9 +1492,7 @@ class DeepSpeedZeroOptimizer(ZeROOptimizer): if norm_type == inf: total_norm = max(g.data.abs().max() for g in gradients) total_norm_cuda = get_accelerator().FloatTensor([float(total_norm)]) - dist.all_reduce(total_norm_cuda, - op=dist.ReduceOp.MAX, - group=self.dp_process_group) + dist.all_reduce(total_norm_cuda, op=dist.ReduceOp.MAX, group=self.dp_process_group) # Take max across all GPUs. self._model_parallel_all_reduce(tensor=total_norm_cuda, op=dist.ReduceOp.MAX) @@ -1609,16 +1510,13 @@ class DeepSpeedZeroOptimizer(ZeROOptimizer): total_norm += param_norm.item()**2 # Sum across all model parallel GPUs. total_norm_cuda = get_accelerator().FloatTensor([float(total_norm)]) - dist.all_reduce(total_norm_cuda, - op=dist.ReduceOp.SUM, - group=self.dp_process_group) + dist.all_reduce(total_norm_cuda, op=dist.ReduceOp.SUM, group=self.dp_process_group) self._model_parallel_all_reduce(tensor=total_norm_cuda, op=dist.ReduceOp.SUM) total_norm = total_norm_cuda[0].item()**(1. 
/ norm_type) - if total_norm == float( - 'inf') or total_norm == -float('inf') or total_norm != total_norm: + if total_norm == float('inf') or total_norm == -float('inf') or total_norm != total_norm: total_norm = -1 return total_norm @@ -1626,13 +1524,7 @@ class DeepSpeedZeroOptimizer(ZeROOptimizer): # creates a flat fused tensor from the tensor list starting at the first_offset # in the first tensor of the list. If there are not enough elements in the tensor # list then the flat tensor will be padded with zeros - def get_flat_partition(self, - tensor_list, - first_offset, - partition_size, - dtype, - device, - return_tensor_list=False): + def get_flat_partition(self, tensor_list, first_offset, partition_size, dtype, device, return_tensor_list=False): flat_tensor_list = [] current_size = 0 for i, tensor in enumerate(tensor_list): @@ -1655,10 +1547,7 @@ class DeepSpeedZeroOptimizer(ZeROOptimizer): # we need a narrow view of the tensor based on the tensor offset and number of elements that # we need from this tensor if tensor_offset > 0 or num_elements < tensor.numel(): - flat_tensor_list.append(tensor.contiguous().view(-1).narrow( - 0, - int(tensor_offset), - int(num_elements))) + flat_tensor_list.append(tensor.contiguous().view(-1).narrow(0, int(tensor_offset), int(num_elements))) else: flat_tensor_list.append(tensor) @@ -1666,10 +1555,7 @@ class DeepSpeedZeroOptimizer(ZeROOptimizer): # this means its the last partition and does not align with the dp boundary. We need to pad before flattening if current_size < partition_size: - flat_tensor_list.append( - torch.zeros(int(partition_size - current_size), - dtype=dtype, - device=device)) + flat_tensor_list.append(torch.zeros(int(partition_size - current_size), dtype=dtype, device=device)) if return_tensor_list: return flat_tensor_list @@ -1715,9 +1601,7 @@ class DeepSpeedZeroOptimizer(ZeROOptimizer): def override_loss_scale(self, loss_scale): if loss_scale != self.external_loss_scale: - logger.info( - f'[deepspeed] setting loss scale from {self.external_loss_scale} -> {loss_scale}' - ) + logger.info(f'[deepspeed] setting loss scale from {self.external_loss_scale} -> {loss_scale}') self.custom_loss_scaler = True self.external_loss_scale = loss_scale @@ -1727,14 +1611,10 @@ class DeepSpeedZeroOptimizer(ZeROOptimizer): for i, group in enumerate(self.bit16_groups): partition_id = dist.get_rank(group=self.real_dp_process_group[i]) if self.cpu_offload: - norm_groups.append( - self.complete_grad_norm_calculation_for_cpu_offload( - self.params_in_partition[i])) + norm_groups.append(self.complete_grad_norm_calculation_for_cpu_offload(self.params_in_partition[i])) single_grad_partition = self.single_partition_of_fp32_groups[i].grad else: - norm_groups.append( - self.get_grad_norm_direct(self.averaged_gradients[i], - self.params_in_partition[i])) + norm_groups.append(self.get_grad_norm_direct(self.averaged_gradients[i], self.params_in_partition[i])) if self.has_moe_layers: self._average_expert_grad_norms(norm_groups) @@ -1745,18 +1625,18 @@ class DeepSpeedZeroOptimizer(ZeROOptimizer): def get_bit16_param_group(self, group_no): bit16_partitions = self.parallel_partitioned_bit16_groups[group_no] partition_id = dist.get_rank(group=self.real_dp_process_group[group_no]) - return [ - bit16_partitions[dist.get_rank(group=self.real_dp_process_group[group_no])] - ] + return [bit16_partitions[dist.get_rank(group=self.real_dp_process_group[group_no])]] def _optimizer_step(self, group_no): original_param_groups = self.optimizer.param_groups 
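# Illustrative sketch (plain PyTorch, not part of this patch): why the
# initialize_optimizer_states() hunk above rebuilds torch.optim.Adagrad over the
# fp32 partitions instead of calling step(). Adagrad allocates its per-parameter
# state eagerly at construction, while optimizers such as Adam create state
# lazily on the first step(), so only the latter need a dummy step() here.
import torch

p = torch.nn.Parameter(torch.zeros(4))

adagrad = torch.optim.Adagrad([p], lr=0.01)
print(len(adagrad.state))   # 1 -> per-param state exists right after construction

adam = torch.optim.Adam([p], lr=0.01)
print(len(adam.state))      # 0 -> no state until the first step()
p.grad = torch.zeros_like(p)
adam.step()
print(len(adam.state))      # 1 -> state materialized lazily by step()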
self.optimizer.param_groups = [original_param_groups[group_no]] - from deepspeed.ops.adam import DeepSpeedCPUAdam - if type(self.optimizer) == DeepSpeedCPUAdam and self.dtype == torch.half: - self.optimizer.step(fp16_param_groups=[self.get_bit16_param_group(group_no)]) - else: - self.optimizer.step() + # Disabling this as the C++ side copy & synchornize is not working correctly + #from deepspeed.ops.adam import DeepSpeedCPUAdam + #if type(self.optimizer) == DeepSpeedCPUAdam and self.dtype == torch.half: + # self.optimizer.step(fp16_param_groups=[self.get_bit16_param_group(group_no)]) + #else: + # self.optimizer.step() + self.optimizer.step() self.optimizer.param_groups = original_param_groups def step(self, closure=None): @@ -1777,12 +1657,6 @@ class DeepSpeedZeroOptimizer(ZeROOptimizer): prev_scale = self.loss_scale self._update_scale(self.overflow) if self.overflow: - if dist.get_rank() == 0: - overflow_msg = f"[deepspeed] OVERFLOW! Rank {dist.get_rank()} Skipping step." - if self.dtype == torch.half: - overflow_msg += f" Attempted loss scale: {prev_scale}, reducing to {self.loss_scale}" - logger.info(overflow_msg) - see_memory_usage('After overflow before clearing gradients') self.zero_grad(set_to_none=True) if self.cpu_offload: @@ -1797,29 +1671,34 @@ class DeepSpeedZeroOptimizer(ZeROOptimizer): return # Step 1:- Calculate gradient norm using fp-16 grads - see_memory_usage('Before norm calculation') - scaled_global_grad_norm = self.scaled_global_norm() - self._global_grad_norm = scaled_global_grad_norm / prev_scale + if self.dtype == torch.float16: + see_memory_usage('Before norm calculation') + scaled_global_grad_norm = self.scaled_global_norm() + self._global_grad_norm = scaled_global_grad_norm / prev_scale + see_memory_usage('After norm before optimizer') - see_memory_usage('After norm before optimizer') # Step 2:- run optimizer and upscaling simultaneously for i, group in enumerate(self.bit16_groups): self.start_timers([OPTIMIZER_GRADIENTS]) partition_id = dist.get_rank(group=self.real_dp_process_group[i]) if self.cpu_offload: single_grad_partition = self.single_partition_of_fp32_groups[i].grad - self.unscale_and_clip_grads([single_grad_partition], - scaled_global_grad_norm) + if self.dtype == torch.float16: + self.unscale_and_clip_grads([single_grad_partition], scaled_global_grad_norm) + self.stop_timers([OPTIMIZER_GRADIENTS]) self.start_timers([OPTIMIZER_STEP]) self._optimizer_step(i) - from deepspeed.ops.adam import DeepSpeedCPUAdam - if not (type(self.optimizer) == DeepSpeedCPUAdam - and self.dtype == torch.half): - bit16_partitions = self.parallel_partitioned_bit16_groups[i] - fp32_partition = self.single_partition_of_fp32_groups[i] - bit16_partitions[partition_id].data.copy_(fp32_partition.data) + # Disabled, this is not currently working + #from deepspeed.ops.adam import DeepSpeedCPUAdam + #if not (type(self.optimizer) == DeepSpeedCPUAdam and self.dtype == torch.half): + # bit16_partitions = self.parallel_partitioned_bit16_groups[i] + # fp32_partition = self.single_partition_of_fp32_groups[i] + # bit16_partitions[partition_id].data.copy_(fp32_partition.data) + bit16_partitions = self.parallel_partitioned_bit16_groups[i] + fp32_partition = self.single_partition_of_fp32_groups[i] + bit16_partitions[partition_id].data.copy_(fp32_partition.data) self.stop_timers([OPTIMIZER_STEP]) else: @@ -1828,12 +1707,10 @@ class DeepSpeedZeroOptimizer(ZeROOptimizer): # create a flat gradients for parameters updated by this process # If we are last partition, ensure we have same size grads and 
partition size, if not pad with zero tensors - if partition_id == dist.get_world_size( - group=self.real_dp_process_group[i]) - 1: + if partition_id == dist.get_world_size(group=self.real_dp_process_group[i]) - 1: single_grad_partition = self.flatten_dense_tensors_aligned( self.averaged_gradients[i], - int(self.partition_size[i])).to( - self.single_partition_of_fp32_groups[i].dtype) + int(self.partition_size[i])).to(self.single_partition_of_fp32_groups[i].dtype) else: single_grad_partition = self.flatten(self.averaged_gradients[i]).to( self.single_partition_of_fp32_groups[i].dtype) @@ -1847,8 +1724,9 @@ class DeepSpeedZeroOptimizer(ZeROOptimizer): self.averaged_gradients[i] = None - self.unscale_and_clip_grads([single_grad_partition], - scaled_global_grad_norm) + if self.dtype == torch.float16: + self.unscale_and_clip_grads([single_grad_partition], scaled_global_grad_norm) + self.stop_timers([OPTIMIZER_GRADIENTS]) # Step 3:- run the optimizer if no offloading @@ -1869,11 +1747,10 @@ class DeepSpeedZeroOptimizer(ZeROOptimizer): self.start_timers([OPTIMIZER_ALLGATHER]) # Gather the updated weights from everyone. # Then all partitions of the model parameters are updated and ready for next round forward. - all_gather_dp_groups( - partitioned_param_groups=self.parallel_partitioned_bit16_groups, - dp_process_group=self.real_dp_process_group, - start_alignment_factor=self.nccl_start_alignment_factor, - allgather_bucket_size=self.allgather_bucket_size) + all_gather_dp_groups(partitioned_param_groups=self.parallel_partitioned_bit16_groups, + dp_process_group=self.real_dp_process_group, + start_alignment_factor=self.nccl_start_alignment_factor, + allgather_bucket_size=self.allgather_bucket_size) self.stop_timers([OPTIMIZER_ALLGATHER]) @@ -1888,24 +1765,23 @@ class DeepSpeedZeroOptimizer(ZeROOptimizer): @torch.no_grad() def update_lp_params(self): - for i, (bit16_partitions, fp32_partition) in enumerate(zip(self.parallel_partitioned_bit16_groups, self.single_partition_of_fp32_groups)): + for i, (bit16_partitions, fp32_partition) in enumerate( + zip(self.parallel_partitioned_bit16_groups, self.single_partition_of_fp32_groups)): partition_id = dist.get_rank(group=self.real_dp_process_group[i]) bit16_partitions[partition_id].data.copy_(fp32_partition.data) # print_rank_0(f'update_lp_params {i=} {partition_id=}', force=True) # if i == 0: # print_rank_0(f'{fp32_partition[:10]=}', force=True) - all_gather_dp_groups( - partitioned_param_groups=self.parallel_partitioned_bit16_groups, - dp_process_group=self.real_dp_process_group, - start_alignment_factor=self.nccl_start_alignment_factor, - allgather_bucket_size=self.allgather_bucket_size) + all_gather_dp_groups(partitioned_param_groups=self.parallel_partitioned_bit16_groups, + dp_process_group=self.real_dp_process_group, + start_alignment_factor=self.nccl_start_alignment_factor, + allgather_bucket_size=self.allgather_bucket_size) def _average_expert_grad_norms(self, norm_groups): for i, norm in enumerate(norm_groups): if self.is_moe_param_group[i]: - scaled_norm = norm * 1.0 / float( - dist.get_world_size(group=self.real_dp_process_group[i])) + scaled_norm = norm * 1.0 / float(dist.get_world_size(group=self.real_dp_process_group[i])) scaled_norm_tensor = torch.tensor(scaled_norm, device=get_accelerator().device_name(), dtype=torch.float) @@ -1949,14 +1825,11 @@ class DeepSpeedZeroOptimizer(ZeROOptimizer): def has_overflow(self, partition_gradients=True): if partition_gradients: - overflow = self.local_overflow if self.cpu_offload else 
self.has_overflow_partitioned_grads_serial( - ) + overflow = self.local_overflow if self.cpu_offload else self.has_overflow_partitioned_grads_serial() overflow_gpu = get_accelerator().ByteTensor([overflow]) '''This will capture overflow across all data parallel and expert parallel process Since expert parallel process are a subset of data parallel process''' - dist.all_reduce(overflow_gpu, - op=dist.ReduceOp.MAX, - group=self.dp_process_group) + dist.all_reduce(overflow_gpu, op=dist.ReduceOp.MAX, group=self.dp_process_group) else: params = [] @@ -2093,9 +1966,7 @@ class DeepSpeedZeroOptimizer(ZeROOptimizer): optimizer_groups_state = [] for i, group in enumerate(self.optimizer.param_groups): p = group['params'][0] - lean_optimizer_state = self._get_state_without_padding( - self.optimizer.state[p], - self.groups_padding[i]) + lean_optimizer_state = self._get_state_without_padding(self.optimizer.state[p], self.groups_padding[i]) optimizer_groups_state.append(lean_optimizer_state) return optimizer_groups_state @@ -2123,8 +1994,7 @@ class DeepSpeedZeroOptimizer(ZeROOptimizer): state_dict[BASE_OPTIMIZER_STATE] = self.optimizer.state_dict() # Remove paddings for DP alignment to enable loading for other alignment values - fp32_groups_without_padding = self._get_groups_without_padding( - self.single_partition_of_fp32_groups) + fp32_groups_without_padding = self._get_groups_without_padding(self.single_partition_of_fp32_groups) state_dict[SINGLE_PARTITION_OF_FP32_GROUPS] = fp32_groups_without_padding state_dict[ @@ -2146,17 +2016,13 @@ class DeepSpeedZeroOptimizer(ZeROOptimizer): for i in range(len(self.single_partition_of_fp32_groups)): partition_id = dist.get_rank(group=self.real_dp_process_group[i]) - merged_partitions = [ - sd[SINGLE_PARTITION_OF_FP32_GROUPS][i] for sd in all_state_dict - ] + merged_partitions = [sd[SINGLE_PARTITION_OF_FP32_GROUPS][i] for sd in all_state_dict] if self.is_moe_group(self.optimizer.param_groups[i]): - ranks = self.get_ep_ranks( - group_name=self.optimizer.param_groups[i]['name']) + ranks = self.get_ep_ranks(group_name=self.optimizer.param_groups[i]['name']) merged_partitions = [merged_partitions[i] for i in ranks] flat_merged_partitions = self.flatten_dense_tensors_aligned( merged_partitions, - self.nccl_start_alignment_factor * - dist.get_world_size(group=self.real_dp_process_group[i])) + self.nccl_start_alignment_factor * dist.get_world_size(group=self.real_dp_process_group[i])) dp_partitions = self.get_data_parallel_partitions(flat_merged_partitions, i) merged_single_partition_of_fp32_groups.append(dp_partitions[partition_id]) @@ -2165,7 +2031,8 @@ class DeepSpeedZeroOptimizer(ZeROOptimizer): # Restore base optimizer fp32 weights from ZeRO fp16 or bfloat16 weights def _restore_from_bit16_weights(self): - for group_id, (bit16_partitions, fp32_partition) in enumerate(zip(self.parallel_partitioned_bit16_groups, self.single_partition_of_fp32_groups)): + for group_id, (bit16_partitions, fp32_partition) in enumerate( + zip(self.parallel_partitioned_bit16_groups, self.single_partition_of_fp32_groups)): partition_id = dist.get_rank(group=self.real_dp_process_group[group_id]) fp32_partition.data.copy_(bit16_partitions[partition_id].data) @@ -2178,11 +2045,8 @@ class DeepSpeedZeroOptimizer(ZeROOptimizer): partition_id = dist.get_rank(group=self.real_dp_process_group[group_id]) alignment = dist.get_world_size(group=self.real_dp_process_group[group_id]) if torch.is_tensor(all_partition_states[0]): - flat_merged_partitions = self.flatten_dense_tensors_aligned( - 
all_partition_states, - alignment) - dp_partitions = self.get_data_parallel_partitions(flat_merged_partitions, - group_id) + flat_merged_partitions = self.flatten_dense_tensors_aligned(all_partition_states, alignment) + dp_partitions = self.get_data_parallel_partitions(flat_merged_partitions, group_id) return dp_partitions[partition_id] else: # Assume non-tensor states are not partitioned and equal across ranks, so return first one @@ -2217,25 +2081,15 @@ class DeepSpeedZeroOptimizer(ZeROOptimizer): base_optimizer_group_states = [] for i in range(len(self.optimizer.param_groups)): partition_states = {} - all_partition_group_states = [ - sd[BASE_OPTIMIZER_STATE][i] for sd in all_state_dict - ] + all_partition_group_states = [sd[BASE_OPTIMIZER_STATE][i] for sd in all_state_dict] if self.is_moe_group(self.optimizer.param_groups[i]): - ranks = self.get_ep_ranks( - group_name=self.optimizer.param_groups[i]['name']) - all_partition_group_states = [ - all_partition_group_states[i] for i in ranks - ] + ranks = self.get_ep_ranks(group_name=self.optimizer.param_groups[i]['name']) + all_partition_group_states = [all_partition_group_states[i] for i in ranks] for key in all_partition_group_states[0].keys(): - all_partition_states = [ - all_states[key] for all_states in all_partition_group_states - ] - partition_states[key] = self._partition_base_optimizer_state( - key, - all_partition_states, - i) + all_partition_states = [all_states[key] for all_states in all_partition_group_states] + partition_states[key] = self._partition_base_optimizer_state(key, all_partition_states, i) base_optimizer_group_states.append(partition_states) self._restore_base_optimizer_state(base_optimizer_group_states) @@ -2246,18 +2100,11 @@ class DeepSpeedZeroOptimizer(ZeROOptimizer): load_from_fp32_weights=False, checkpoint_folder=None): if checkpoint_folder: - self._load_universal_checkpoint(checkpoint_folder, - load_optimizer_states, - load_from_fp32_weights) + self._load_universal_checkpoint(checkpoint_folder, load_optimizer_states, load_from_fp32_weights) else: - self._load_legacy_checkpoint(state_dict_list, - load_optimizer_states, - load_from_fp32_weights) - - def _load_universal_checkpoint(self, - checkpoint_folder, - load_optimizer_states, - load_from_fp32_weights): + self._load_legacy_checkpoint(state_dict_list, load_optimizer_states, load_from_fp32_weights) + + def _load_universal_checkpoint(self, checkpoint_folder, load_optimizer_states, load_from_fp32_weights): self._load_hp_checkpoint_state(checkpoint_folder) @property @@ -2274,16 +2121,10 @@ class DeepSpeedZeroOptimizer(ZeROOptimizer): for lp in self.bit16_groups[i]: if lp._hp_mapping is not None: #print(f"Loading {self.param_names[lp]} {tp_rank=} {tp_world_size=}") - lp.load_hp_checkpoint_state( - os.path.join(checkpoint_dir, - self.param_names[lp]), - tp_rank, - tp_world_size) - - def _load_legacy_checkpoint(self, - state_dict_list, - load_optimizer_states=True, - load_from_fp32_weights=False): + lp.load_hp_checkpoint_state(os.path.join(checkpoint_dir, self.param_names[lp]), tp_rank, + tp_world_size) + + def _load_legacy_checkpoint(self, state_dict_list, load_optimizer_states=True, load_from_fp32_weights=False): r"""Loading ZeRO checkpoint Arguments: @@ -2314,8 +2155,7 @@ class DeepSpeedZeroOptimizer(ZeROOptimizer): dp_rank = dist.get_rank(group=self.dp_process_group) current_rank_sd = state_dict_list[dp_rank] self.loss_scaler = current_rank_sd.get('loss_scaler', self.loss_scaler) - self.dynamic_loss_scale = current_rank_sd.get('dynamic_loss_scale', - 
self.dynamic_loss_scale) + self.dynamic_loss_scale = current_rank_sd.get('dynamic_loss_scale', self.dynamic_loss_scale) self.overflow = current_rank_sd.get('overflow', self.overflow) self.clip_grad = current_rank_sd.get(CLIP_GRAD, self.clip_grad) @@ -2353,8 +2193,7 @@ class DeepSpeedZeroOptimizer(ZeROOptimizer): self._restore_elastic_base_optimizer_state(state_dict_list) else: # loading an elastic checkpoint into rigid exec - self._restore_base_optimizer_state( - current_rank_sd[BASE_OPTIMIZER_STATE]) + self._restore_base_optimizer_state(current_rank_sd[BASE_OPTIMIZER_STATE]) # At this point, the optimizer's references to the model's fp32 parameters are up to date. # The optimizer's hyperparameters and internal buffers are also up to date. @@ -2377,7 +2216,8 @@ class DeepSpeedZeroOptimizer(ZeROOptimizer): self._restore_from_elastic_fp32_weights(state_dict_list) else: # For non-elastic checkpoint, simply copying from saved weights of current rank is sufficient. - for current, saved in zip(self.single_partition_of_fp32_groups, current_rank_sd[SINGLE_PARTITION_OF_FP32_GROUPS]): + for current, saved in zip(self.single_partition_of_fp32_groups, + current_rank_sd[SINGLE_PARTITION_OF_FP32_GROUPS]): src_tensor = _get_padded_tensor(saved, current.numel()) current.data.copy_(src_tensor.data) else: @@ -2397,9 +2237,7 @@ def _handle_overflow(cpu_sum, x, i): if not math.isfinite(float(v)): t_i = v_i break - logger.info( - f"rank {rank} detected overflow {cpu_sum} in tensor {i}:{t_i} shape {x.shape}" - ) + logger.info(f"rank {rank} detected overflow {cpu_sum} in tensor {i}:{t_i} shape {x.shape}") def estimate_zero2_model_states_mem_needs(total_params, @@ -2422,9 +2260,7 @@ def estimate_zero2_model_states_mem_needs(total_params, def model_to_params(model): # shared params calculated only once - total_params = sum( - dict((p.data_ptr(), - p.numel()) for p in model.parameters()).values()) + total_params = sum(dict((p.data_ptr(), p.numel()) for p in model.parameters()).values()) return total_params @@ -2452,11 +2288,10 @@ def estimate_zero2_model_states_mem_needs_all_live(model, total_params = model_to_params(model) - estimate_zero2_model_states_mem_needs_all_cold( - total_params=total_params, - num_gpus_per_node=num_gpus_per_node, - num_nodes=num_nodes, - additional_buffer_factor=additional_buffer_factor) + estimate_zero2_model_states_mem_needs_all_cold(total_params=total_params, + num_gpus_per_node=num_gpus_per_node, + num_nodes=num_nodes, + additional_buffer_factor=additional_buffer_factor) def estimate_zero2_model_states_mem_needs_all_cold(total_params, @@ -2480,6 +2315,7 @@ def estimate_zero2_model_states_mem_needs_all_cold(total_params, - ``additional_buffer_factor``: estimation factor (defaults to 1.5): """ + def format_options(cpu_offload): enabled = [] device = f'{OffloadDeviceEnum.cpu:4}' if cpu_offload else "none" @@ -2488,19 +2324,16 @@ def estimate_zero2_model_states_mem_needs_all_cold(total_params, nodes_str = "nodes" if num_nodes > 1 else "node" gpus_str = "GPUs" if num_gpus_per_node > 1 else "GPU" - print( - "Estimated memory needed for params, optim states and gradients for a:\n" - f"HW: Setup with {num_nodes} {nodes_str}, {num_gpus_per_node} {gpus_str} per node.\n" - f"SW: Model with {int(total_params/1e6)}M total params.") + print("Estimated memory needed for params, optim states and gradients for a:\n" + f"HW: Setup with {num_nodes} {nodes_str}, {num_gpus_per_node} {gpus_str} per node.\n" + f"SW: Model with {int(total_params/1e6)}M total params.") print(" per CPU | per GPU | Options") 
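# Hedged usage sketch for the ZeRO-2 memory estimator whose output header is printed
# just above. Assumption: the function is importable from
# deepspeed.runtime.zero.stage_1_and_2 (the module this part of the patch touches);
# the model size below is only an example.
from deepspeed.runtime.zero.stage_1_and_2 import estimate_zero2_model_states_mem_needs_all_cold

# e.g. a ~1.3B-parameter model on one node with 8 GPUs; prints one line per
# cpu_offload setting with the estimated per-CPU and per-GPU memory in GB.
estimate_zero2_model_states_mem_needs_all_cold(total_params=1.3e9,
                                               num_gpus_per_node=8,
                                               num_nodes=1)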
for cpu_offload in [True, False]: - cpu_mem, gpu_mem = estimate_zero2_model_states_mem_needs( - total_params=total_params, - num_gpus_per_node=num_gpus_per_node, - num_nodes=num_nodes, - cpu_offload=cpu_offload, - additional_buffer_factor=additional_buffer_factor - ) + cpu_mem, gpu_mem = estimate_zero2_model_states_mem_needs(total_params=total_params, + num_gpus_per_node=num_gpus_per_node, + num_nodes=num_nodes, + cpu_offload=cpu_offload, + additional_buffer_factor=additional_buffer_factor) options_str = format_options(cpu_offload=cpu_offload) print(f" {cpu_mem/2**30:7.2f}GB | {gpu_mem/2**30:6.2f}GB | {options_str}") diff --git a/deepspeed/runtime/zero/test.py b/deepspeed/runtime/zero/test.py index d13c247b3de47a2ccc6681084e64ebe46d833ca0..1904a8d395be80a4322bc25bd1008a80bdd8f982 100644 --- a/deepspeed/runtime/zero/test.py +++ b/deepspeed/runtime/zero/test.py @@ -1,4 +1,7 @@ -'''Copyright The Microsoft DeepSpeed Team''' +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team import torch from deepspeed.runtime.zero.contiguous_memory_allocator import ContiguousMemoryAllocator diff --git a/deepspeed/runtime/zero/tiling.py b/deepspeed/runtime/zero/tiling.py index d66caea85ee76ef8c0ad84ac0228b22da3ec2f10..af1c5e54737a75cb66b39736718bf62169952a80 100644 --- a/deepspeed/runtime/zero/tiling.py +++ b/deepspeed/runtime/zero/tiling.py @@ -1,4 +1,7 @@ -'''Copyright The Microsoft DeepSpeed Team''' +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team import torch import deepspeed @@ -27,6 +30,7 @@ def split_tensor_along_last_dim(tensor, partitions, contiguous_split_chunks=Fals class TiledLinear(torch.nn.Module): + def __init__(self, in_features, out_features, @@ -114,10 +118,7 @@ class TiledLinear(torch.nn.Module): local_bias = bias if in_id == (in_splits - 1) else False local_in_dim = self.in_parts[in_id + 1] - self.in_parts[in_id] - local = linear_cls(local_in_dim, - local_out_dim, - bias=local_bias, - **kwargs) + local = linear_cls(local_in_dim, local_out_dim, bias=local_bias, **kwargs) self.linears[out_id].append(local) # Optionally initialize with a known tensor @@ -127,13 +128,12 @@ class TiledLinear(torch.nn.Module): def forward(self, input_): if self.in_splits > 1 and not self.input_is_already_split: input_parts = partition(input_.shape[-1], self.in_splits) - split_sizes = [ - input_parts[p + 1] - input_parts[p] for p in range(self.in_splits) - ] + split_sizes = [input_parts[p + 1] - input_parts[p] for p in range(self.in_splits)] inputs = self._split_global_input(input_, split_sizes) elif self.in_splits > 1: inputs = input_ - assert len(inputs) == self.in_splits, f"Col splits {self.in_splits} does not match input splits {len(inputs)}" + assert len( + inputs) == self.in_splits, f"Col splits {self.in_splits} does not match input splits {len(inputs)}" else: # no splits inputs = [input_] @@ -260,6 +260,7 @@ class TiledLinearReturnBias(TiledLinear): """Wrapper for a Linear class that returns its own bias parameter, such as used by Megatron-LM. """ + def _reduce_local_output(self, in_id, out_id, current_out, new_out): """Reduces output tensors, but not the returned bias. 
""" if current_out is not None: @@ -273,10 +274,7 @@ class TiledLinearReturnBias(TiledLinear): tensor, bias = new_out assert tensor is not None - tensor = super()._reduce_local_output(in_id=in_id, - out_id=out_id, - current_out=old_tensor, - new_out=tensor) + tensor = super()._reduce_local_output(in_id=in_id, out_id=out_id, current_out=old_tensor, new_out=tensor) if bias is None: bias = old_bias diff --git a/deepspeed/runtime/zero/utils.py b/deepspeed/runtime/zero/utils.py index eafddad3f35fa07be4b7df427ff716d60264ad6a..0250796f793deb76a17de72a313ff0fe45db7a95 100755 --- a/deepspeed/runtime/zero/utils.py +++ b/deepspeed/runtime/zero/utils.py @@ -1,4 +1,7 @@ -'''Copyright The Microsoft DeepSpeed Team''' +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team import os from typing import List @@ -7,6 +10,7 @@ import torch from deepspeed import comm as dist from deepspeed.utils import logger from deepspeed.ops.adam import DeepSpeedCPUAdam +from deepspeed.ops.adagrad import DeepSpeedCPUAdagrad from deepspeed.ops.adam import FusedAdam from deepspeed.utils.nvtx import instrument_w_nvtx from deepspeed.accelerator import get_accelerator @@ -15,9 +19,7 @@ from deepspeed.accelerator import get_accelerator def _initialize_parameter_parallel_groups(parameter_parallel_size=None): data_parallel_size = int(dist.get_world_size()) parameter_parallel_size = parameter_parallel_size or data_parallel_size - logger.info("data_parallel_size: %s, parameter_parallel_size: %s", - data_parallel_size, - parameter_parallel_size) + logger.info("data_parallel_size: %s, parameter_parallel_size: %s", data_parallel_size, parameter_parallel_size) assert data_parallel_size % parameter_parallel_size == 0, \ 'world size should be divisible by parameter parallel size' rank = dist.get_rank() @@ -35,10 +37,7 @@ class ZeRORuntimeException(Exception): ZERO_SUPPORTED_OPTIMIZERS = [ - torch.optim.Adam, - torch.optim.AdamW, - FusedAdam, - DeepSpeedCPUAdam + torch.optim.Adam, torch.optim.AdamW, FusedAdam, DeepSpeedCPUAdam, torch.optim.Adagrad, DeepSpeedCPUAdagrad ] # Add apex FusedAdam to supported list if apex is installed @@ -52,9 +51,7 @@ except ImportError: def is_zero_supported_optimizer(optimizer): if dist.get_rank() == 0: - logger.info( - f'Checking ZeRO support for optimizer={optimizer.__class__.__name__} type={type(optimizer)}' - ) + logger.info(f'Checking ZeRO support for optimizer={optimizer.__class__.__name__} type={type(optimizer)}') return type(optimizer) in ZERO_SUPPORTED_OPTIMIZERS diff --git a/deepspeed/utils/__init__.py b/deepspeed/utils/__init__.py index c48383c01c60df7043ddc0028c8e85e6c0065982..6af894bf8e62e376c3af0d03cc2572f90229c60f 100644 --- a/deepspeed/utils/__init__.py +++ b/deepspeed/utils/__init__.py @@ -1,4 +1,7 @@ -'''Copyright The Microsoft DeepSpeed Team''' +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team from .logging import logger, log_dist from .comms_logging import get_caller_func diff --git a/deepspeed/utils/comms_logging.py b/deepspeed/utils/comms_logging.py index d6b0410642a9670318538d5c8df5c8842a0853d5..2400fa55b20ed756290a3ea63344b6c1178d03ad 100644 --- a/deepspeed/utils/comms_logging.py +++ b/deepspeed/utils/comms_logging.py @@ -1,4 +1,7 @@ -'''Copyright The Microsoft DeepSpeed Team''' +# Copyright (c) Microsoft Corporation. 
+# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team import math from deepspeed.utils import log_dist @@ -31,7 +34,7 @@ def calc_bw_log(comm_op, size, duration): if comm_op == "all_to_all_single": tput = (size / duration) busbw = (size / duration) * ((n - 1) / n) - elif comm_op == "all_gather" or comm_op == "all_gather_base" or comm_op == "reduce_scatter" or comm_op == "reduce_scatter_base": + elif comm_op == "all_gather" or comm_op == "all_gather_into_tensor" or comm_op == "reduce_scatter" or comm_op == "reduce_scatter_tensor": size *= n tput = (size / duration) busbw = (size / duration) * ((n - 1) / n) @@ -56,6 +59,7 @@ def calc_bw_log(comm_op, size, duration): class CommsLogger: + def __init__(self): from deepspeed.comm.constants import COMMS_LOGGER_VERBOSE_DEFAULT, COMMS_LOGGER_DEBUG_DEFAULT, COMMS_LOGGER_PROF_OPS_DEFAULT, COMMS_LOGGER_PROF_ALL_DEFAULT, COMMS_LOGGER_ENABLED_DEFAULT self.comms_dict = {} @@ -111,8 +115,7 @@ class CommsLogger: # TODO: Add to tensorboard if self.verbose: n = dist.get_world_size() - log_str = f"rank={dist.get_rank()} | comm op: " + record_name + " | time (ms): {:.2f}".format( - latency) + log_str = f"rank={dist.get_rank()} | comm op: " + record_name + " | time (ms): {:.2f}".format(latency) log_str += " | msg size: " + convert_size(msg_size) log_str += " | algbw (Gbps): {:.2f} ".format(algbw) log_str += " | busbw (Gbps): {:.2f} ".format(busbw) diff --git a/deepspeed/utils/debug.py b/deepspeed/utils/debug.py index f3f3ea56d5cfa6d9c551ec7d2db2ebb41dda995b..b693915e531bf989a2b657553b7369fa2880605c 100644 --- a/deepspeed/utils/debug.py +++ b/deepspeed/utils/debug.py @@ -1,5 +1,7 @@ -'''Copyright The Microsoft DeepSpeed Team''' -""" debug utils """ +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team # For lazy import with printflock() fcntl = None @@ -128,6 +130,7 @@ def log_rank_file(rank, *msgs): def print_backward_tensors(tensor): + def _print_bwd_tensors(grad_fn): print(f"Backward tensors in {grad_fn}") for funcs in grad_fn.next_functions: @@ -135,9 +138,7 @@ def print_backward_tensors(tensor): try: tensor = getattr(funcs[0], 'variable') print(funcs[0]) - print( - f"Tensor - id: {id(tensor)}, shape: {tensor.shape}, data: {tensor}, grad: {tensor.grad}" - ) + print(f"Tensor - id: {id(tensor)}, shape: {tensor.shape}, data: {tensor}, grad: {tensor.grad}") except AttributeError as e: _print_bwd_tensors(funcs[0]) diff --git a/deepspeed/utils/distributed.py b/deepspeed/utils/distributed.py deleted file mode 100644 index b643f4672e86cae2d925e34b60b7b706d9586ff9..0000000000000000000000000000000000000000 --- a/deepspeed/utils/distributed.py +++ /dev/null @@ -1,166 +0,0 @@ -''' -Copyright 2020 The Microsoft DeepSpeed Team -''' -import os -import torch -from datetime import timedelta - -from .logging import logger -from ..constants import TORCH_DISTRIBUTED_DEFAULT_PORT, default_pg_timeout - - -def init_distributed(dist_backend="nccl", - auto_mpi_discovery=True, - distributed_port=TORCH_DISTRIBUTED_DEFAULT_PORT, - verbose=True, - timeout=default_pg_timeout, - init_method=None): - """Initialize torch.distributed backend, potentially performing MPI discovery if needed - - Arguments: - dist_backend: Optional (str). torch distributed backend, e.g., nccl, mpi, gloo - - auto_mpi_discovery Optional (bool). if distributed environment variables are not set, attempt to discover them from MPI - - distributed_port: Optional (int). torch distributed backend port - - verbose: Optional (bool). 
verbose logging - - timeout: Optional (timedelta). Timeout for operations executed against the process group. Default value equals 30 minutes. - - init_method: Optional (string). Torch distributed, URL specifying how to initialize the process group. Default is “env://” if no init_method or store is specified. - """ - required_env = ["RANK", "WORLD_SIZE", "MASTER_ADDR", "MASTER_PORT", "LOCAL_RANK"] - if auto_mpi_discovery and not all(map(lambda v: v in os.environ, required_env)): - if verbose: - logger.info( - "Not using the DeepSpeed or torch.distributed launchers, attempting to detect MPI environment..." - ) - if in_aml() and not in_dlts(): - patch_aml_env_for_torch_nccl_backend(verbose=verbose) - elif in_aws_sm(): - patch_aws_sm_env_for_torch_nccl_backend(verbose=verbose) - else: - mpi_discovery(distributed_port=distributed_port, verbose=verbose) - - if not torch.distributed.is_initialized(): - if verbose and int(os.getenv('RANK', '0')) == 0: - logger.info( - "Initializing torch distributed with backend: {}".format(dist_backend)) - assert isinstance(timeout, timedelta) - torch.distributed.init_process_group(backend=dist_backend, - timeout=timeout, - init_method=init_method) - - -def mpi_discovery(distributed_port=TORCH_DISTRIBUTED_DEFAULT_PORT, verbose=True): - """ - Discovery MPI environment via mpi4py and map to relevant torch.distributed state - """ - from mpi4py import MPI - import subprocess - comm = MPI.COMM_WORLD - rank = comm.Get_rank() - world_size = comm.Get_size() - - master_addr = None - if rank == 0: - hostname_cmd = ["hostname -I"] - result = subprocess.check_output(hostname_cmd, shell=True) - master_addr = result.decode('utf-8').split()[0] - master_addr = comm.bcast(master_addr, root=0) - - # Determine local rank by assuming hostnames are unique - proc_name = MPI.Get_processor_name() - all_procs = comm.allgather(proc_name) - local_rank = sum([i == proc_name for i in all_procs[:rank]]) - - os.environ['RANK'] = str(rank) - os.environ['WORLD_SIZE'] = str(world_size) - os.environ['LOCAL_RANK'] = str(local_rank) - os.environ['MASTER_ADDR'] = master_addr - os.environ['MASTER_PORT'] = str(distributed_port) - - if verbose: - logger.info( - "Discovered MPI settings of world_rank={}, local_rank={}, world_size={}, master_addr={}, master_port={}" - .format(os.environ['RANK'], - os.environ['LOCAL_RANK'], - os.environ['WORLD_SIZE'], - os.environ['MASTER_ADDR'], - os.environ['MASTER_PORT'])) - - if torch.distributed.is_initialized(): - assert torch.distributed.get_rank() == rank, "MPI rank {} does not match torch rank {}".format( - rank, torch.distributed.get_rank()) - assert torch.distributed.get_world_size() == world_size, "MPI world size {} does not match torch world size {}".format( - world_size, torch.distributed.get_world_size()) - - -def in_aml(): - # Are we running inside an Azure Machine Learning (AML) environment? - return 'AZUREML_EXPERIMENT_ID' in os.environ - - -def in_aws_sm(): - # Are we running inside an AWS SageMaker environment? - return 'SM_TRAINING_ENV' in os.environ - - -def in_dlts(): - # Are we running on a DLTS cluster? - return 'DLTS_JOB_ID' in os.environ - - -def patch_aml_env_for_torch_nccl_backend(master_port=6105, verbose=True): - """Helper routine to get and set environment variables. 
- This is adapted from Azure ML's documentation available from: - https://azure.github.io/azureml-web/docs/cheatsheet/distributed-training/#environment-variables-from-openmpi - """ - os.environ["RANK"] = os.environ["OMPI_COMM_WORLD_RANK"] - os.environ["WORLD_SIZE"] = os.environ["OMPI_COMM_WORLD_SIZE"] - single_node = int(os.environ["OMPI_COMM_WORLD_LOCAL_SIZE"]) == int( - os.environ["WORLD_SIZE"]) - - if not single_node: - master_node_params = os.environ["AZ_BATCH_MASTER_NODE"].split(":") - os.environ["MASTER_ADDR"] = master_node_params[0] - # Do not overwrite master port with that defined in AZ_BATCH_MASTER_NODE - if "MASTER_PORT" not in os.environ: - os.environ["MASTER_PORT"] = str(master_port) - else: - os.environ["MASTER_ADDR"] = os.environ["AZ_BATCHAI_MPI_MASTER_NODE"] - os.environ["MASTER_PORT"] = "54965" - - if verbose: - logger.info("NCCL_SOCKET_IFNAME original value = {}".format( - os.environ["NCCL_SOCKET_IFNAME"])) - - os.environ["NCCL_SOCKET_IFNAME"] = "^docker0,lo" - os.environ['LOCAL_RANK'] = os.environ["OMPI_COMM_WORLD_LOCAL_RANK"] - - if verbose: - logger.info( - "Discovered AzureML settings of world_rank={}, local_rank={}, world_size={}, master_addr={}, master_port={}" - .format(os.environ['RANK'], - os.environ['LOCAL_RANK'], - os.environ['WORLD_SIZE'], - os.environ['MASTER_ADDR'], - os.environ['MASTER_PORT'])) - - -def patch_aws_sm_env_for_torch_nccl_backend(verbose=True): - """Helper routine to get and set environment variables when running inside an AWS SageMaker environment. - """ - os.environ["RANK"] = os.environ["OMPI_COMM_WORLD_RANK"] - os.environ['LOCAL_RANK'] = os.environ["OMPI_COMM_WORLD_LOCAL_RANK"] - os.environ["WORLD_SIZE"] = os.environ["OMPI_COMM_WORLD_SIZE"] - - if verbose: - logger.info( - "Discovered AWS SageMaker settings of world_rank={}, local_rank={}, world_size={}, master_addr={}, master_port={}" - .format(os.environ['RANK'], - os.environ['LOCAL_RANK'], - os.environ['WORLD_SIZE'], - os.environ['MASTER_ADDR'], - os.environ['MASTER_PORT'])) diff --git a/deepspeed/utils/exceptions.py b/deepspeed/utils/exceptions.py index 678948c2ef3111b04b8a0d03fa35d11b7314014c..9dba0933cc9e236ec150bfd5439f7ce8e157394a 100644 --- a/deepspeed/utils/exceptions.py +++ b/deepspeed/utils/exceptions.py @@ -1,6 +1,7 @@ -''' -Copyright 2022 The Microsoft DeepSpeed Team -''' +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team class DeprecatedException(Exception): diff --git a/deepspeed/utils/groups.py b/deepspeed/utils/groups.py index a4d403a7dd0ac63f39a3fbbdcb34d6be7dd267c3..dc58bb3c780f020429cd2da3e823f2d4449fd8ee 100644 --- a/deepspeed/utils/groups.py +++ b/deepspeed/utils/groups.py @@ -1,6 +1,7 @@ -''' -Copyright 2021 The Microsoft DeepSpeed Team -''' +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team # The file has been adapted from https://github.com/NVIDIA/Megatron-LM and retains the following license from the original file @@ -51,8 +52,7 @@ def initialize(ep_size=1, mpu=None): def _ensure_divisibility(numerator, denominator): """Ensure that numerator is divisible by the denominator.""" - assert numerator % denominator == 0, '{} is not divisible by {}'.format( - numerator, denominator) + assert numerator % denominator == 0, '{} is not divisible by {}'.format(numerator, denominator) # Not currently used. Helper function to create a model (tensor) parallel group. 
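# Hedged sketch (pure Python, no torch.distributed needed) of the rank layout built by
# the _create_expert_and_data_parallel hunks just below: with world_size=8 and
# expert_parallel_size_=2, expert-parallel groups are contiguous blocks of ranks while
# expert-data-parallel groups stride across those blocks.
world_size, expert_parallel_size_ = 8, 2

expert_data_parallel_groups = [
    list(range(i, world_size, expert_parallel_size_)) for i in range(expert_parallel_size_)
]
expert_parallel_groups = [
    list(range(i * expert_parallel_size_, (i + 1) * expert_parallel_size_))
    for i in range(world_size // expert_parallel_size_)
]

print(expert_data_parallel_groups)  # [[0, 2, 4, 6], [1, 3, 5, 7]]
print(expert_parallel_groups)       # [[0, 1], [2, 3], [4, 5], [6, 7]]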
@@ -78,8 +78,7 @@ def _create_model_parallel(model_parallel_size_): with a total of 16 GPUs, rank 0 to 7 belong to the first box and ranks 8 to 15 belong to the second box. """ - log_dist(f'Creating model parallel group with size {model_parallel_size_}', - ranks=[0]) + log_dist(f'Creating model parallel group with size {model_parallel_size_}', ranks=[0]) # Get world size and rank. Ensure some consistencies. assert dist.is_initialized() world_size = dist.get_world_size() @@ -121,9 +120,7 @@ def _create_expert_and_data_parallel(expert_parallel_size_): """ assert dist.is_initialized() - log_dist( - f'Creating expert and data parallel groups with size {expert_parallel_size_}', - ranks=[0]) + log_dist(f'Creating expert and data parallel groups with size {expert_parallel_size_}', ranks=[0]) world_size = dist.get_world_size() rank = dist.get_rank() @@ -139,9 +136,7 @@ def _create_expert_and_data_parallel(expert_parallel_size_): for i in range(expert_parallel_size_): ranks = range(i, world_size, expert_parallel_size_) group = dist.new_group(ranks) - log_dist( - f'Creating expert data parallel process group named {group_name} with ranks: {list(ranks)}', - [0]) + log_dist(f'Creating expert data parallel process group named {group_name} with ranks: {list(ranks)}', [0]) if i == (rank % expert_parallel_size_): _EXPERT_DATA_PARALLEL_GROUP[group_name] = group @@ -153,9 +148,7 @@ def _create_expert_and_data_parallel(expert_parallel_size_): for i in range(world_size // expert_parallel_size_): ranks = range(i * expert_parallel_size_, (i + 1) * expert_parallel_size_) group = dist.new_group(ranks) - log_dist( - f'creating expert parallel process group named {group_name} with ranks: {list(ranks)}', - [0]) + log_dist(f'creating expert parallel process group named {group_name} with ranks: {list(ranks)}', [0]) if i == (rank // expert_parallel_size_): _EXPERT_PARALLEL_GROUP[group_name] = group diff --git a/deepspeed/utils/init_on_device.py b/deepspeed/utils/init_on_device.py index 1b51efe603840deef09269387b227304ae52bcb9..52dbf71d9562b73d868f71ee7ec7894a187e00f0 100644 --- a/deepspeed/utils/init_on_device.py +++ b/deepspeed/utils/init_on_device.py @@ -1,6 +1,8 @@ -''' -Copyright 2020 The Microsoft DeepSpeed Team -''' +# Copyright (c) Microsoft Corporation. 
+# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + import torch from typing import Callable from torch import Tensor @@ -35,13 +37,10 @@ class OnDevice(object): if device == "meta": if pkg_version.parse('1.10') > pkg_version.parse(torch.__version__): - raise NotImplementedError( - "Meta tensor support is not available, please upgrade to torch 1.10+" - ) + raise NotImplementedError("Meta tensor support is not available, please upgrade to torch 1.10+") + + def fp_tensor_constructor(self, fn: Callable, target_fp_dtype: torch.dtype) -> Callable: - def fp_tensor_constructor(self, - fn: Callable, - target_fp_dtype: torch.dtype) -> Callable: def wrapped_fn(*args, **kwargs) -> Tensor: if kwargs.get("device", None) is None: kwargs['device'] = self.device @@ -53,6 +52,7 @@ class OnDevice(object): return wrapped_fn def get_new_tensor_fn_for_dtype(self, dtype: torch.dtype) -> Callable: + def new_tensor(cls, *args) -> Tensor: tensor = OnDevice._orig_torch_empty(0, device=self.device).new_empty(*args) if tensor.is_floating_point(): diff --git a/deepspeed/utils/logging.py b/deepspeed/utils/logging.py index 631ee2177c65d37af153b314a54595a19ddc927d..1e62d96e10325516b6e0d14ad85b9461a314ead1 100644 --- a/deepspeed/utils/logging.py +++ b/deepspeed/utils/logging.py @@ -1,5 +1,9 @@ -'''Copyright The Microsoft DeepSpeed Team''' +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 +# DeepSpeed Team + +import functools import logging import sys import os @@ -14,6 +18,7 @@ log_levels = { class LoggerFactory: + @staticmethod def create_logger(name=None, level=logging.INFO): """create a logger @@ -29,9 +34,8 @@ class LoggerFactory: if name is None: raise ValueError("name for logger cannot be None") - formatter = logging.Formatter( - "[%(asctime)s] [%(levelname)s] " - "[%(filename)s:%(lineno)d:%(funcName)s] %(message)s") + formatter = logging.Formatter("[%(asctime)s] [%(levelname)s] " + "[%(filename)s:%(lineno)d:%(funcName)s] %(message)s") logger_ = logging.getLogger(name) logger_.setLevel(level) @@ -46,6 +50,21 @@ class LoggerFactory: logger = LoggerFactory.create_logger(name="DeepSpeed", level=logging.INFO) +@functools.lru_cache(None) +def warning_once(*args, **kwargs): + """ + This method is identical to `logger.warning()`, but will emit the warning with the same message only once + + Note: The cache is for the function arguments, so 2 different callers using the same arguments will hit the cache. + The assumption here is that all warning messages are unique across the code. If they aren't then need to switch to + another type of cache that includes the caller frame information in the hashing function. + """ + logger.warning(*args, **kwargs) + + +logger.warning_once = warning_once + + def print_configuration(args, name): logger.info("{}:".format(name)) for arg in sorted(vars(args)): diff --git a/deepspeed/utils/mixed_precision_linkage.py b/deepspeed/utils/mixed_precision_linkage.py index ac785ed9d2bf5aa2b5e4ed83a89326920534e0ca..ecc29e930954d174772d427f0eeea95867fe200d 100644 --- a/deepspeed/utils/mixed_precision_linkage.py +++ b/deepspeed/utils/mixed_precision_linkage.py @@ -1,36 +1,20 @@ -""" -Copyright 2022 The Microsoft DeepSpeed Team -""" +# Copyright (c) Microsoft Corporation. 
+# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + import types from deepspeed.utils import get_full_hp_param, get_full_hp_grad, get_hp_fragment_mapping -def link_hp_params(lp_param_list, - flat_hp_partition, - gradient_dict, - offload_gradient_dict, - use_offload, - param_group_index, - partition_start, - partition_size, - partition_optimizer_state, - dp_group): - local_lp_param_and_offset = _init_lp_to_hp_mapping(lp_param_list, - partition_start, - partition_size, - dp_group) +def link_hp_params(lp_param_list, flat_hp_partition, gradient_dict, offload_gradient_dict, use_offload, + param_group_index, partition_start, partition_size, partition_optimizer_state, dp_group): + local_lp_param_and_offset = _init_lp_to_hp_mapping(lp_param_list, partition_start, partition_size, dp_group) for lp_param, lp_start in local_lp_param_and_offset: - lp_param._hp_mapping = get_hp_fragment_mapping(lp_param, - lp_start, - flat_hp_partition, - gradient_dict, - offload_gradient_dict, - use_offload, - param_group_index, - partition_start, - partition_size, - partition_optimizer_state) + lp_param._hp_mapping = get_hp_fragment_mapping(lp_param, lp_start, flat_hp_partition, gradient_dict, + offload_gradient_dict, use_offload, param_group_index, + partition_start, partition_size, partition_optimizer_state) def _init_lp_to_hp_mapping(lp_param_list, partition_start, partition_size, dp_group): diff --git a/deepspeed/utils/nvtx.py b/deepspeed/utils/nvtx.py index 881000311c36ab4d4f765e90acd022e01dc86a7f..3823599e7bf20b7c48045390a50cee1c31f42e7d 100644 --- a/deepspeed/utils/nvtx.py +++ b/deepspeed/utils/nvtx.py @@ -1,4 +1,7 @@ -'''Copyright The Microsoft DeepSpeed Team''' +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team from deepspeed.accelerator import get_accelerator @@ -6,6 +9,7 @@ from deepspeed.accelerator import get_accelerator def instrument_w_nvtx(func): """decorator that causes an NVTX range to be recorded for the duration of the function call.""" + def wrapped_fn(*args, **kwargs): get_accelerator().range_push(func.__qualname__) ret_val = func(*args, **kwargs) diff --git a/deepspeed/utils/tensor_fragment.py b/deepspeed/utils/tensor_fragment.py index 8204735748c5ab7a1b9f71ef8405983644fcafae..d117defc9875d57cab49560837bb3e8b7df43a49 100644 --- a/deepspeed/utils/tensor_fragment.py +++ b/deepspeed/utils/tensor_fragment.py @@ -1,6 +1,7 @@ -""" -Copyright 2022 The Microsoft DeepSpeed Team -""" +# Copyright (c) Microsoft Corporation. 
+# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team import torch from dataclasses import dataclass @@ -48,10 +49,7 @@ def get_full_hp_param(self, optim_state_key=None): reduce_buffer = torch.zeros_like(self, dtype=torch.float32).flatten() if self._hp_mapping is not None: lp_frag_address = self._hp_mapping.lp_fragment_address - reduce_fragment = torch.narrow(reduce_buffer, - 0, - lp_frag_address.start, - lp_frag_address.numel) + reduce_fragment = torch.narrow(reduce_buffer, 0, lp_frag_address.start, lp_frag_address.numel) if optim_state_key is None: hp_fragment = self._hp_mapping.hp_fragment else: @@ -72,21 +70,14 @@ def get_full_hp_grad(self): else: gradient_dict = hp_mapping.gradient_dict - if hp_mapping.param_group_index not in gradient_dict or gradient_dict[ - hp_mapping.param_group_index] is None: - raise ValueError( - "Gradients are only available immediately after backward and before engine step" - ) + if hp_mapping.param_group_index not in gradient_dict or gradient_dict[hp_mapping.param_group_index] is None: + raise ValueError("Gradients are only available immediately after backward and before engine step") - lp_grad_fragment = gradient_dict[hp_mapping.param_group_index][ - self._index_in_param_group] + lp_grad_fragment = gradient_dict[hp_mapping.param_group_index][self._index_in_param_group] hp_grad_fragment = lp_grad_fragment.to(torch.float32).flatten() lp_frag_address = self._hp_mapping.lp_fragment_address - reduce_fragment = torch.narrow(reduce_buffer, - 0, - lp_frag_address.start, - lp_frag_address.numel) + reduce_fragment = torch.narrow(reduce_buffer, 0, lp_frag_address.start, lp_frag_address.numel) if self.view(-1).shape == hp_grad_fragment.shape: reduce_buffer.data.copy_(hp_grad_fragment.data) @@ -150,16 +141,8 @@ def safe_get_full_grad(param): return None -def get_hp_fragment_mapping(lp_param, - lp_start, - flat_hp_partition, - gradient_dict, - offload_gradient_dict, - use_offload, - param_group_index, - partition_start, - partition_size, - optimizer_state_dict): +def get_hp_fragment_mapping(lp_param, lp_start, flat_hp_partition, gradient_dict, offload_gradient_dict, use_offload, + param_group_index, partition_start, partition_size, optimizer_state_dict): lp_end = lp_param.numel() + lp_start hp_start = partition_start hp_end = partition_start + partition_size @@ -170,25 +153,16 @@ def get_hp_fragment_mapping(lp_param, f'fragment start {fragment_start} should be < fragment_end {fragment_end}' fragment_numel = fragment_end - fragment_start - hp_frag_address = fragment_address(start=fragment_start - hp_start, - numel=fragment_numel) - hp_fragment_tensor = flat_hp_partition.narrow(0, - hp_frag_address.start, - hp_frag_address.numel) + hp_frag_address = fragment_address(start=fragment_start - hp_start, numel=fragment_numel) + hp_fragment_tensor = flat_hp_partition.narrow(0, hp_frag_address.start, hp_frag_address.numel) optim_fragment = { - key: value.narrow(0, - hp_frag_address.start, - hp_frag_address.numel) - for key, - value in optimizer_state_dict.items() + key: value.narrow(0, hp_frag_address.start, hp_frag_address.numel) + for key, value in optimizer_state_dict.items() if torch.is_tensor(value) and value.shape == flat_hp_partition.shape } - lp_frag_address = fragment_address(start=fragment_start - lp_start, - numel=fragment_numel) - lp_fragment_tensor = lp_param.flatten().narrow(0, - lp_frag_address.start, - lp_frag_address.numel) + lp_frag_address = fragment_address(start=fragment_start - lp_start, numel=fragment_numel) + lp_fragment_tensor = 
lp_param.flatten().narrow(0, lp_frag_address.start, lp_frag_address.numel) return tensor_fragment(lp_fragment=lp_fragment_tensor, lp_fragment_address=lp_frag_address, diff --git a/deepspeed/utils/timer.py b/deepspeed/utils/timer.py index 16865c5e56c2a5d1a27744a262407817d9b0cb09..e52eb20d96026ece0b7513edda72daee0caa1587 100755 --- a/deepspeed/utils/timer.py +++ b/deepspeed/utils/timer.py @@ -1,6 +1,7 @@ -""" -Copyright 2019 The Microsoft DeepSpeed Team -""" +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team import time from numpy import mean @@ -18,9 +19,8 @@ except ImportError: class CudaEventTimer(object): - def __init__(self, - start_event: get_accelerator().Event, - end_event: get_accelerator().Event): + + def __init__(self, start_event: get_accelerator().Event, end_event: get_accelerator().Event): self.start_event = start_event self.end_event = end_event @@ -32,8 +32,10 @@ class CudaEventTimer(object): class SynchronizedWallClockTimer: """Group of timers. Borrowed from Nvidia Megatron code""" + class Timer: """Timer.""" + def __init__(self, name): self.name_ = name self.started_ = False @@ -102,14 +104,12 @@ class SynchronizedWallClockTimer: @staticmethod def memory_usage(): - alloc = "mem_allocated: {:.4f} GB".format(get_accelerator().memory_allocated() / - (1024 * 1024 * 1024)) - max_alloc = "max_mem_allocated: {:.4f} GB".format( - get_accelerator().max_memory_allocated() / (1024 * 1024 * 1024)) - cache = "cache_allocated: {:.4f} GB".format(get_accelerator().memory_cached() / - (1024 * 1024 * 1024)) - max_cache = "max_cache_allocated: {:.4f} GB".format( - get_accelerator().max_memory_cached() / (1024 * 1024 * 1024)) + alloc = "mem_allocated: {:.4f} GB".format(get_accelerator().memory_allocated() / (1024 * 1024 * 1024)) + max_alloc = "max_mem_allocated: {:.4f} GB".format(get_accelerator().max_memory_allocated() / + (1024 * 1024 * 1024)) + cache = "cache_allocated: {:.4f} GB".format(get_accelerator().memory_cached() / (1024 * 1024 * 1024)) + max_cache = "max_cache_allocated: {:.4f} GB".format(get_accelerator().max_memory_cached() / + (1024 * 1024 * 1024)) return " | {} | {} | {} | {}".format(alloc, max_alloc, cache, max_cache) def log(self, names, normalizer=1.0, reset=True, memory_breakdown=False, ranks=None): @@ -135,6 +135,7 @@ class SynchronizedWallClockTimer: class ThroughputTimer: + def __init__( self, batch_size, @@ -203,23 +204,19 @@ class ThroughputTimer: self.global_step_count, self.avg_samples_per_sec(), self.batch_size / self.step_elapsed_time, - round(get_accelerator().memory_allocated() / 1024**3, - 2), - round(get_accelerator().max_memory_allocated() / 1024**3, - 2), + round(get_accelerator().memory_allocated() / 1024**3, 2), + round(get_accelerator().max_memory_allocated() / 1024**3, 2), )) if self.monitor_memory: virt_mem = psutil.virtual_memory() swap = psutil.swap_memory() - self.logging( - "epoch={}/micro_step={}/global_step={}, vm %: {}, swap %: {}" - .format( - self.epoch_count, - self.micro_step_count, - self.global_step_count, - virt_mem.percent, - swap.percent, - )) + self.logging("epoch={}/micro_step={}/global_step={}, vm %: {}, swap %: {}".format( + self.epoch_count, + self.micro_step_count, + self.global_step_count, + virt_mem.percent, + swap.percent, + )) self.step_elapsed_time = 0 def avg_samples_per_sec(self): diff --git a/deepspeed/utils/types.py b/deepspeed/utils/types.py index a6d5ffd24b8f31a4247f18fe80e4d1e945467772..2de4350fbd7a3b7068d12ce512cb55156e1170fd 100644 --- a/deepspeed/utils/types.py +++ 
b/deepspeed/utils/types.py @@ -1,4 +1,7 @@ -'''Copyright The Microsoft DeepSpeed Team''' +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team from enum import IntEnum diff --git a/deepspeed/utils/zero_to_fp32.py b/deepspeed/utils/zero_to_fp32.py index f00e256bb7879727ab1d785173f4aac6967876da..c5246ff52274e1d6142001ccf085186d3545ce57 100755 --- a/deepspeed/utils/zero_to_fp32.py +++ b/deepspeed/utils/zero_to_fp32.py @@ -1,5 +1,9 @@ #!/usr/bin/env python -'''Copyright The Microsoft DeepSpeed Team''' + +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team # This script extracts fp32 consolidated weights from a zero 2 and 3 DeepSpeed checkpoints. It gets # copied into the top level checkpoint dir, so the user can easily do the conversion at any point in @@ -15,18 +19,25 @@ import math import os import re from collections import OrderedDict +from dataclasses import dataclass # while this script doesn't use deepspeed to recover data, since the checkpoints are pickled with # DeepSpeed data structures it has to be available in the current python environment. from deepspeed.utils import logger -from deepspeed.checkpoint.constants import (DS_VERSION, - OPTIMIZER_STATE_DICT, - SINGLE_PARTITION_OF_FP32_GROUPS, - FP32_FLAT_GROUPS, - ZERO_STAGE, - PARTITION_COUNT, - PARAM_SHAPES, - BUFFER_NAMES) +from deepspeed.checkpoint.constants import (DS_VERSION, OPTIMIZER_STATE_DICT, SINGLE_PARTITION_OF_FP32_GROUPS, + FP32_FLAT_GROUPS, ZERO_STAGE, PARTITION_COUNT, PARAM_SHAPES, BUFFER_NAMES, + FROZEN_PARAM_SHAPES, FROZEN_PARAM_FRAGMENTS) + + +@dataclass +class zero_model_state: + buffers: dict() + param_shapes: dict() + shared_params: list + ds_version: int + frozen_param_shapes: dict() + frozen_param_fragments: dict() + debug = 0 @@ -63,39 +74,68 @@ def get_model_state_file(checkpoint_dir, zero_stage): return file -def get_optim_files(checkpoint_dir): +def get_checkpoint_files(checkpoint_dir, glob_pattern): # XXX: need to test that this simple glob rule works for multi-node setup too - optim_files = sorted(glob.glob(os.path.join(checkpoint_dir, - "*_optim_states.pt")), - key=natural_keys) + ckpt_files = sorted(glob.glob(os.path.join(checkpoint_dir, glob_pattern)), key=natural_keys) - if len(optim_files) == 0: - raise FileNotFoundError( - f"can't find '*_optim_states.pt' files in directory '{checkpoint_dir}'") + if len(ckpt_files) == 0: + raise FileNotFoundError(f"can't find {glob_pattern} files in directory '{checkpoint_dir}'") - return optim_files + return ckpt_files -def parse_model_state(file): - state_dict = torch.load(file, map_location=device) +def get_optim_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_optim_states.pt") + + +def get_model_state_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_model_states.pt") - if BUFFER_NAMES not in state_dict: - raise ValueError(f"{file} is not a model state checkpoint") - buffer_names = state_dict[BUFFER_NAMES] - if debug: - print("Found buffers:", buffer_names) - # recover just the buffers while restoring them to fp32 if they were saved in fp16 - buffers = { - k: v.float() - for k, - v in state_dict["module"].items() if k in buffer_names - } - param_shapes = state_dict[PARAM_SHAPES] +def parse_model_states(files): + zero_model_states = [] + for file in files: + state_dict = torch.load(file, map_location=device) - ds_version = state_dict.get(DS_VERSION, None) + if BUFFER_NAMES not in state_dict: + raise ValueError(f"{file} is not a 
model state checkpoint") + buffer_names = state_dict[BUFFER_NAMES] + if debug: + print("Found buffers:", buffer_names) + + # recover just the buffers while restoring them to fp32 if they were saved in fp16 + buffers = {k: v.float() for k, v in state_dict["module"].items() if k in buffer_names} + param_shapes = state_dict[PARAM_SHAPES] - return buffers, param_shapes, ds_version + # collect parameters that are included in param_shapes + param_names = [] + for s in param_shapes: + for name in s.keys(): + param_names.append(name) + + # update with frozen parameters + frozen_param_shapes = state_dict.get(FROZEN_PARAM_SHAPES, None) + if frozen_param_shapes is not None: + if debug: + print(f"Found frozen_param_shapes: {frozen_param_shapes}") + param_names += list(frozen_param_shapes.keys()) + + # handle shared params + shared_params = [[k, v] for k, v in state_dict["shared_params"].items()] + + ds_version = state_dict.get(DS_VERSION, None) + + frozen_param_fragments = state_dict.get(FROZEN_PARAM_FRAGMENTS, None) + + z_model_state = zero_model_state(buffers=buffers, + param_shapes=param_shapes, + shared_params=shared_params, + ds_version=ds_version, + frozen_param_shapes=frozen_param_shapes, + frozen_param_fragments=frozen_param_fragments) + zero_model_states.append(z_model_state) + + return zero_model_states def parse_optim_states(files, ds_checkpoint_dir): @@ -132,10 +172,7 @@ def parse_optim_states(files, ds_checkpoint_dir): raise ValueError(f"unknown zero stage {zero_stage}") if zero_stage == 2: - fp32_flat_groups = [ - state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key] - for i in range(len(state_dicts)) - ] + fp32_flat_groups = [state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key] for i in range(len(state_dicts))] elif zero_stage == 3: # if there is more than one param group, there will be multiple flattened tensors - one # flattened tensor per group - for simplicity merge them into a single tensor @@ -144,8 +181,7 @@ def parse_optim_states(files, ds_checkpoint_dir): # will require matching the sub-lists of param_shapes for each param group flattened tensor fp32_flat_groups = [ - torch.cat(state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key], - 0) for i in range(len(state_dicts)) + torch.cat(state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key], 0) for i in range(len(state_dicts)) ] return zero_stage, world_size, fp32_flat_groups @@ -163,29 +199,53 @@ def _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir): optim_files = get_optim_files(ds_checkpoint_dir) zero_stage, world_size, fp32_flat_groups = parse_optim_states(optim_files, ds_checkpoint_dir) - print( - f"Detected checkpoint of type zero stage {zero_stage}, world_size: {world_size}") + print(f"Detected checkpoint of type zero stage {zero_stage}, world_size: {world_size}") + + model_files = get_model_state_files(ds_checkpoint_dir) - model_file = get_model_state_file(ds_checkpoint_dir, zero_stage) - buffers, param_shapes, ds_version = parse_model_state(model_file) - print(f'Parsing checkpoint created by deepspeed=={ds_version}') + zero_model_states = parse_model_states(model_files) + print(f'Parsing checkpoint created by deepspeed=={zero_model_states[0].ds_version}') if zero_stage == 2: - return _get_fp32_state_dict_from_zero2_checkpoint(world_size, - param_shapes, - fp32_flat_groups, - buffers) + return _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states) elif zero_stage == 3: - return _get_fp32_state_dict_from_zero3_checkpoint(world_size, - param_shapes, - fp32_flat_groups, - buffers) 
+ return _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states) + + +def _zero2_merge_frozen_params(state_dict, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + frozen_param_fragments = zero_model_states[0].frozen_param_fragments + + if debug: + num_elem = sum(s.numel() for s in frozen_param_shapes.values()) + print(f'rank 0: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in frozen_param_fragments.values()]) + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + state_dict[name] = frozen_param_fragments[name] + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") -def _get_fp32_state_dict_from_zero2_checkpoint(world_size, - param_shapes, - fp32_flat_groups, - buffers): +def _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes # Reconstruction protocol: # @@ -194,8 +254,7 @@ def _get_fp32_state_dict_from_zero2_checkpoint(world_size, if debug: for i in range(world_size): for j in range(len(fp32_flat_groups[0])): - print( - f"{FP32_FLAT_GROUPS}[{i}][{j}].shape={fp32_flat_groups[i][j].shape}") + print(f"{FP32_FLAT_GROUPS}[{i}][{j}].shape={fp32_flat_groups[i][j].shape}") # XXX: memory usage doubles here (zero2) num_param_groups = len(fp32_flat_groups[0]) @@ -204,26 +263,16 @@ def _get_fp32_state_dict_from_zero2_checkpoint(world_size, merged_partitions = [sd[i] for sd in fp32_flat_groups] full_single_fp32_vector = torch.cat(merged_partitions, 0) merged_single_partition_of_fp32_groups.append(full_single_fp32_vector) - avail_numel = sum([ - full_single_fp32_vector.numel() - for full_single_fp32_vector in merged_single_partition_of_fp32_groups - ]) + avail_numel = sum( + [full_single_fp32_vector.numel() for full_single_fp32_vector in merged_single_partition_of_fp32_groups]) if debug: wanted_params = sum([len(shapes) for shapes in param_shapes]) - wanted_numel = sum( - [sum(shape.numel() for shape in shapes.values()) for shapes in param_shapes]) + wanted_numel = sum([sum(shape.numel() for shape in shapes.values()) for shapes in param_shapes]) # not asserting if there is a mismatch due to possible padding print(f"Have {avail_numel} numels to process.") print(f"Need {wanted_numel} numels in {wanted_params} params.") - state_dict = OrderedDict() - - # buffers - state_dict.update(buffers) - if debug: - print(f"added {len(buffers)} buffers") - # params # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support # out-of-core computing solution @@ -239,13 +288,8 @@ def _get_fp32_state_dict_from_zero2_checkpoint(world_size, total_params += 1 if debug: - print( - f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} " - ) - state_dict[name] = full_single_fp32_vector.narrow( - 0, - offset, - 
unpartitioned_numel).view(shape) + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + state_dict[name] = full_single_fp32_vector.narrow(0, offset, unpartitioned_numel).view(shape) offset += unpartitioned_numel # Z2 started to align to 2*world_size to improve nccl performance. Therefore both offset and @@ -268,12 +312,28 @@ def _get_fp32_state_dict_from_zero2_checkpoint(world_size, # Sanity check if offset != avail_numel: - raise ValueError( - f"consumed {offset} numels out of {avail_numel} - something is wrong") + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") - print( - f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements" - ) + _zero2_merge_frozen_params(state_dict, zero_model_states) + + _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] return state_dict @@ -285,15 +345,48 @@ def zero3_partitioned_param_info(unpartitioned_numel, world_size): return partitioned_numel, padding_numel -def _get_fp32_state_dict_from_zero3_checkpoint(world_size, - param_shapes, - fp32_flat_groups, - buffers): +def _zero3_merge_frozen_params(state_dict, world_size, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + if debug: + for i in range(world_size): + num_elem = sum(s.numel() for s in zero_model_states[i].frozen_param_fragments.values()) + print(f'rank {i}: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in zero_model_states[0].frozen_param_fragments.values()]) * world_size + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in zero_model_states[0].frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + param_frags = tuple(model_state.frozen_param_fragments[name] for model_state in zero_model_states) + state_dict[name] = torch.cat(param_frags, 0).narrow(0, 0, unpartitioned_numel).view(shape) + + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Frozen params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + +def _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + avail_numel = fp32_flat_groups[0].numel() * world_size # Reconstruction protocol: For zero3 we need to zip the 
partitions together at boundary of each # param, re-consolidating each param, while dealing with padding if any - avail_numel = fp32_flat_groups[0].numel() * world_size # merge list of dicts, preserving order param_shapes = {k: v for d in param_shapes for k, v in d.items()} @@ -304,15 +397,9 @@ def _get_fp32_state_dict_from_zero3_checkpoint(world_size, wanted_params = len(param_shapes) wanted_numel = sum(shape.numel() for shape in param_shapes.values()) # not asserting if there is a mismatch due to possible padding - print(f"Have {avail_numel} numels to process.") - print(f"Need {wanted_numel} numels in {wanted_params} params.") - - state_dict = OrderedDict() - - # buffers - state_dict.update(buffers) - if debug: - print(f"added {len(buffers)} buffers") + avail_numel = fp32_flat_groups[0].numel() * world_size + print(f"Trainable params: Have {avail_numel} numels to process.") + print(f"Trainable params: Need {wanted_numel} numels in {wanted_params} params.") # params # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support @@ -330,30 +417,41 @@ def _get_fp32_state_dict_from_zero3_checkpoint(world_size, if debug: print( - f"{total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + f"Trainable params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" ) # XXX: memory usage doubles here state_dict[name] = torch.cat( - tuple(fp32_flat_groups[i].narrow(0, - offset, - partitioned_numel) - for i in range(world_size)), - 0).narrow(0, - 0, - unpartitioned_numel).view(shape) + tuple(fp32_flat_groups[i].narrow(0, offset, partitioned_numel) for i in range(world_size)), + 0).narrow(0, 0, unpartitioned_numel).view(shape) offset += partitioned_numel offset *= world_size # Sanity check if offset != avail_numel: - raise ValueError( - f"consumed {offset} numels out of {avail_numel} - something is wrong") + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed Trainable fp32 state dict with {total_params} params {total_numel} elements") - print( - f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements" - ) + +def _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + _zero3_merge_frozen_params(state_dict, world_size, zero_model_states) + + _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] return state_dict @@ -465,16 +563,13 @@ def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None): if __name__ == "__main__": parser = argparse.ArgumentParser() - parser.add_argument( - "checkpoint_dir", - type=str, - help="path to the desired checkpoint folder, e.g., path/checkpoint-12") + parser.add_argument("checkpoint_dir", + type=str, + help="path to the desired checkpoint folder, e.g., path/checkpoint-12") parser.add_argument( "output_file", type=str, - help= - "path to the pytorch fp32 state_dict output file (e.g. path/checkpoint-12/pytorch_model.bin)" - ) + help="path to the pytorch fp32 state_dict output file (e.g. 
path/checkpoint-12/pytorch_model.bin)") parser.add_argument("-d", "--debug", action='store_true', help="enable debug") args = parser.parse_args() diff --git a/docker/Dockerfile b/docker/Dockerfile index 9bcfedb8d8f3d9bed0489c7113f3caaa4b715e24..cc1d792fb8107fadbb7becfff4bfd239bf3777c6 100755 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -95,7 +95,7 @@ ENV PYTHON_VERSION=3 RUN apt-get install -y python3 python3-dev && \ rm -f /usr/bin/python && \ ln -s /usr/bin/python3 /usr/bin/python && \ - curl -O https://bootstrap.pypa.io/get-pip.py && \ + curl -O https://bootstrap.pypa.io/pip/3.6/get-pip.py && \ python get-pip.py && \ rm get-pip.py && \ pip install --upgrade pip && \ diff --git a/docs/Gemfile.lock b/docs/Gemfile.lock deleted file mode 100644 index 0534d934144c0a751cd57ed664417709ea5d6dcf..0000000000000000000000000000000000000000 --- a/docs/Gemfile.lock +++ /dev/null @@ -1,304 +0,0 @@ -GEM - remote: https://rubygems.org/ - specs: - activesupport (6.0.4.6) - concurrent-ruby (~> 1.0, >= 1.0.2) - i18n (>= 0.7, < 2) - minitest (~> 5.1) - tzinfo (~> 1.1) - zeitwerk (~> 2.2, >= 2.2.2) - addressable (2.8.0) - public_suffix (>= 2.0.2, < 5.0) - coffee-script (2.4.1) - coffee-script-source - execjs - coffee-script-source (1.11.1) - colorator (1.1.0) - commonmarker (0.23.4) - ruby-enum (~> 0.5) - concurrent-ruby (1.1.10) - dnsruby (1.61.9) - simpleidn (~> 0.1) - em-websocket (0.5.3) - eventmachine (>= 0.12.9) - http_parser.rb (~> 0) - ethon (0.15.0) - ffi (>= 1.15.0) - eventmachine (1.2.7) - execjs (2.8.1) - faraday (1.10.0) - faraday-em_http (~> 1.0) - faraday-em_synchrony (~> 1.0) - faraday-excon (~> 1.1) - faraday-httpclient (~> 1.0) - faraday-multipart (~> 1.0) - faraday-net_http (~> 1.0) - faraday-net_http_persistent (~> 1.0) - faraday-patron (~> 1.0) - faraday-rack (~> 1.0) - faraday-retry (~> 1.0) - ruby2_keywords (>= 0.0.4) - faraday-em_http (1.0.0) - faraday-em_synchrony (1.0.0) - faraday-excon (1.1.0) - faraday-httpclient (1.0.1) - faraday-multipart (1.0.3) - multipart-post (>= 1.2, < 3) - faraday-net_http (1.0.1) - faraday-net_http_persistent (1.2.0) - faraday-patron (1.0.0) - faraday-rack (1.0.0) - faraday-retry (1.0.3) - ffi (1.15.5) - forwardable-extended (2.6.0) - gemoji (3.0.1) - github-pages (223) - github-pages-health-check (= 1.17.9) - jekyll (= 3.9.0) - jekyll-avatar (= 0.7.0) - jekyll-coffeescript (= 1.1.1) - jekyll-commonmark-ghpages (= 0.1.6) - jekyll-default-layout (= 0.1.4) - jekyll-feed (= 0.15.1) - jekyll-gist (= 1.5.0) - jekyll-github-metadata (= 2.13.0) - jekyll-include-cache (= 0.2.1) - jekyll-mentions (= 1.6.0) - jekyll-optional-front-matter (= 0.3.2) - jekyll-paginate (= 1.1.0) - jekyll-readme-index (= 0.3.0) - jekyll-redirect-from (= 0.16.0) - jekyll-relative-links (= 0.6.1) - jekyll-remote-theme (= 0.4.3) - jekyll-sass-converter (= 1.5.2) - jekyll-seo-tag (= 2.7.1) - jekyll-sitemap (= 1.4.0) - jekyll-swiss (= 1.0.0) - jekyll-theme-architect (= 0.2.0) - jekyll-theme-cayman (= 0.2.0) - jekyll-theme-dinky (= 0.2.0) - jekyll-theme-hacker (= 0.2.0) - jekyll-theme-leap-day (= 0.2.0) - jekyll-theme-merlot (= 0.2.0) - jekyll-theme-midnight (= 0.2.0) - jekyll-theme-minimal (= 0.2.0) - jekyll-theme-modernist (= 0.2.0) - jekyll-theme-primer (= 0.6.0) - jekyll-theme-slate (= 0.2.0) - jekyll-theme-tactile (= 0.2.0) - jekyll-theme-time-machine (= 0.2.0) - jekyll-titles-from-headings (= 0.5.3) - jemoji (= 0.12.0) - kramdown (= 2.3.1) - kramdown-parser-gfm (= 1.1.0) - liquid (= 4.0.3) - mercenary (~> 0.3) - minima (= 2.5.1) - nokogiri (>= 1.12.5, < 2.0) - rouge (= 3.26.0) - 
terminal-table (~> 1.4) - github-pages-health-check (1.17.9) - addressable (~> 2.3) - dnsruby (~> 1.60) - octokit (~> 4.0) - public_suffix (>= 3.0, < 5.0) - typhoeus (~> 1.3) - html-pipeline (2.14.0) - activesupport (>= 2) - nokogiri (>= 1.4) - http_parser.rb (0.8.0) - i18n (0.9.5) - concurrent-ruby (~> 1.0) - jekyll (3.9.0) - addressable (~> 2.4) - colorator (~> 1.0) - em-websocket (~> 0.5) - i18n (~> 0.7) - jekyll-sass-converter (~> 1.0) - jekyll-watch (~> 2.0) - kramdown (>= 1.17, < 3) - liquid (~> 4.0) - mercenary (~> 0.3.3) - pathutil (~> 0.9) - rouge (>= 1.7, < 4) - safe_yaml (~> 1.0) - jekyll-avatar (0.7.0) - jekyll (>= 3.0, < 5.0) - jekyll-coffeescript (1.1.1) - coffee-script (~> 2.2) - coffee-script-source (~> 1.11.1) - jekyll-commonmark (1.3.1) - commonmarker (~> 0.14) - jekyll (>= 3.7, < 5.0) - jekyll-commonmark-ghpages (0.1.6) - commonmarker (~> 0.17.6) - jekyll-commonmark (~> 1.2) - rouge (>= 2.0, < 4.0) - jekyll-default-layout (0.1.4) - jekyll (~> 3.0) - jekyll-feed (0.15.1) - jekyll (>= 3.7, < 5.0) - jekyll-gist (1.5.0) - octokit (~> 4.2) - jekyll-github-metadata (2.13.0) - jekyll (>= 3.4, < 5.0) - octokit (~> 4.0, != 4.4.0) - jekyll-include-cache (0.2.1) - jekyll (>= 3.7, < 5.0) - jekyll-mentions (1.6.0) - html-pipeline (~> 2.3) - jekyll (>= 3.7, < 5.0) - jekyll-optional-front-matter (0.3.2) - jekyll (>= 3.0, < 5.0) - jekyll-paginate (1.1.0) - jekyll-readme-index (0.3.0) - jekyll (>= 3.0, < 5.0) - jekyll-redirect-from (0.16.0) - jekyll (>= 3.3, < 5.0) - jekyll-relative-links (0.6.1) - jekyll (>= 3.3, < 5.0) - jekyll-remote-theme (0.4.3) - addressable (~> 2.0) - jekyll (>= 3.5, < 5.0) - jekyll-sass-converter (>= 1.0, <= 3.0.0, != 2.0.0) - rubyzip (>= 1.3.0, < 3.0) - jekyll-sass-converter (1.5.2) - sass (~> 3.4) - jekyll-seo-tag (2.7.1) - jekyll (>= 3.8, < 5.0) - jekyll-sitemap (1.4.0) - jekyll (>= 3.7, < 5.0) - jekyll-swiss (1.0.0) - jekyll-theme-architect (0.2.0) - jekyll (> 3.5, < 5.0) - jekyll-seo-tag (~> 2.0) - jekyll-theme-cayman (0.2.0) - jekyll (> 3.5, < 5.0) - jekyll-seo-tag (~> 2.0) - jekyll-theme-dinky (0.2.0) - jekyll (> 3.5, < 5.0) - jekyll-seo-tag (~> 2.0) - jekyll-theme-hacker (0.2.0) - jekyll (> 3.5, < 5.0) - jekyll-seo-tag (~> 2.0) - jekyll-theme-leap-day (0.2.0) - jekyll (> 3.5, < 5.0) - jekyll-seo-tag (~> 2.0) - jekyll-theme-merlot (0.2.0) - jekyll (> 3.5, < 5.0) - jekyll-seo-tag (~> 2.0) - jekyll-theme-midnight (0.2.0) - jekyll (> 3.5, < 5.0) - jekyll-seo-tag (~> 2.0) - jekyll-theme-minimal (0.2.0) - jekyll (> 3.5, < 5.0) - jekyll-seo-tag (~> 2.0) - jekyll-theme-modernist (0.2.0) - jekyll (> 3.5, < 5.0) - jekyll-seo-tag (~> 2.0) - jekyll-theme-primer (0.6.0) - jekyll (> 3.5, < 5.0) - jekyll-github-metadata (~> 2.9) - jekyll-seo-tag (~> 2.0) - jekyll-theme-slate (0.2.0) - jekyll (> 3.5, < 5.0) - jekyll-seo-tag (~> 2.0) - jekyll-theme-tactile (0.2.0) - jekyll (> 3.5, < 5.0) - jekyll-seo-tag (~> 2.0) - jekyll-theme-time-machine (0.2.0) - jekyll (> 3.5, < 5.0) - jekyll-seo-tag (~> 2.0) - jekyll-titles-from-headings (0.5.3) - jekyll (>= 3.3, < 5.0) - jekyll-watch (2.2.1) - listen (~> 3.0) - jemoji (0.12.0) - gemoji (~> 3.0) - html-pipeline (~> 2.2) - jekyll (>= 3.0, < 5.0) - kramdown (2.3.1) - rexml - kramdown-parser-gfm (1.1.0) - kramdown (~> 2.0) - liquid (4.0.3) - listen (3.7.1) - rb-fsevent (~> 0.10, >= 0.10.3) - rb-inotify (~> 0.9, >= 0.9.10) - mercenary (0.3.6) - mini_portile2 (2.8.0) - minima (2.5.1) - jekyll (>= 3.5, < 5.0) - jekyll-feed (~> 0.9) - jekyll-seo-tag (~> 2.1) - minimal-mistakes-jekyll (4.24.0) - jekyll (>= 3.7, < 5.0) - jekyll-feed (~> 0.1) 
- jekyll-gist (~> 1.5) - jekyll-include-cache (~> 0.1) - jekyll-paginate (~> 1.1) - jekyll-sitemap (~> 1.3) - minitest (5.15.0) - multipart-post (2.1.1) - nokogiri (1.13.4) - mini_portile2 (~> 2.8.0) - racc (~> 1.4) - octokit (4.22.0) - faraday (>= 0.9) - sawyer (~> 0.8.0, >= 0.5.3) - pathutil (0.16.2) - forwardable-extended (~> 2.6) - public_suffix (4.0.7) - racc (1.6.0) - rb-fsevent (0.11.1) - rb-inotify (0.10.1) - ffi (~> 1.0) - rexml (3.2.5) - rouge (3.26.0) - ruby-enum (0.9.0) - i18n - ruby2_keywords (0.0.5) - rubyzip (2.3.2) - safe_yaml (1.0.5) - sass (3.7.4) - sass-listen (~> 4.0.0) - sass-listen (4.0.0) - rb-fsevent (~> 0.9, >= 0.9.4) - rb-inotify (~> 0.9, >= 0.9.7) - sawyer (0.8.2) - addressable (>= 2.3.5) - faraday (> 0.8, < 2.0) - simpleidn (0.2.1) - unf (~> 0.1.4) - terminal-table (1.8.0) - unicode-display_width (~> 1.1, >= 1.1.1) - thread_safe (0.3.6) - typhoeus (1.4.0) - ethon (>= 0.9.0) - tzinfo (1.2.9) - thread_safe (~> 0.1) - tzinfo-data (1.2021.5) - tzinfo (>= 1.0.0) - unf (0.1.4) - unf_ext - unf_ext (0.0.8) - unicode-display_width (1.8.0) - wdm (0.1.1) - zeitwerk (2.5.4) - -PLATFORMS - ruby - -DEPENDENCIES - github-pages - jekyll-feed - jekyll-include-cache - jekyll-paginate - jekyll-remote-theme - minimal-mistakes-jekyll - tzinfo (~> 1.2) - tzinfo-data - wdm (~> 0.1.1) - -BUNDLED WITH - 2.3.8 diff --git a/docs/_layouts/news-home.html b/docs/_layouts/news-home.html deleted file mode 100644 index 8248eed5b5514fa0424618935bd4b3b634f07165..0000000000000000000000000000000000000000 --- a/docs/_layouts/news-home.html +++ /dev/null @@ -1,24 +0,0 @@ ---- -layout: archive ---- - -{{ content }} - - -{% if paginator %} - {% assign posts = paginator.posts %} -{% else %} - {% assign posts = site.posts %} -{% endif %} - - -
-<h3 class="archive__subtitle">{{ site.data.ui-text[site.locale].recent_posts | default: "Recent Posts" }}</h3>
-{% assign news = posts | where: "sneak_preview", "false" %} -{% for post in news %} - {% include archive-single.html %} - {% if post.image %} - - {% endif %} -{% endfor %} - -{% include paginator.html %} diff --git a/docs/_pages/config-json.md b/docs/_pages/config-json.md index 2d497bb1b567f4f815fcdf4cf4f39abefc1cbb55..84f2f833f2128a2437035fcea0f5a1921f60b4b1 100755 --- a/docs/_pages/config-json.md +++ b/docs/_pages/config-json.md @@ -181,7 +181,7 @@ Example of **scheduler** ### Communication options -**communication_data_type**: [boolean] +**communication_data_type**: [string] | Description | Default | | ----------------------------------------------------------------------------------------------------------------------------- | ------- | @@ -250,7 +250,7 @@ Example of **scheduler** | Description | Default | | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ------- | -| **initial_scale_power** is a **fp16** parameter representing the power of the initial dynamic loss scale value. The actual loss scale is computed as 2**initial_scale_power**. | `32` | +| **initial_scale_power** is a **fp16** parameter representing the power of the initial dynamic loss scale value. The actual loss scale is computed as 2**initial_scale_power**. | `16` | **fp16:loss_scale_window**: [integer] @@ -692,7 +692,7 @@ Configuring the asynchronous I/O module for offloading parameter and optimizer s | Description | Default | |---------------------------------------------------------------------------------------------------------------------------| ------- | -| Whether to run autotuing experiments whose results already exist. Setting it to true would overwrite the existing result. | `false` | +| Whether to run autotuning experiments whose results already exist. Setting it to true would overwrite the existing result. | `false` | **metric**: [string] @@ -849,7 +849,7 @@ Configuring the asynchronous I/O module for offloading parameter and optimizer s | Description | Default | | ------------------------------------------------------------- | ------- | -| Inserts torch.cuda.synchronize() at each checkpoint boundary. | `false` | +| Inserts get_accelerator().synchronize() at each checkpoint boundary. | `false` | **profile**: [boolean] diff --git a/docs/_pages/features.md b/docs/_pages/features.md deleted file mode 100644 index 4410f2b1026806a4533087924c898cba172b3f7a..0000000000000000000000000000000000000000 --- a/docs/_pages/features.md +++ /dev/null @@ -1,347 +0,0 @@ ---- -title: "Feature Overview" -layout: single -permalink: /features/ -toc: true -toc_label: "Contents" ---- - -## Distributed Training with Mixed Precision - -### Mixed Precision Training -Enable 16-bit (FP16) training by in the `deepspeed_config` JSON. -```json -"fp16": { - "enabled": true, - "loss_scale": 0, - "loss_scale_window": 1000, - "hysteresis": 2, - "min_loss_scale": 1 -} -``` - -### Single-GPU, Multi-GPU, and Multi-Node Training -Easily switch between single-GPU, single-node multi-GPU, or multi-node multi-GPU -execution by specifying resources with a hostfile. -```bash -deepspeed --hostfile= \ - \ - --deepspeed --deepspeed_config ds_config.json -``` -The script `` will execute on the resources specified in -[``](/getting-started/#resource-configuration-multi-node). 
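As a concrete illustration of the `fp16` settings shown above, the same configuration can be built in Python and dumped to the JSON file passed to the launcher. With the corrected default of `initial_scale_power = 16`, the first dynamic loss scale is 2**16 = 65536. This is a sketch only; `train_batch_size` and the other values are placeholders, not recommendations.

```python
import json

initial_scale_power = 16          # default; first dynamic loss scale = 2**16
print(2 ** initial_scale_power)   # 65536

ds_config = {
    "train_batch_size": 16,       # placeholder value for illustration
    "fp16": {
        "enabled": True,
        "loss_scale": 0,          # 0 selects dynamic loss scaling
        "initial_scale_power": initial_scale_power,
        "loss_scale_window": 1000,
        "hysteresis": 2,
        "min_loss_scale": 1,
    },
}

with open("ds_config.json", "w") as f:
    json.dump(ds_config, f, indent=2)
```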
- -## Pipeline Parallelism -DeepSpeed provides [pipeline parallelism](/tutorials/pipeline/) for memory- -and communication- efficient training. DeepSpeed supports a hybrid -combination of data, model, and pipeline parallelism and has scaled to over -[one trillion parameters using 3D parallelism]({{ site.press_release_v3 }}). -Pipeline parallelism can also improve communication efficiency and has -accelerated training by up to 7x on low-bandwidth clusters. - - -## Model Parallelism -### Support for Custom Model Parallelism -DeepSpeed supports all forms of model parallelism including tensor slicing -based approaches such as the -[Megatron-LM](https://github.com/NVIDIA/Megatron-LM). It does so by only -requiring the model parallelism framework to provide a *model parallelism -unit* (`mpu`) that implements a few bookkeeping functionalities: - -```python -mpu.get_model_parallel_rank() -mpu.get_model_parallel_group() -mpu.get_model_parallel_world_size() - -mpu.get_data_parallel_rank() -mpu.get_data_parallel_group() -mpu.get_data_parallel_world_size() -``` - -### Integration with Megatron-LM -DeepSpeed is fully compatible with [Megatron](https://github.com/NVIDIA/Megatron-LM). -Please see the [Megatron-LM tutorial](/tutorials/megatron/) for details. - - - - -## The Zero Redundancy Optimizer -The Zero Redundancy Optimizer ([ZeRO](https://arxiv.org/abs/1910.02054)) is at -the heart of DeepSpeed and enables large model training at a scale that is -simply not possible with model parallelism alone. When enabled, ZeRO allows -training models with over 13 billion parameters without any model parallelism, -and up to 200 billion parameter models with model parallelism on current -generation hardware. - -For more details see the [ZeRO paper](https://arxiv.org/abs/1910.02054), [GPT -tutorial](/tutorials/megatron/) on integration with -DeepSpeed. - -### Optimizer State and Gradient Partitioning -Optimizer State and Gradient Partitioning in ZeRO reduces the memory consumption of the -model states (optimizer states, gradients and parameters) by 8x compared to standard -data parallelism by partitioning these states across data parallel process instead of -replicating them. - -### Activation Partitioning -Activation Partitioning is a memory optimization in ZeRO that can reduce the memory -consumed by activations during model parallel training (MP). In MP certain -activations maybe required by all MP processes, resulting in a replication of -activations across MP GPUs. Activation Partitioning stores these activations in a -partitioned state once they are used for computation in the forward propagation. These -activations are allgathered right before they are needed again during the backward propagation. -By storing activations in a partitioned state, ZeRO in DeepSpeed can reduce the activation -memory footprint proportional to the MP degree. - -### Constant Buffer Optimization (CBO) -CBO enables high network and memory throughput while restricting memory usage to a -constant size. For memory- and network-bound operations such as normalization or -allreduce collectives, the performance depends on the size of the operand. Simply fusing -all operands into a single large operand can enable great throughput at the expense of -unnecessary memory overhead. CBO in DeepSpeed fuses smaller operands into approximately a -pre-defined sized buffer large enough to achieve great performance without the -unnecessary memory overhead. 
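To make the `mpu` contract listed above concrete, here is a minimal sketch of an object satisfying that interface for the degenerate case of model-parallel size 1 (no tensor slicing). A real framework such as Megatron-LM provides these functions from its own process-group bookkeeping; the class below is purely illustrative.

```python
import torch.distributed as dist


class TrivialMPU:
    """Illustrative mpu with model-parallel size 1: every rank is its own
    model-parallel group and the data-parallel group is the whole world."""

    def __init__(self):
        # new_group must be entered by all processes for every group created,
        # so build one single-rank group per rank and keep the one we belong to.
        groups = [dist.new_group([r]) for r in range(dist.get_world_size())]
        self._mp_group = groups[dist.get_rank()]

    def get_model_parallel_rank(self):
        return 0

    def get_model_parallel_world_size(self):
        return 1

    def get_model_parallel_group(self):
        return self._mp_group

    def get_data_parallel_rank(self):
        return dist.get_rank()

    def get_data_parallel_world_size(self):
        return dist.get_world_size()

    def get_data_parallel_group(self):
        return dist.group.WORLD
```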
- -### Contiguous Memory Optimization (CMO) -CMO reduces memory fragmentation during training, preventing out of memory errors -due to lack of contiguous memory. Memory fragmentation is a result of interleaving between -short lived and long lived memory objects. During the forward propagation activation -checkpoints are long lived but the activations that recomputed are short lived. Similarly, -during the backward computation, the activation gradients are short lived while the parameter -gradients are long lived. CMO transfers activation checkpoints and parameter gradients -to contiguous buffers preventing memory fragmentation. - -## ZeRO-Offload - -ZeRO-Offload pushes the boundary of the maximum model size that can be trained efficiently using minimal GPU resources, by exploiting computational and memory resources on both GPUs and their host CPUs. It allows training up to 13-billion-parameter models on a single NVIDIA V100 GPU, 10x larger than the state-of-the-art, while retaining high training throughput of over 30 teraflops per GPU. - -For more details see the [ZeRO-Offload release blog]( https://www.microsoft.com/en-us/research/?p=689370&secret=iSlooB), and [tutorial](/tutorials/zero-offload/) on integration with DeepSpeed. - -## Additional Memory and Bandwidth Optimizations - -### Smart Gradient Accumulation -Gradient accumulation allows running larger batch size with limited memory by breaking an -effective batch into several sequential micro-batches, and averaging the parameter -gradients across these micro-batches. Furthermore, instead of averaging the gradients of -each micro-batch across all GPUs, the gradients are averaged locally during each step of -the sequence, and a single `allreduce` is done at the end of the sequence to produce the -averaged gradients for the effective batch across all GPUs. This strategy significantly -reduces the communication involved over the approach of averaging globally for each -micro-batch, specially when the number of micro-batches per effective batch is large. - -### Communication Overlapping -During back propagation, DeepSpeed can overlap the communication required for averaging -parameter gradients that have already been computed with the ongoing gradient computation. -This computation-communication overlap allows DeepSpeed to achieve higher throughput even -at modest batch sizes. - -## Training Features - -### Simplified training API -The DeepSpeed core API consists of just a handful of methods: -* initialization: `initialize` -* training: `backward` and `step` -* argument parsing: `add_config_arguments` -* checkpointing : `load_checkpoint` and `store_checkpoint` - -DeepSpeed supports most of the features described in this document, via the use of these API, -along with a `deepspeed_config` JSON file for enabling and disabling the features. -Please see the [core API doc](https://deepspeed.readthedocs.io/) for more details. - -### Activation Checkpointing API - -DeepSpeed's Activation Checkpointing API supports activation checkpoint partitioning, -cpu checkpointing, and contiguous memory optimizations, while also allowing layerwise -profiling. Please see the [core API doc](https://deepspeed.readthedocs.io/) for more details. - - -### Gradient Clipping -```json -{ - "gradient_clipping": 1.0 -} -``` -DeepSpeed handles gradient clipping under the hood based on the max gradient norm -specified by the user. -Please see the [core API doc](https://deepspeed.readthedocs.io/) for more details. 
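The handful of API calls listed above are enough for a complete training step. Below is a self-contained sketch of that loop; the tiny model, the random data, and the config file name are placeholders, and the config is assumed to define an optimizer section so that `step()` has something to apply.

```python
import torch
import deepspeed

# Placeholders for illustration: a tiny model and random regression data.
model = torch.nn.Linear(10, 1)
dataset = torch.utils.data.TensorDataset(torch.randn(256, 10), torch.randn(256, 1))

engine, optimizer, loader, _ = deepspeed.initialize(
    model=model,
    model_parameters=model.parameters(),
    training_data=dataset,
    config="ds_config.json",  # assumed to contain optimizer / fp16 / gradient_clipping sections
)

for x, y in loader:
    x, y = x.to(engine.device), y.to(engine.device)
    loss = torch.nn.functional.mse_loss(engine(x), y)
    engine.backward(loss)   # loss scaling and gradient averaging happen here
    engine.step()           # optimizer step, gradient clipping, LR schedule
```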
- -### Automatic loss scaling with mixed precision -DeepSpeed internally handles loss scaling for mixed precision training. The parameters -for loss scaling can be specified in the `deepspeed_config` JSON file. -Please see the [core API doc](https://deepspeed.readthedocs.io/) for more details. - -## Training Optimizers - -### 1-bit Adam, 0/1 Adam and 1-bit LAMB optimizers with up to 26x less communication - -DeepSpeed has three communication-efficient optimizers called 1-bit Adam, 0/1 Adam and 1-bit LAMB. -They offer the same convergence as Adam/LAMB, incur up to 26x less communication that enables -up to 6.6x higher throughput for BERT-Large pretraining and up to 2.7x higher throughput -for SQuAD fine-tuning on bandwidth-limited clusters. For more details on usage and performance, -please refer to the [1-bit Adam tutorial](https://www.deepspeed.ai/tutorials/onebit-adam), -[1-bit Adam blog post](https://www.deepspeed.ai/news/2020/09/09/onebit-adam-blog-post.md), -[0/1 Adam tutorial](https://www.deepspeed.ai/tutorials/zero-one-adam) -and [1-bit LAMB tutorial](https://www.deepspeed.ai/tutorials/onebit-lamb/). For technical details, -please refer to the [1-bit Adam paper](https://arxiv.org/abs/2102.02888), [0/1 Adam paper](https://arxiv.org/abs/2202.06009) and -[1-bit LAMB paper](https://arxiv.org/abs/2104.06069). - -### Fused Adam optimizer and arbitrary torch.optim.Optimizer -With DeepSpeed, the user can choose to use a high performance implementation of ADAM from -NVIDIA, or any training optimizer that extends torch's `torch.optim.Optimizer` class. - -### CPU-Adam: High-Performance vectorized implementation of Adam -We introduce an efficient implementation of Adam optimizer on CPU that improves the parameter-update -performance by nearly an order of magnitude. We use the AVX SIMD instructions on Intel-x86 architecture -for the CPU-Adam implementation. We support both AVX-512 and AVX-2 instruction sets. DeepSpeed uses -AVX-2 by default which can be switched to AVX-512 by setting the build flag, `DS_BUILD_AVX512` to 1 when -installing DeepSpeed. Using AVX-512, we observe 5.1x to 6.5x speedups considering the model-size between -1 to 10 billion parameters with respect to torch-adam. - -### Memory bandwidth optimized FP16 Optimizer -Mixed precision training is handled by the DeepSpeed FP16 Optimizer. This optimizer not -only handles FP16 training but is also highly efficient. The performance of weight update -is primarily dominated by the memory bandwidth, and the achieved memory bandwidth is -dependent on the size of the input operands. The FP16 Optimizer is designed to maximize -the achievable memory bandwidth by merging all the parameters of the model into a single -large buffer, and applying the weight updates in a single kernel, allowing it to achieve -high memory bandwidth. - -### Large Batch Training with LAMB Optimizer - -DeepSpeed makes it easy to train with large batch sizes by enabling the LAMB Optimizer. -For more details on LAMB, see the [LAMB paper](https://arxiv.org/pdf/1904.00962.pdf). - -### Memory-Efficient Training with ZeRO Optimizer -DeepSpeed can train models with up to 13 billion parameters without model parallelism, and -models with up to 200 billion parameters with 16-way model parallelism. This leap in -model size is possible through the memory efficiency achieved via the ZeRO Optimizer. For -more details see [ZeRO paper](https://arxiv.org/abs/1910.02054) . 
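For reference, the ZeRO behavior described in this section is driven by the `zero_optimization` block of the same `deepspeed_config` JSON. Below is a sketch of a stage-2 configuration expressed as a Python dict for brevity; the specific bucket sizes are illustrative values, not tuned recommendations.

```python
# Illustrative zero_optimization section (stage 2: optimizer state + gradient
# partitioning); merge this into the ds_config JSON used by deepspeed.initialize.
zero_section = {
    "zero_optimization": {
        "stage": 2,                    # 1: optimizer states, 2: + gradients, 3: + parameters
        "overlap_comm": True,          # overlap gradient reduction with backward compute
        "contiguous_gradients": True,  # copy gradients into contiguous buffers to avoid fragmentation
        "reduce_bucket_size": 5e8,     # fixed-size fusion buffer for reductions (CBO)
        "allgather_bucket_size": 5e8,
    }
}
```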
- - - -## Training Agnostic Checkpointing -DeepSpeed can simplify checkpointing for you regardless of whether you are using data -parallel training, model parallel training, mixed-precision training, a mix of these -three, or using the zero optimizer to enable larger model sizes. -Please see the [Getting Started](/getting-started/) guide -and the [core API doc](https://deepspeed.readthedocs.io/) for more details. - -## Advanced parameter search -DeepSpeed supports multiple Learning Rate Schedules to enable faster convergence for -large batch scaling. - -### Learning Rate Range Test -Please refer to the [Learning Rate Range Test](/tutorials/lrrt/) tutorial. - -### 1Cycle Learning Rate Schedule -Please refer to the [1Cycle Learning Rate Schedule](/tutorials/1Cycle/) tutorial. - - -## Simplified Data Loader -DeepSpeed abstracts away data parallelism and model parallelism from the user when it -comes to data loading. Users simply provide a PyTorch dataset, and DeepSpeed data loader -can automatically handle batch creation appropriately. - -## Curriculum Learning -Please refer to the [Curriculum Learning](/tutorials/curriculum-learning/) tutorial. - -## Performance Analysis and Debugging - -DeepSpeed provides a set of tools for performance analysis and debugging. - -### Wall Clock Breakdown - -DeepSpeed provides a detailed breakdown of the time spent -in different parts of the training. -This can be enabled by setting the following in the `deepspeed_config` file. - -```json -{ - "wall_clock_breakdown": true, -} - -``` - -### Timing Activation Checkpoint Functions - -When activation checkpointing is enabled, profiling the forward and backward time of each checkpoint function can be enabled in the `deepspeed_config` file. - -```json -{ - "activation_checkpointing": { - "profile": true - } -} - -``` - -### Flops Profiler - -The DeepSpeed flops profiler measures the time, flops and parameters of a PyTorch model and shows which modules or layers are the bottleneck. When used with the DeepSpeed runtime, the flops profiler can be configured in the `deepspeed_config` file as follows: - -```json -{ - "flops_profiler": { - "enabled": true, - "profile_step": 1, - "module_depth": -1, - "top_modules": 3, - "detailed": true, - } -} - -``` -The flops profiler can also be used as a standalone package. Please refer to the [Flops Profiler](/tutorials/flops-profiler) tutorial for more details. - - -### Autotuning - -The DeepSpeed Autotuner uses model information, system information, and heuristics to efficiently tune Zero stage, micro batch size, and other Zero configurations. Using the autotuning feature requires no code change from DeepSpeed users. While `"autotuning": {"enabled": true}` is the minimal required to enable auotuning, there are other parameters users can define to configure the autotuning process. Below shows major parameters and their default values in the autotuning configuration. Please refer to the [Autotuning](/tutorials/autotuning) tutorial for more details. - -```json -{ - "autotuning": { - "enabled": true, - "results_dir": null, - "exps_dir": null, - "overwrite": false, - "metric": "throughput", - "num_nodes": null, - "num_gpus": null, - "start_profile_step": 3, - "end_profile_step": 5, - "fast": true, - "num_tuning_micro_batch_sizes": 3, - "tuner_type": "model_based", - "tuner_early_stopping": 5, - "tuner_num_trials": 50, - "arg_mappings": null - } -} - -``` -The flops profiler can also be used as a standalone package. 
Please refer to the [Flops Profiler](/tutorials/flops-profiler) tutorial for more details. - - -## Sparse Attention -DeepSpeed offers sparse attention to support long sequences. Please refer to the [Sparse Attention](/tutorials/sparse-attention/) tutorial. - -```bash ---deepspeed_sparse_attention -``` - -```json -"sparse_attention": { - "mode": "fixed", - "block": 16, - "different_layout_per_head": true, - "num_local_blocks": 4, - "num_global_blocks": 1, - "attention": "bidirectional", - "horizontal_global_attention": false, - "num_different_global_patterns": 4 -} -``` - -## Mixture of Experts (MoE) -To learn more about training Mixture of Experts (MoE) models with DeepSpeed, see our [tutorial](https://www.deepspeed.ai/tutorials/mixture-of-experts/) for more details. diff --git a/docs/_pages/training.md b/docs/_pages/training.md index 466800a3d9870a53fb593ecb233790c4fdf52df6..e5eee86564d3ffcdd5feb002b412c108071d77b3 100644 --- a/docs/_pages/training.md +++ b/docs/_pages/training.md @@ -364,7 +364,7 @@ They offer the same convergence as Adam/LAMB, incur up to 26x less communication up to 6.6x higher throughput for BERT-Large pretraining and up to 2.7x higher throughput for SQuAD fine-tuning on bandwidth-limited clusters. For more details on usage and performance, please refer to the [1-bit Adam tutorial](https://www.deepspeed.ai/tutorials/onebit-adam), -[1-bit Adam blog post](https://www.deepspeed.ai/news/2020/09/09/onebit-adam-blog-post.md), +[1-bit Adam blog post](https://www.deepspeed.ai/2020/09/08/onebit-adam-blog-post.html), [0/1 Adam tutorial](https://www.deepspeed.ai/tutorials/zero-one-adam) and [1-bit LAMB tutorial](https://www.deepspeed.ai/tutorials/onebit-lamb/). For technical details, please refer to the [1-bit Adam paper](https://arxiv.org/abs/2102.02888), [0/1 Adam paper](https://arxiv.org/abs/2202.06009) and diff --git a/docs/_posts/2020-02-13-release.md b/docs/_posts/2020-02-13-release.md index 792ff7bfee674c07a5827bfb1b5d956542dc51d1..a97a4ba9ccf1b713f6ec25c57ae48b422a70a675 100644 --- a/docs/_posts/2020-02-13-release.md +++ b/docs/_posts/2020-02-13-release.md @@ -3,5 +3,5 @@ title: "ZeRO & DeepSpeed: New system optimizations enable training models with o date: 2020-02-13 link: https://www.microsoft.com/en-us/research/blog/ZeRO-deepspeed-new-system-optimizations-enable-training-models-with-over-100-billion-parameters/ excerpt: "" -tags: training ZeRO +tags: training ZeRO English --- diff --git a/docs/_posts/2020-02-13-turing-nlg.md b/docs/_posts/2020-02-13-turing-nlg.md index 0da59aa8fee3558d5d93815c5effe4d7a0db5a76..240f6d78ad022727f7ad6bfb9d5693cc0cde3e7b 100644 --- a/docs/_posts/2020-02-13-turing-nlg.md +++ b/docs/_posts/2020-02-13-turing-nlg.md @@ -3,5 +3,5 @@ title: "Turing-NLG: A 17-billion-parameter language model by Microsoft" date: 2020-02-13 link: https://www.microsoft.com/en-us/research/blog/turing-nlg-a-17-billion-parameter-language-model-by-microsoft/ excerpt: "DeepSpeed was used to train the world's largest language model." -tags: training +tags: training English --- diff --git a/docs/_posts/2020-03-17-reduce-scatter.md b/docs/_posts/2020-03-17-reduce-scatter.md index 1753a22e3aa74a8757fe525199d831e98fa8392a..329409dfefabb6555f6622d5dc0f8ea7170e88dd 100644 --- a/docs/_posts/2020-03-17-reduce-scatter.md +++ b/docs/_posts/2020-03-17-reduce-scatter.md @@ -1,6 +1,7 @@ --- title: "ZeRO stage 1 with reduced communication" sneak_preview: true +tags: training ZeRO English excerpt: "Partition-aware ZeRO with up to 2x reduction in communication time!" 
--- diff --git a/docs/_posts/2020-05-19-bert-record.md b/docs/_posts/2020-05-19-bert-record.md index 93d0c9ce34bd6033efeaebcaf30dc9751952987a..b47ad0b0beaff409ff1469bee981469b5b40ad7b 100644 --- a/docs/_posts/2020-05-19-bert-record.md +++ b/docs/_posts/2020-05-19-bert-record.md @@ -1,10 +1,9 @@ --- title: "The Fastest and Most Efficient BERT Training through Optimized Transformer Kernels" excerpt: "" -tags: training date: 2020-05-19 00:00:00 toc: false -tags: training +tags: training English --- We introduce new technology to accelerate single GPU performance via kernel @@ -18,6 +17,6 @@ NVIDIA V100 GPUs**, compared with the best published result of 67 minutes on the same number and generation of GPUs. * Brief overview, see our [press release](https://www.microsoft.com/en-us/research/blog/ZeRO-2-deepspeed-shattering-barriers-of-deep-learning-speed-scale/). -* Detailed technology deep dive, see our [blog post](https://www.deepspeed.ai/news/2020/05/27/fastest-bert-training.html). +* Detailed technology deep dive, see our [blog post](https://www.deepspeed.ai/2020/05/27/fastest-bert-training.html). * Tutorial on how to reproduce our results, see our [BERT pre-training tutorial](https://www.deepspeed.ai/tutorials/bert-pretraining/). * The source code for our transformer kernels can be found in the [DeepSpeed repo](https://github.com/microsoft/deepspeed) and BERT pre-training code can be found in the [DeepSpeedExamples repo](https://github.com/microsoft/deepspeedexamples). diff --git a/docs/_posts/2020-05-19-press-release.md b/docs/_posts/2020-05-19-press-release.md index 9022a7db40c5d129835cc45e6b7fec876d2df547..a6611b11cb5957a394348eb601faeb1456b7a16b 100644 --- a/docs/_posts/2020-05-19-press-release.md +++ b/docs/_posts/2020-05-19-press-release.md @@ -2,6 +2,6 @@ title: "ZeRO-2 & DeepSpeed: Shattering Barriers of Deep Learning Speed & Scale" excerpt: "" link: https://www.microsoft.com/en-us/research/blog/ZeRO-2-deepspeed-shattering-barriers-of-deep-learning-speed-scale/ -tags: training ZeRO +tags: training ZeRO English date: 2020-05-19 02:00:00 --- diff --git a/docs/_posts/2020-05-19-zero-stage2.md b/docs/_posts/2020-05-19-zero-stage2.md index 4f35012d9aae04c4691f7ef5b81b1a1e0085b204..44f6cc194bc21f4203c2ce672fdf405ec383da65 100644 --- a/docs/_posts/2020-05-19-zero-stage2.md +++ b/docs/_posts/2020-05-19-zero-stage2.md @@ -1,7 +1,7 @@ --- title: "An Order-of-Magnitude Larger and Faster Training with ZeRO-2" excerpt: "" -tags: training ZeRO +tags: training ZeRO English date: 2020-05-19 01:00:00 toc: false --- diff --git a/docs/_posts/2020-05-28-fastest-bert-training.md b/docs/_posts/2020-05-28-fastest-bert-training.md index 99d132c1e53dbfb9112085810867eec233814351..62be6c1bffcea81b13f2d3694742e92522167317 100644 --- a/docs/_posts/2020-05-28-fastest-bert-training.md +++ b/docs/_posts/2020-05-28-fastest-bert-training.md @@ -1,7 +1,7 @@ --- title: "Microsoft DeepSpeed achieves the fastest BERT training time" excerpt: "" -tags: training +tags: training English date: 2020-05-28 00:00:00 --- diff --git a/docs/_posts/2020-07-24-deepspeed-webinar.md b/docs/_posts/2020-07-24-deepspeed-webinar.md index be4ee777ed617ec7b3e9464907221482a0026533..a5b4aa15bef5441f99e5add88e60ecda7a86746f 100644 --- a/docs/_posts/2020-07-24-deepspeed-webinar.md +++ b/docs/_posts/2020-07-24-deepspeed-webinar.md @@ -1,7 +1,7 @@ --- title: "DeepSpeed Microsoft Research Webinar on August 6th, 2020" excerpt: "" -tags: presentations +tags: presentations English link: 
https://note.microsoft.com/MSR-Webinar-DeepSpeed-Registration-On-Demand.html image: /assets/images/webinar-aug2020.png date: 2020-07-24 00:00:00 diff --git a/docs/_posts/2020-08-07-webinar-on-demand.md b/docs/_posts/2020-08-07-webinar-on-demand.md index 983e17eca36bbd03a6826756119d709a5e0f5236..8b258e88a9b2167738a0453dd73a2491dcb3281f 100644 --- a/docs/_posts/2020-08-07-webinar-on-demand.md +++ b/docs/_posts/2020-08-07-webinar-on-demand.md @@ -1,7 +1,7 @@ --- title: "DeepSpeed Microsoft Research Webinar is now on-demand" excerpt: "" -tags: presentations +tags: presentations English link: https://note.microsoft.com/MSR-Webinar-DeepSpeed-Registration-On-Demand.html date: 2020-08-07 00:00:00 --- diff --git a/docs/_posts/2020-09-08-sparse-attention-news.md b/docs/_posts/2020-09-08-sparse-attention-news.md index 4c37054a73c1ee65f425c0434c89890910823abe..79de33a82e3a964e5fc8c7622c0d20f981d3c86e 100644 --- a/docs/_posts/2020-09-08-sparse-attention-news.md +++ b/docs/_posts/2020-09-08-sparse-attention-news.md @@ -1,7 +1,7 @@ --- title: "Powering 10x longer sequences and 6x faster execution through DeepSpeed Sparse Attention" excerpt: "" -tags: training +tags: training English date: 2020-09-09 00:00:00 toc: false --- @@ -9,6 +9,6 @@ toc: false DeepSpeed offers sparse attention kernels, an instrumental technology to support long sequences of model inputs, whether for text, image, or sound. Compared with the classic dense Transformers, it powers an order-of-magnitude longer input sequence and obtains up to 6x faster execution with comparable accuracy. It also outperforms state-of-the-art sparse implementations with 1.5-3x faster execution. Furthermore, our sparse kernels support efficient execution of flexible sparse format and empower users to innovate on their custom sparse structures. * Brief overview, see our [press release]({{ site.press_release_v3 }}). -* Detailed technology deep dive, see our [blog post](https://www.deepspeed.ai/news/2020/09/08/sparse-attention.html). +* Detailed technology deep dive, see our [blog post](https://www.deepspeed.ai/2020/09/08/sparse-attention.html). * Tutorial on how to use sparse attention, see our [Sparse attention tutorial](https://www.deepspeed.ai/tutorials/sparse-attention/). * The source code for our sparse attention kernels can be found in the [DeepSpeed repo](https://github.com/microsoft/deepspeed) and BERT pre-training code using sparse attention can be found in the [DeepSpeedExamples repo](https://github.com/microsoft/deepspeedexamples). diff --git a/docs/_posts/2020-09-09-ZeRO-Offload.md b/docs/_posts/2020-09-09-ZeRO-Offload.md index c270ceadf3817d94a4a423c497868453c0c48d06..8e2e8423fd5560a6efb31228c4e6f8154cdd1e50 100755 --- a/docs/_posts/2020-09-09-ZeRO-Offload.md +++ b/docs/_posts/2020-09-09-ZeRO-Offload.md @@ -2,7 +2,7 @@ title: "10x bigger model training on a single GPU with ZeRO-Offload" excerpt: "" date: 2020-09-09 00:00:00 -tags: training ZeRO +tags: training ZeRO English toc: false --- diff --git a/docs/_posts/2020-09-09-onebit-adam-blog-post.md b/docs/_posts/2020-09-09-onebit-adam-blog-post.md index 413a3d0c1afbe4883086af4e66a87a93e0bf648d..8152190f24d096fb20d53d9c97d0ece744852339 100644 --- a/docs/_posts/2020-09-09-onebit-adam-blog-post.md +++ b/docs/_posts/2020-09-09-onebit-adam-blog-post.md @@ -2,7 +2,7 @@ title: "DeepSpeed with 1-bit Adam: 5x less communication and 3.4x faster training" excerpt: "" date: 2020-09-09 00:00:00 -tags: training +tags: training English --- ## 1. 
Introduction diff --git a/docs/_posts/2020-09-09-onebit-adam-news.md b/docs/_posts/2020-09-09-onebit-adam-news.md index 8873f58ca01a0760516ef96052301c465928ba11..d0adcb09987f2b5d9e5ccec011b45ed1a34a3cc6 100644 --- a/docs/_posts/2020-09-09-onebit-adam-news.md +++ b/docs/_posts/2020-09-09-onebit-adam-news.md @@ -2,7 +2,7 @@ title: "Up to 5x less communication and 3.4x faster training through 1-bit Adam" excerpt: "" date: 2020-09-09 00:00:00 -tags: training +tags: training English toc: false --- @@ -15,6 +15,6 @@ across distributed devices. We introduce a new algorithm - 1-bit Adam - and its efficient implementation in DeepSpeed. 1-bit Adam offers the ***same convergence*** as Adam, incurs up to ***5x less communication*** that enables up to ***3.5x higher throughput for BERT-Large pretraining*** and up to ***2.7x higher throughput for SQuAD fine-tuning*** on bandwidth-limited clusters. * Brief overview, see our [press release]({{ site.press_release_v3 }}). -* Detailed technology deep dive, see our [blog post](https://www.deepspeed.ai/news/2020/09/08/onebit-adam-blog-post.html). +* Detailed technology deep dive, see our [blog post](https://www.deepspeed.ai/2020/09/08/onebit-adam-blog-post.html). * Tutorial on how to reproduce our results, see our [1-bit Adam tutorial](/tutorials/onebit-adam/). * The source code for 1-bit Adam can be found in the [DeepSpeed repo](https://github.com/microsoft/deepspeed). The implementation of 1-bit Adam is in [onebit_adam.py](https://github.com/microsoft/DeepSpeed/blob/master/deepspeed/runtime/fp16/onebit_adam.py) and CUDA-Aware communication for 1-bit Adam is in [custom_collectives.py](https://github.com/microsoft/DeepSpeed/blob/master/deepspeed/runtime/custom_collectives.py). Example codes to try this feature can be found in the [DeepSpeedExamples repo](https://github.com/microsoft/deepspeedexamples) as shown in the [tutorial](/tutorials/onebit-adam/). diff --git a/docs/_posts/2020-09-09-pipeline-parallelism.md b/docs/_posts/2020-09-09-pipeline-parallelism.md index 4f2e53ed80ee9f5478b52741708413ca70c94074..48343ebd8d1e90e8da12eeae1cebe28359f778ea 100644 --- a/docs/_posts/2020-09-09-pipeline-parallelism.md +++ b/docs/_posts/2020-09-09-pipeline-parallelism.md @@ -2,7 +2,7 @@ title: "Training a Trillion Parameters with Pipeline Parallelism" excerpt: "" date: 2020-09-09 00:00:00 -tags: training +tags: training English --- DeepSpeed includes new support for pipeline parallelism! DeepSpeed's training diff --git a/docs/_posts/2020-09-09-sparse-attention.md b/docs/_posts/2020-09-09-sparse-attention.md index aa0fa0bb60d48d5bd260dbad63e2f15c95da8619..9675ef1058dd90538fe044cc364264f2576af8f2 100644 --- a/docs/_posts/2020-09-09-sparse-attention.md +++ b/docs/_posts/2020-09-09-sparse-attention.md @@ -2,10 +2,10 @@ title: "DeepSpeed Sparse Attention" excerpt: "" date: 2020-09-09 01:00:00 -tags: training inference +tags: training inference English --- -Attention-based deep learning models such as the transformers are highly effective in capturing relationship between tokens in an input sequence, even across long distances. As a result, they are used with text, image, and sound-based inputs, where the sequence length can be in thousands of tokens. However, despite the effectiveness of attention modules to capture long term dependencies, in practice, their application to long sequence input is limited by compute and memory requirements of the attention computation that grow quadratically, `O(n^2)`, with the sequence length `n`. 
+Attention-based deep learning models such as the transformers are highly effective in capturing the relationship between tokens in an input sequence, even across long distances. As a result, they are used with text, image, and sound-based inputs, where the sequence length can be in thousands of tokens. However, despite the effectiveness of attention modules to capture long term dependencies, in practice, their application to long sequence input is limited by compute and memory requirements of the attention computation that grow quadratically, `O(n^2)`, with the sequence length `n`. To address this limitation, DeepSpeed offers a suite of sparse attention kernels --an instrumental technology that can reduce the compute and memory requirement of attention computation by orders-of-magnitude via block-sparse computation. The suite not only alleviates the memory bottleneck of attention calculation, but also performs sparse computation efficiently. Its APIs allow convenient integration with any transformer-based models. Along with providing a wide spectrum of sparsity structures, it has the flexibility of handling any user-defined block-sparse structures. More specifically, sparse attention (SA) can be designed to compute local attention between nearby tokens, or global attention via summary tokens computed with local attention. Moreover, SA can also allow random attention, or any combination of local, global, and random attention as shown in the following figure with blue, orange, and green blocks, respectively. As a result, SA decreases the memory footprint to `O(wn)`, in which `1 < w < n` is a parameter, whose value depends on the attention structure. @@ -27,7 +27,7 @@ In a pre-training experiment, we ran BERT model under three settings: dense, den ![Maximum sequence runnable on BERT](/assets/images/sa_maximum_sequence_runnable_on_bert.png){: .align-center} -* **up to 6.3x faster computation** +* **Up to 6.3x faster computation** We continued the pre-training experiment for different batch sizes and sequence lengths, using [BERT base/large](https://github.com/microsoft/DeepSpeedExamples/tree/master/bing_bert) and [Megatron GPT2](https://github.com/microsoft/DeepSpeedExamples/tree/master/Megatron-LM). In this experiment we let the training to continue for 100 iteration and recorded the average time per last 30 iterations. SA reduces total computation comparing with dense and improves training speed: the boost is higher with increased sequence length and it is up to 6.3x faster for BERT base, 5.3x for BERT large, and 6.1x for GPT2. Following charts show these results. ![Training time for BERT base with varying sequence length](/assets/images/sa_bert_base_time_result.png){: .align-center} @@ -36,14 +36,14 @@ We continued the pre-training experiment for different batch sizes and sequence ![Training time for GPT2 with varying sequence length](/assets/images/sa_gpt2_time_result.png){: .align-center} -* **higher accuracy** +* **Higher accuracy** Related works along the line of sparse attention ([Sparse Transformer](https://arxiv.org/pdf/1904.10509.pdf), [Longformer](https://arxiv.org/pdf/2004.05150.pdf), [BigBird](https://arxiv.org/pdf/2007.14062.pdf)) have shown comparable or higher accuracy than full attention. Our experience is well aligned. In addition to lower memory overhead and faster computation, we also observe cases in production where SA reaches higher accuracy and faster convergence. 
The following chart illustrates accuracy of training a production model based on BERT for long document comprehension (2,048 sequence length). The experiment is performed in three settings: dense starting from scratch, SA starting from scratch, and SA continued training from a checkpoint of using dense with sequence length of 512. We have observed that, for pre-training from scratch, SA converges faster with higher accuracy comparing with dense. Furthermore, SA continuing training from a pre-trained checkpoint performs even better, with respect to both time and accuracy. ![Accuracy of long document comprehension application](/assets/images/sa_long_document_comprehension_result.png){: .align-center} -* **comparison with state of the art, Longformer** +* **Comparison with state of the art, Longformer** We compared SA with Longformer, a state-of-the-art sparse structure and implementation. In our experiment, SA uses `Fixed` sparsity, and two implementations have comparable accuracy. On system performance, SA outperforms Longformer both in training and inference: * **1.47x** faster execution pre-training MLM on Wikitext103 We ran an experiment following the [notebook](https://github.com/allenai/longformer/blob/master/scripts/convert_model_to_long.ipynb) offered by Longformer. In this experiment, we pre-train an MLM model using RoBERTa-base checkpoint. This is done on 8 V100-SXM2 GPU. Following table shows the details of the result in which using DeepSpeed Sparse Attention shows 1.47x speed up. @@ -73,7 +73,7 @@ Through our Long Document Comprehension application we described above, we also |32 |1.24 | |16 |1.23 | -* **flexibility to handle any block-sparse structure** +* **Flexibility to handle any block-sparse structure** DeepSpeed Sparse Attention suite does not target at any specific sparse structure but enables model scientists to explore any block sparse structure with efficient system support. Currently, we have added popular sparse structure like: * [Fixed](https://arxiv.org/pdf/1904.10509.pdf) (from OpenAI Sparse Transformer) * [BigBird](https://arxiv.org/pdf/2007.14062.pdf) (from Google) diff --git a/docs/_posts/2020-10-28-progressive-layer-dropping-news.md b/docs/_posts/2020-10-28-progressive-layer-dropping-news.md index 9664e4de94e75d0dbc0c56cb3490d01971732999..ee518f53f012a551697501d3d6272e75644a3554 100755 --- a/docs/_posts/2020-10-28-progressive-layer-dropping-news.md +++ b/docs/_posts/2020-10-28-progressive-layer-dropping-news.md @@ -2,7 +2,7 @@ title: "Progressive Layer Dropping" excerpt: "" date: 2020-10-29 00:00:00 -tags: training +tags: training English toc: false --- diff --git a/docs/_posts/2021-03-08-zero3-offload.md b/docs/_posts/2021-03-08-zero3-offload.md index 9008ebc9f6faf88b1a3a7f5282b912e338dbccda..2bca2bdd826aa335716d7fd62d93baf6dc8b7e21 100644 --- a/docs/_posts/2021-03-08-zero3-offload.md +++ b/docs/_posts/2021-03-08-zero3-offload.md @@ -2,7 +2,7 @@ title: "DeepSpeed ZeRO-3 Offload" excerpt: "" date: 2021-03-08 00:00:00 -tags: training ZeRO +tags: training ZeRO English --- Today we are announcing the release of ZeRO-3 Offload, a highly efficient and easy to use implementation of ZeRO Stage 3 and ZeRO Offload combined, geared towards our continued goal of democratizing AI by making efficient large-scale DL training available to everyone. 
The key benefits of ZeRO-3 Offload are: diff --git a/docs/_posts/2021-05-05-MoQ.md b/docs/_posts/2021-05-05-MoQ.md index e6f7872a40079091f1017294b7ce6244735132ca..5dd5006e886ff497b0eeab7e118eecd5a13a9400 100644 --- a/docs/_posts/2021-05-05-MoQ.md +++ b/docs/_posts/2021-05-05-MoQ.md @@ -2,7 +2,7 @@ title: "Mixture-of-Quantization: A novel quantization approach for reducing model size with minimal accuracy impact" excerpt: "" date: 2021-05-05 00:00:00 -tags: inference +tags: inference English --- ## A unified suite for quantization-aware training and inference diff --git a/docs/_posts/2021-05-05-inference-kernel-optimization.md b/docs/_posts/2021-05-05-inference-kernel-optimization.md index 63e3ac669e22be1180d7dfea31a7a95f605a9df5..991295de975944e98cc2cbe80d5d4cbb7eebf578 100644 --- a/docs/_posts/2021-05-05-inference-kernel-optimization.md +++ b/docs/_posts/2021-05-05-inference-kernel-optimization.md @@ -2,7 +2,7 @@ title: "DeepSpeed Inference: Multi-GPU inference with customized inference kernels and quantization support" excerpt: "" date: 2021-03-16 00:00:00 -tags: inference +tags: inference English --- While DeepSpeed supports training advanced large-scale models, using these trained models in the desired application scenarios is still challenging due to three major limitations in existing inference solutions: 1) lack of support for multi-GPU inference to fit large models and meet latency requirements, 2) limited GPU kernel performance when running inference with small batch sizes, and 3) difficulties in exploiting quantization, which includes both quantizing the model to reduce the model size and latency as well as supporting high-performance inference of quantized models without specialized hardware. diff --git a/docs/_posts/2021-05-14-inference-release.md b/docs/_posts/2021-05-14-inference-release.md index fd5cca2e0259051bebb9a59929343707429609d9..14c300d0bc9f4da71f5aa559b5cfc47480e56a10 100644 --- a/docs/_posts/2021-05-14-inference-release.md +++ b/docs/_posts/2021-05-14-inference-release.md @@ -3,5 +3,5 @@ title: "DeepSpeed: Accelerating large-scale model inference and training via sys date: 2021-05-14 link: https://www.microsoft.com/en-us/research/blog/deepspeed-accelerating-large-scale-model-inference-and-training-via-system-optimizations-and-compression/ excerpt: "" -tags: inference +tags: inference English --- diff --git a/docs/_posts/2021-08-18-deepspeed-moe.md b/docs/_posts/2021-08-18-deepspeed-moe.md index 5bd9667f2a7f44ad961e8add7353da84b8baf005..665c09751b553c4a9e802a9f9b6479ab269d93fd 100644 --- a/docs/_posts/2021-08-18-deepspeed-moe.md +++ b/docs/_posts/2021-08-18-deepspeed-moe.md @@ -3,5 +3,5 @@ title: "DeepSpeed powers 8x larger MoE model training with high performance" excerpt: "" link: https://www.microsoft.com/en-us/research/blog/deepspeed-powers-8x-larger-moe-model-training-with-high-performance/ date: 2021-08-18 00:00:00 -tags: training +tags: training English --- diff --git a/docs/_posts/2021-11-15-autotuning.md b/docs/_posts/2021-11-15-autotuning.md index ee48d44c5bdf1fc7f0947bcdbb52914ab41ebb7b..71acf54438ea4feae0107c22685542c0386117f6 100644 --- a/docs/_posts/2021-11-15-autotuning.md +++ b/docs/_posts/2021-11-15-autotuning.md @@ -2,7 +2,7 @@ title: "Autotuning: Automatically discover the optimal DeepSpeed configuration that delivers good training speed" excerpt: "" date: 2021-11-16 10:00:00 -tags: training +tags: training English toc: false --- diff --git a/docs/_posts/2021-12-09-deepspeed-moe-nlg.md b/docs/_posts/2021-12-09-deepspeed-moe-nlg.md index 
6402202cca3b60954a2abb6dc67ab9b4251966c9..40e549d88c2ff710d33e020983a957a467fd7cea 100644 --- a/docs/_posts/2021-12-09-deepspeed-moe-nlg.md +++ b/docs/_posts/2021-12-09-deepspeed-moe-nlg.md @@ -2,7 +2,7 @@ title: "DeepSpeed-MoE for NLG: Reducing the training cost of language models by 5 times" excerpt: "" date: 2021-12-09 22:00:00 -tags: training +tags: training English --- Autoregressive transformer-based natural language generation (referred to as diff --git a/docs/_posts/2022-01-19-moe-inference.md b/docs/_posts/2022-01-19-moe-inference.md index f2ac1c6de2e1bd169d4f253dbc356677662fb285..66ff5b51ad2d832c024d90c04d0f818b3d106a11 100644 --- a/docs/_posts/2022-01-19-moe-inference.md +++ b/docs/_posts/2022-01-19-moe-inference.md @@ -3,5 +3,5 @@ title: "DeepSpeed: Advancing MoE inference and training to power next-generation excerpt: "" link: https://www.microsoft.com/en-us/research/blog/deepspeed-advancing-moe-inference-and-training-to-power-next-generation-ai-scale/ date: 2022-01-19 00:00:00 -tags: inference +tags: inference English --- diff --git a/docs/_posts/2022-03-21-amd-support.md b/docs/_posts/2022-03-21-amd-support.md index ba8917bc386a80ba1fec6a83036d36fe87babb9e..01b2a52c7ca458f13c7fd0109573d8e6c60ead15 100644 --- a/docs/_posts/2022-03-21-amd-support.md +++ b/docs/_posts/2022-03-21-amd-support.md @@ -3,5 +3,5 @@ title: "Supporting efficient large model training on AMD Instinct GPUs with Deep excerpt: "" link: https://cloudblogs.microsoft.com/opensource/2022/03/21/supporting-efficient-large-model-training-on-amd-instinct-gpus-with-deepspeed/ date: 2022-03-21 00:00:00 -tags: training ZeRO +tags: training ZeRO English --- diff --git a/docs/_posts/2022-07-26-deepspeed-azure.md b/docs/_posts/2022-07-26-deepspeed-azure.md index 128cbf4a416ecba88ac9269f2ce4dd073883b009..51e306693cfe66b41f3df0250f5d41594298d211 100644 --- a/docs/_posts/2022-07-26-deepspeed-azure.md +++ b/docs/_posts/2022-07-26-deepspeed-azure.md @@ -2,7 +2,7 @@ title: "Azure empowers easy-to-use, high-performance, and hyperscale model training using DeepSpeed" excerpt: "" date: 2022-07-26 00:09:00 -tags: training azure +tags: training azure English --- ## Introduction diff --git a/docs/_posts/2022-09-10-zero-inference.md b/docs/_posts/2022-09-10-zero-inference.md index dd718b9f883906291326a18a18ad7ec54529f6dd..59a3e3bf15fa4e399f0ce16f6cde92d29bdd690a 100644 --- a/docs/_posts/2022-09-10-zero-inference.md +++ b/docs/_posts/2022-09-10-zero-inference.md @@ -2,7 +2,7 @@ title: "ZeRO-Inference: Democratizing massive model inference" excerpt: "" date: 2022-09-10 00:09:00 -tags: inference ZeRO +tags: inference ZeRO English --- ## Introduction diff --git a/docs/_posts/2022-10-11-mii.md b/docs/_posts/2022-10-11-mii.md index 8a39731759656687fafd7aaf76b9464bccca757d..e0b43f51b1e0a16a5ab196eee599035924ccd56d 100644 --- a/docs/_posts/2022-10-11-mii.md +++ b/docs/_posts/2022-10-11-mii.md @@ -2,7 +2,7 @@ title: "DeepSpeed-MII: instant speedup on 24,000+ open-source DL models with up to 40x cheaper inference" excerpt: "" date: 2022-10-11 00:09:00 -tags: inference +tags: inference English --- [ ![Text Generation Models](/assets/images/mii/hero.png) ](/assets/images/mii/hero.png){: .align-center} diff --git a/docs/_posts/2022-12-12-data-efficiency.md b/docs/_posts/2022-12-12-data-efficiency.md index 3b6adb4d7dab957bd45437da8b00030945bde708..52148707b7673011356c5105edb6a66ee220c564 100644 --- a/docs/_posts/2022-12-12-data-efficiency.md +++ b/docs/_posts/2022-12-12-data-efficiency.md @@ -2,7 +2,7 @@ title: "DeepSpeed Data Efficiency: A 
composable library that makes better use of data, increases training efficiency, and improves model quality" excerpt: "" date: 2022-12-12 00:09:00 -tags: training +tags: training English --- [ ![DeepSpeed Data Efficiency](/assets/images/data_efficiency/data_efficiecy_fig0.png) ](/assets/images/data_efficiency/data_efficiecy_fig0.png){: .align-center} diff --git a/docs/_posts/2023-03-31-multi-modal.md b/docs/_posts/2023-03-31-multi-modal.md new file mode 100644 index 0000000000000000000000000000000000000000..aaef9cfbfd2a19b66d8681eddef1713244c73d3e --- /dev/null +++ b/docs/_posts/2023-03-31-multi-modal.md @@ -0,0 +1,37 @@ +--- +title: "Scaling Large-Scale Generative Mixture-of-Expert Multimodal Model With VL-MoE" +excerpt: "" +date: 2023-03-31 00:09:00 +tags: training English +--- + +The field of Artificial Intelligence-Generated Content (AIGC) is rapidly growing, with the goal of making content creation more efficient and accessible. One of the most exciting areas of AIGC is the development of large-scale multi-modal models like [Flamingo](https://arxiv.org/abs/2204.14198), [BLIP](https://arxiv.org/abs/2301.12597), and [GPT4](https://arxiv.org/abs/2303.08774), which can accept inputs from multiple sources, e.g., image, text, and audio, and generate outputs in a variety of formats. For example, image creation can be done with stable diffusion and DALLE from a text prompt, and the new feature in the upcoming Office can create slides with text, images, animations, and more by leveraging the power of the new Microsoft Office Copilot. + +Scaling up the model size is one common approach to boosting the usability and capability of AIGC tasks. However, simply scaling up dense architectures (e.g., from GPT-1 to GPT-3) is usually extremely resource-intensive and time-consuming for both model training and inference. One effective way to tackle this challenge is to apply mixture of experts (MoE). In particular, recent [text-based MoE](https://arxiv.org/abs/2201.05596) and [vision-based MoE](https://arxiv.org/abs/2106.05974) studies have demonstrated that MoE models can significantly reduce the training and resource cost compared to a quality-equivalent dense model, or produce a higher quality model under the same training budget. Up to now, the effectiveness of jointly training MoE for multi-modal models has remained poorly understood. To explore this important capability, the [DeepSpeed team](https://www.deepspeed.ai/) is proud to announce our first large-scale generative mixture-of-expert (MoE) multimodal model, named [VL-MoE](https://arxiv.org/abs/2303.07226). + +[ ![Model architecture](/assets/images/vl_moe.png) ](/assets/images/vl_moe.png){: .align-center} + +*Figure 1: The new encoding process in our VL-MoE for various modality inputs, in which gray and colored blocks indicate non-activated and activated modules, respectively.* + +Specifically, we incorporate the MoE structure into the classical single-tower multi-modal model; the resulting architecture comprises the following components: (1) a shared self-attention module across modalities, (2) a pool of modality-specific experts in the feed-forward network (FFN), and (3) a sparse gated MoE extended from the dense FFN.
Subsequently, under the same amount of training resources as used in [VLMO](https://arxiv.org/abs/2111.02358) (200k training steps), we demonstrate VL-MoE's advantages over the state-of-the-art dense counterparts in the following two aspects: + +(1) **VL-MoE can achieve significant accuracy improvement in comparison to its dense counterparts.** Table 1 demonstrates that under the same training budget (i.e., the same number of activated parameters for each token), VL-MoE Base with 32 experts achieves better accuracy than the VLMO-Base dense model on all four vision-language datasets. + +(2) **VL-MoE achieves similar model quality with a much smaller number of activated parameters compared to its dense counterparts.** Our results show that the finetuning performance of our VL-MoE is similar to that of the 3.1X larger VLMO-Large dense model (i.e., 3.1X more activated parameters per token). This can directly translate to approximately 3.1X training cost reduction, as the training FLOPs for transformers are proportional to the activated model size per token. + + + +| | Param per Token (# Total Param) | VQA | NLVR2 | COCO | Flickr30K | +| | | test-dev / std | dev / test-P | TR / IR | TR / IR | +|-------------------------------|:-------------------------------:|:--------------:|:-------------:|:-----------:|:-----------:| +| Dense Counterparts | | | | | | +| VLMO-dense Base | 180M (180M) | 76.64 / 76.89 | 82.77 / 83.34 | 74.8 / 57.2 | 92.3 / 79.3 | +| VLMO-dense Large | 560M (180M) | 79.94 / 79.98 | 85.64 / 86.86 | 78.2 / 60.6 | 95.3 / 84.5 | +| Ours (VL-MoE with 32 Experts) | | | | | | +| VL-MoE | 180M (1.9B) | 78.23 / 78.65 | 85.54 / 86.77 | 79.4 / 61.2 | 96.1 / 84.9 | + +*Table 1: Comparison of finetuning accuracy results for different models used in vision-language classification tasks and image-text retrieval tasks.* + +A sophisticated MoE model design requires a highly efficient and scalable training system that can support multi-dimensional parallelism and efficient memory management. The [DeepSpeed MoE](https://www.microsoft.com/en-us/research/blog/deepspeed-advancing-moe-inference-and-training-to-power-next-generation-ai-scale/) training system offers such advanced capabilities, including easy-to-use APIs that enable flexible combinations of data, tensor, and expert parallelism. Furthermore, DeepSpeed MoE enables larger model scale than state-of-the-art systems by exploiting expert parallelism and [ZeRO optimizations](https://arxiv.org/abs/1910.02054) together. By leveraging the DeepSpeed MoE system, VL-MoE Base with 32 experts achieves similar model quality to VLMO-dense Large with about a 2.5x training speedup. + +The [DeepSpeed MoE](https://www.microsoft.com/en-us/research/blog/deepspeed-advancing-moe-inference-and-training-to-power-next-generation-ai-scale/) system is already open-sourced and can easily be used as a plug-and-play component to achieve high-performance, low-cost training for any large-scale MoE model. A tutorial on how to use DeepSpeed MoE is available [here](https://www.deepspeed.ai/tutorials/mixture-of-experts/). VL-MoE is currently being integrated as a model example in [DeepSpeed Examples](https://github.com/microsoft/DeepSpeedExamples). Please stay tuned for our upcoming updates on this thread.
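To make the plug-and-play usage above concrete, the following minimal sketch shows how a dense FFN sub-layer can be swapped for a DeepSpeed MoE layer. The `ExpertFFN` module, hidden size, expert count, and expert-parallel degree are illustrative assumptions, not VL-MoE's actual implementation; the model containing such a layer is meant to be built in a script launched with the `deepspeed` launcher and passed to `deepspeed.initialize`.

```python
# Minimal sketch (illustrative, not VL-MoE's code): replace a dense FFN with a
# DeepSpeed MoE layer. ExpertFFN, hidden_size, num_experts, and ep_size are assumptions.
import torch.nn as nn
from deepspeed.moe.layer import MoE


class ExpertFFN(nn.Module):
    """One expert: a standard position-wise feed-forward network."""

    def __init__(self, hidden_size):
        super().__init__()
        self.fc1 = nn.Linear(hidden_size, 4 * hidden_size)
        self.act = nn.GELU()
        self.fc2 = nn.Linear(4 * hidden_size, hidden_size)

    def forward(self, x):
        return self.fc2(self.act(self.fc1(x)))


class MoEFeedForward(nn.Module):
    """FFN sub-layer where the dense FFN is replaced by a sparsely gated MoE."""

    def __init__(self, hidden_size=768, num_experts=32, ep_size=1):
        super().__init__()
        self.moe = MoE(hidden_size=hidden_size,
                       expert=ExpertFFN(hidden_size),
                       num_experts=num_experts,  # e.g., 32 experts as in VL-MoE Base
                       ep_size=ep_size,          # >1 shards experts across GPUs
                       k=1)                      # top-1 gating

    def forward(self, x):
        # The MoE layer returns (output, auxiliary load-balancing loss, expert counts);
        # the auxiliary loss is added to the task loss during training.
        out, aux_loss, _ = self.moe(x)
        return out, aux_loss
```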
diff --git a/docs/_posts/2023-04-24-deepspeed-chat-chinese.md b/docs/_posts/2023-04-24-deepspeed-chat-chinese.md new file mode 100644 index 0000000000000000000000000000000000000000..2fd962327b5415a3576c23bafe0c00f7acf86ee0 --- /dev/null +++ b/docs/_posts/2023-04-24-deepspeed-chat-chinese.md @@ -0,0 +1,7 @@ +--- +title: "DeepSpeed Chat: 一键式RLHF训练,让你的类ChatGPT千亿大模型提速省钱15倍" +excerpt: "" +link: https://github.com/microsoft/DeepSpeed/blob/master/blogs/deepspeed-chat/chinese/README.md +date: 2023-04-24 00:00:00 +tags: training ZeRO RLHF Chinese +--- diff --git a/docs/_posts/2023-04-24-deepspeed-chat-japanese.md b/docs/_posts/2023-04-24-deepspeed-chat-japanese.md new file mode 100644 index 0000000000000000000000000000000000000000..63200846ab65079b32d111b01507e91347439797 --- /dev/null +++ b/docs/_posts/2023-04-24-deepspeed-chat-japanese.md @@ -0,0 +1,7 @@ +--- +title: "DeepSpeed Chat: ChatGPTライクなモデルを簡単・高速・低コストに、あらゆるスケールで学習" +excerpt: "" +link: https://github.com/microsoft/DeepSpeed/blob/master/blogs/deepspeed-chat/japanese/README.md +date: 2023-04-24 00:00:00 +tags: training ZeRO RLHF Japanese +--- diff --git a/docs/_posts/2023-04-24-deepspeed-chat.md b/docs/_posts/2023-04-24-deepspeed-chat.md new file mode 100644 index 0000000000000000000000000000000000000000..70b627b951ee043dede5eec8e4574a006784a016 --- /dev/null +++ b/docs/_posts/2023-04-24-deepspeed-chat.md @@ -0,0 +1,7 @@ +--- +title: "DeepSpeed Chat: Easy, Fast and Affordable RLHF Training of ChatGPT-like Models at All Scales" +excerpt: "" +link: https://github.com/microsoft/DeepSpeed/blob/master/blogs/deepspeed-chat/README.md +date: 2023-04-24 00:00:00 +tags: training ZeRO RLHF English +--- diff --git a/docs/_tutorials/automatic-tensor-parallelism.md b/docs/_tutorials/automatic-tensor-parallelism.md index 6991d5caf92574c621c2f20758764f76b98df88a..8d6362a4998c803d8c4d8590d0b3ac15c3368af8 100644 --- a/docs/_tutorials/automatic-tensor-parallelism.md +++ b/docs/_tutorials/automatic-tensor-parallelism.md @@ -7,6 +7,7 @@ tags: inference * [Introduction](#introduction) * [Example Script](#example-script) * [Launching](#launching) + * [T5 11B Inference Performance Comparison](#t5-11b-inference-performance-comparison) * [OPT 13B Inference Performance Comparison](#opt-13b-inference-performance-comparison) * [Supported Models](#supported-models) * [Unsupported Models](#unsupported-models) @@ -65,7 +66,7 @@ With automatic tensor parallelism, we do not need to provide the injection polic # Example Script -We can observe performance improvement with automatic tensor parallelism using the [inference test suite](https://github.com/microsoft/DeepSpeedExamples/blob/master/inference/huggingface/text-generation/inference-test.py). The script includes per token latency, bandwidth, throughput and memory checks for comparison. See the [README](https://github.com/microsoft/DeepSpeedExamples/tree/master/inference/huggingface/text-generation#deepspeed-huggingface-text-generation-examples) for more information. +We can observe performance improvement with automatic tensor parallelism using the [inference test suite](https://github.com/microsoft/DeepSpeedExamples/blob/master/inference/huggingface/text-generation/inference-test.py). This script is for testing text-generation models and includes per token latency, bandwidth, throughput and memory checks for comparison. See the [README](https://github.com/microsoft/DeepSpeedExamples/tree/master/inference/huggingface/text-generation#deepspeed-huggingface-text-generation-examples) for more information. 
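Before the launching commands below, it may help to see what the automatic tensor-parallelism path looks like in code. This is a hedged sketch: the model name and tensor-parallel degree are illustrative (the tutorial benchmarks T5-11B and OPT-13B), and the script is assumed to be run with the `deepspeed` launcher.

```python
# Hedged sketch of automatic tensor parallelism (illustrative model and settings).
# Run with the deepspeed launcher, e.g.: deepspeed --num_gpus 2 example.py
import torch
import deepspeed
from transformers import AutoModelForSeq2SeqLM

model = AutoModelForSeq2SeqLM.from_pretrained("t5-small")

# No injection policy is supplied and kernel injection is disabled, so DeepSpeed
# automatically shards the supported layers across the available GPUs.
model = deepspeed.init_inference(model,
                                 mp_size=2,
                                 dtype=torch.float16,
                                 replace_with_kernel_inject=False)
```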
## Launching @@ -83,19 +84,31 @@ To enable tensor parallelism, you need to use the flag `ds_inference` for the co deepspeed --num_gpus DeepSpeedExamples/inference/huggingface/text-generation/inference-test.py --name --batch_size --test_performance --ds_inference ``` -## OPT 13B Inference Performance Comparison +## T5 11B Inference Performance Comparison The following results were collected using V100 SXM2 32GB GPUs. -### Max New Tokens = 50 +### Latency -| Test | Memory Allocated per GPU | Max Batch Size | Max Throughput per GPU | -| ---------- | -------------------------- | ---------------- | ------------------------ | -| No TP | 23.94 GB | 64 | 18.84 TFlops | -| 2 GPU TP | 12.23 GB | 320 | 27.17 TFlops | -| 4 GPU TP | 6.36 GB | 664 | 27.63 TFlops | +![T5 Latency Graph](/assets/images/auto-tp-chart-latency.png){: .align-center} + +### Throughput + +![T5 Throughput Graph](/assets/images/auto-tp-chart-throughput.png){: .align-center} + +### Memory + +| Test | Memory Allocated per GPU | Max Batch Size | Max Throughput per GPU | +| -------------- | -------------------------- | -------------- | ---------------------- | +| No TP or 1 GPU | 21.06 GB | 64 | 9.29 TFLOPS | +| 2 GPU TP | 10.56 GB | 320 | 13.04 TFLOPS | +| 4 GPU TP | 5.31 GB | 768 | 14.04 TFLOPS | + +## OPT 13B Inference Performance Comparison + +The following results were collected using V100 SXM2 32GB GPUs. -### Max New Tokens = 1024 +![OPT Throughput Graph](/assets/images/auto-tp-chart-opt-throughput.png){: .align-center} | Test | Memory Allocated per GPU | Max Batch Size | Max Throughput per GPU | | ---------- | -------------------------- | ---------------- | ------------------------ | diff --git a/docs/_tutorials/bert-finetuning.md b/docs/_tutorials/bert-finetuning.md index f7ea8226022e27ed2d3d5cbcfe123bc29526f458..3014be18d682012c1d24b7930989ebcb98b930a4 100755 --- a/docs/_tutorials/bert-finetuning.md +++ b/docs/_tutorials/bert-finetuning.md @@ -201,7 +201,7 @@ the `--predict_batch_size` should also be 8. For further details about the transformer kernel, please see our [usage tutorial](/tutorials/transformer_kernel/) and [technical deep -dive](https://www.deepspeed.ai/news/2020/05/27/fastest-bert-training.html) on +dive](https://www.deepspeed.ai/2020/05/27/fastest-bert-training.html) on the fastest BERT training. @@ -302,7 +302,7 @@ Table 4. The setting of memory-optimization flags for a range of micro-batch siz ### FineTuning model pre-trained with DeepSpeed Transformer Kernels -Fine-tuning the model pre-trained using DeepSpeed Transformer and the recipe in [DeepSpeed Fast-Bert Training](https://www.deepspeed.ai/news/2020/05/27/fastest-bert-training.html) should yield F1 score of 90.5 and is expected to increase if you let the pre-training longer than suggested in the tutorial. +Fine-tuning the model pre-trained using DeepSpeed Transformer and the recipe in [DeepSpeed Fast-Bert Training](https://www.deepspeed.ai/2020/05/27/fastest-bert-training.html) should yield F1 score of 90.5 and is expected to increase if you let the pre-training longer than suggested in the tutorial. 
To get these results, we do require some tuning of the dropout settings as described below: diff --git a/docs/_tutorials/bert-pretraining.md b/docs/_tutorials/bert-pretraining.md index a0943949f9bc57d353f814e154f9ae649b2570d8..cef60540b2320cfdf1818155393bfe44435c0777 100755 --- a/docs/_tutorials/bert-pretraining.md +++ b/docs/_tutorials/bert-pretraining.md @@ -130,7 +130,7 @@ The `model` returned by `deepspeed.initialize` is the DeepSpeed _model engine_ that we will use to train the model using the forward, backward and step API. Since the model engine exposes the same forward pass API as `nn.Module` objects, there is no change in the forward pass. -Thus, we only modify the the backward pass and optimizer/scheduler steps. +Thus, we only modify the backward pass and optimizer/scheduler steps. Backward propagation is performed by calling `backward(loss)` directly with the model engine. @@ -308,7 +308,7 @@ Note: For more details about the transformer kernel, please see [DeepSpeed Transformer Kernel](/tutorials/transformer_kernel/) and [DeepSpeed Fast-Bert -Training](https://www.deepspeed.ai/news/2020/05/27/fastest-bert-training.html). +Training](https://www.deepspeed.ai/2020/05/27/fastest-bert-training.html). ### Start Training @@ -391,4 +391,4 @@ for more details in Compared to SOTA, DeepSpeed significantly improves single GPU performance for transformer-based model like BERT. Figure above shows the single GPU throughput of training BertBERT-Large optimized through DeepSpeed, compared with two well-known Pytorch implementations, NVIDIA BERT and HuggingFace BERT. DeepSpeed reaches as high as 64 and 53 teraflops throughputs (corresponding to 272 and 52 samples/second) for sequence lengths of 128 and 512, respectively, exhibiting up to 28% throughput improvements over NVIDIA BERT and up to 62% over HuggingFace BERT. We also support up to 1.8x larger batch size without running out of memory. -For more details on how we achieve the record breaking BERT training time please check out deep dive into DeepSpeed BERT [Fastest BERT Training](https://www.deepspeed.ai/news/2020/05/18/bert-record.html) +For more details on how we achieve the record breaking BERT training time please check out deep dive into DeepSpeed BERT [Fastest BERT Training](https://www.deepspeed.ai/2020/05/18/bert-record.html) diff --git a/docs/_tutorials/cifar-10.md b/docs/_tutorials/cifar-10.md index 74ee04502f18d52f0b7786e7f6a84abe80b8d260..8b4990d0431ebc210863f9a310269d417056e0ce 100644 --- a/docs/_tutorials/cifar-10.md +++ b/docs/_tutorials/cifar-10.md @@ -8,21 +8,21 @@ If you haven't already, we advise you to first read through the [Getting Started](/getting-started/) guide before stepping through this tutorial. -In this tutorial we will be adding DeepSpeed to CIFAR-10 model, which is small image classification model. +In this tutorial we will be adding DeepSpeed to the CIFAR-10 model, which is a small image classification model. -First we will go over how to run original CIFAR-10. Then we will proceed step-by-step in enabling this model to run with DeepSpeed. +First we will go over how to run the original CIFAR-10 model. Then we will proceed step-by-step in enabling this model to run with DeepSpeed. ## Running Original CIFAR-10 -Original model code from [CIFAR-10 Tutorial](https://github.com/pytorch/tutorials/blob/master/beginner_source/blitz/cifar10_tutorial.py), We've copied this repo under [DeepSpeedExamples/cifar/](https://github.com/microsoft/DeepSpeedExamples/tree/master/cifar) and made it available as a submodule. 
To download, execute: +Original model code from the [CIFAR-10 Tutorial](https://github.com/pytorch/tutorials/blob/main/beginner_source/blitz/cifar10_tutorial.py), We've copied this repo under [DeepSpeedExamples/training/cifar/](https://github.com/microsoft/DeepSpeedExamples/tree/master/training/cifar) and made it available as a submodule. To download, execute: ```bash git submodule update --init --recursive ``` -To install requirements for CIFAR-10: +To install the requirements for the CIFAR-10 model: ```bash cd DeepSpeedExamples/cifar pip install -r requirements.txt @@ -82,14 +82,14 @@ The first step to apply DeepSpeed is adding DeepSpeed arguments to CIFAR-10 mode parser=argparse.ArgumentParser(description='CIFAR') - #data - # cuda + # Data. + # Cuda. parser.add_argument('--with_cuda', default=False, action='store_true', help='use CPU in case there\'s no GPU support') parser.add_argument('--use_ema', default=False, action='store_true', help='whether use exponential moving average') - # train + # Train. parser.add_argument('-b', '--batch_size', default=32, type=int, help='mini-batch size (default: 32)') parser.add_argument('-e', '--epochs', default=30, type=int, @@ -97,7 +97,7 @@ The first step to apply DeepSpeed is adding DeepSpeed arguments to CIFAR-10 mode parser.add_argument('--local_rank', type=int, default=-1, help='local rank passed from distributed launcher') - # Include DeepSpeed configuration arguments + # Include DeepSpeed configuration arguments. parser = deepspeed.add_config_arguments(parser) args=parser.parse_args() @@ -123,16 +123,16 @@ def initialize(args, collate_fn=None): ``` -Here we initialize DeepSpeed with CIFAR-10 model (`net`), `args`, `parameters` and `trainset`: +Here we initialize DeepSpeed with the CIFAR-10 model (`net`), `args`, `parameters` and `trainset`: ```python parameters = filter(lambda p: p.requires_grad, net.parameters()) args=add_argument() # Initialize DeepSpeed to use the following features - # 1) Distributed model - # 2) Distributed data loader - # 3) DeepSpeed optimizer + # 1) Distributed model. + # 2) Distributed data loader. + # 3) DeepSpeed optimizer. model_engine, optimizer, trainloader, _ = deepspeed.initialize(args=args, model=net, model_parameters=parameters, training_data=trainset) ``` @@ -155,7 +155,7 @@ The `model` returned by `deepspeed.initialize` is the _DeepSpeed Model Engine_ t ```python for i, data in enumerate(trainloader): - # get the inputs; data is a list of [inputs, labels] + # Get the inputs; data is a list of [inputs, labels]. inputs = data[0].to(model_engine.device) labels = data[1].to(model_engine.device) @@ -206,13 +206,13 @@ The next step to use DeepSpeed is to create a configuration JSON file (ds_config ### Run CIFAR-10 Model with DeepSpeed Enabled -To start training CIFAR-10 model with DeepSpeed applied, execute the following command, it will use all detected GPUs by default. +To start training the CIFAR-10 model with DeepSpeed applied, execute the following command, it will use all detected GPUs by default. ```bash deepspeed cifar10_deepspeed.py --deepspeed_config ds_config.json ``` -DeepSpeed usually prints more training details for user to monitor, including training settings, performance statistics and loss trends. +DeepSpeed usually prints more training details for the user to monitor, including training settings, performance statistics and loss trends. ``` deepspeed.pt cifar10_deepspeed.py --deepspeed_config ds_config.json Warning: Permanently added '[192.168.0.22]:42227' (ECDSA) to the list of known hosts. 
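The `ds_config.json` referenced in the commands above is not shown in this hunk. A minimal configuration of the kind the CIFAR-10 tutorial uses might look like the sketch below; the values are illustrative rather than the tutorial's exact file, and the same dictionary can alternatively be passed to `deepspeed.initialize` through its `config` argument instead of the `--deepspeed_config` flag.

```python
# Hedged sketch of a minimal DeepSpeed config for the CIFAR-10 example;
# all values are illustrative and may differ from the tutorial's ds_config.json.
ds_config = {
    "train_batch_size": 16,
    "steps_per_print": 2000,
    "optimizer": {
        "type": "Adam",
        "params": {
            "lr": 0.001,
            "betas": [0.8, 0.999],
            "eps": 1e-8,
            "weight_decay": 3e-7
        }
    },
    "fp16": {"enabled": True}
}

# Alternative to the --deepspeed_config flag (assumes net, parameters, and trainset
# are defined as in the tutorial):
# model_engine, optimizer, trainloader, _ = deepspeed.initialize(
#     args=args, model=net, model_parameters=parameters,
#     training_data=trainset, config=ds_config)
```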
diff --git a/docs/_tutorials/comms-logging.md b/docs/_tutorials/comms-logging.md index 52d93eda05bcca368859151b7be271ddf5ff9d8f..b6a352b60f6884dad986ac0a945982f30c3429cb 100644 --- a/docs/_tutorials/comms-logging.md +++ b/docs/_tutorials/comms-logging.md @@ -46,9 +46,9 @@ There are currently two ways to view communication log records: If the `enabled` configuration option is selected, all communication operations will be immediately printed to the console. This mode is intended for detailed debugging, and is not recommended for most users. The following is an example snippet of `verbose` output: ``` -[2022-06-26 01:39:55,722] [INFO] [logging.py:69:log_dist] [Rank 0] rank=0 | comm op: reduce_scatter_base | time (ms): 9.46 | msg size: 678.86 MB | algbw (Gbps): 1204.52 | busbw (Gbps): 1129.23 -[2022-06-26 01:39:56,470] [INFO] [logging.py:69:log_dist] [Rank 0] rank=0 | comm op: all_gather_base | time (ms): 0.11 | msg size: 6.0 MB | algbw (Gbps): 954.41 | busbw (Gbps): 894.76 -[2022-06-26 01:39:56,471] [INFO] [logging.py:69:log_dist] [Rank 0] rank=0 | comm op: all_gather_base | time (ms): 0.08 | msg size: 6.0 MB | algbw (Gbps): 1293.47 | busbw (Gbps): 1212.63 +[2022-06-26 01:39:55,722] [INFO] [logging.py:69:log_dist] [Rank 0] rank=0 | comm op: reduce_scatter_tensor | time (ms): 9.46 | msg size: 678.86 MB | algbw (Gbps): 1204.52 | busbw (Gbps): 1129.23 +[2022-06-26 01:39:56,470] [INFO] [logging.py:69:log_dist] [Rank 0] rank=0 | comm op: all_gather_into_tensor | time (ms): 0.11 | msg size: 6.0 MB | algbw (Gbps): 954.41 | busbw (Gbps): 894.76 +[2022-06-26 01:39:56,471] [INFO] [logging.py:69:log_dist] [Rank 0] rank=0 | comm op: all_gather_into_tensor | time (ms): 0.08 | msg size: 6.0 MB | algbw (Gbps): 1293.47 | busbw (Gbps): 1212.63 ``` For advanced users, the `debug` option will append the calling function of each communication operation to that operation's `log_name`. See [Log Summaries](#log-summaries) for an example of a `deepspeed.comm.log_summary()` call with `debug` enabled. @@ -99,7 +99,7 @@ Comm. Op Message Size Count Total Latency(ms) broadcast 2.0 KB 146 11.12 0.08 0.43 0.41 98.25 MB 1 8317.12 8317.12 0.20 0.19 -reduce_scatter_base +reduce_scatter_tensor 678.86 MB 40 602.29 9.69 1468.06 1376.31 ``` @@ -111,6 +111,6 @@ Comm. Op Message Size Count Total Latency(ms) broadcast | [Caller Func: _broadcast_model] 2.0 KB 146 9.39 0.06 0.52 0.48 98.25 MB 1 8540.60 8540.60 0.19 0.18 -reduce_scatter_base | [Caller Func: reduce_scatter_fn] +reduce_scatter_tensor | [Caller Func: reduce_scatter_fn] 678.86 MB 80 1527.17 13.94 1211.75 1136.01 ``` diff --git a/docs/_tutorials/curriculum-learning.md b/docs/_tutorials/curriculum-learning.md index 161c29cfc04c1deacb73184dd83abec2484a6fda..817bf622e851276b6e7d576e25d2067266ff3291 100644 --- a/docs/_tutorials/curriculum-learning.md +++ b/docs/_tutorials/curriculum-learning.md @@ -130,7 +130,7 @@ In our [paper](https://arxiv.org/abs/2108.06084) section 5.4 we demonstrate that ### 2.3 Token-based training termination -Because curriculum learning changes length of each sequence/sample during training, it is very hard/impossible to use number of steps/samples to terminate the training exactly at the desired number of tokens. Thus, we add a `--train-tokens` config for accurate token-based termination. We recommend increasing your original `--train-samples` or `--train-iters` to a large enough number (e.g., 3X of what you used for baseline), and set `--train-tokens` at the exact desired number of training tokens. 
+Because curriculum learning changes the length of each sequence/sample during training, it is very hard/impossible to use a number of steps/samples to terminate the training exactly at the desired number of tokens. Thus, we add a `--train-tokens` config for accurate token-based termination. We recommend increasing your original `--train-samples` or `--train-iters` to a large enough number (e.g., 3X of what you used for baseline), and set `--train-tokens` at the exact desired number of training tokens. ### 2.4 Token-based LR decay diff --git a/docs/_tutorials/flops-profiler.md b/docs/_tutorials/flops-profiler.md index 169bfb18d4422223ff2f46efa6be2104b5966940..24efc238615a0684bbfb6ec862844c0664b7b80b 100644 --- a/docs/_tutorials/flops-profiler.md +++ b/docs/_tutorials/flops-profiler.md @@ -316,8 +316,9 @@ The following example shows how to profile AlexNet using the DeepSpeed flops pro import torchvision.models as models import torch from deepspeed.profiling.flops_profiler import get_model_profile +from deepspeed.accelerator import get_accelerator -with torch.cuda.device(0): +with get_accelerator().device(0): model = models.alexnet() batch_size = 256 flops, macs, params = get_model_profile(model=model, # model @@ -341,6 +342,7 @@ from functools import partial import torch from transformers import BertForSequenceClassification, BertTokenizer from deepspeed.profiling.flops_profiler import get_model_profile +from deepspeed.accelerator import get_accelerator def bert_input_constructor(batch_size, seq_len, tokenizer): @@ -357,7 +359,7 @@ def bert_input_constructor(batch_size, seq_len, tokenizer): return inputs -with torch.cuda.device(0): +with get_accelerator().device(0): tokenizer = BertTokenizer.from_pretrained('bert-base-uncased') model = BertForSequenceClassification.from_pretrained('bert-base-uncased') batch_size = 4 diff --git a/docs/_tutorials/inference-tutorial.md b/docs/_tutorials/inference-tutorial.md index 176662296ad9d371a08c22b347366db883125007..411d5e756504a3cd2ee92b942ff473537cdaa559 100644 --- a/docs/_tutorials/inference-tutorial.md +++ b/docs/_tutorials/inference-tutorial.md @@ -132,7 +132,7 @@ Below is an output of the generated text. You can try other prompt and see how ## Datatypes and Quantized Models -DeepSpeed inference supports fp32, fp16 and int8 parameters. The appropriate datatype can be set using dtype in `init_inference`, and DeepSpeed will choose the kernels optimized for that datatype. For quantized int8 models, if the model was quantized using DeepSpeed's quantization approach ([MoQ](https://www.deepspeed.ai/news/2020/05/27/MoQ.html)), the setting by which the quantization is applied needs to be passed to `init_inference`. This setting includes the number of groups used for quantization and whether the MLP part of transformer is quantized with extra grouping. For more information on these parameters, please visit our [quantization tutorial](https://www.deepspeed.ai/tutorials/MoQ-tutorial/). +DeepSpeed inference supports fp32, fp16 and int8 parameters. The appropriate datatype can be set using dtype in `init_inference`, and DeepSpeed will choose the kernels optimized for that datatype. For quantized int8 models, if the model was quantized using DeepSpeed's quantization approach ([MoQ](https://www.deepspeed.ai/2021/05/04/MoQ.html)), the setting by which the quantization is applied needs to be passed to `init_inference`. This setting includes the number of groups used for quantization and whether the MLP part of transformer is quantized with extra grouping. 
For more information on these parameters, please visit our [quantization tutorial](https://www.deepspeed.ai/tutorials/MoQ-tutorial/). ```python import deepspeed diff --git a/docs/_tutorials/megatron.md b/docs/_tutorials/megatron.md index 2977f577302946e799a0ecd143286d6f189d3251..0ccfd3ec02f13a4bdea0cb04e955ee301bb811d9 100644 --- a/docs/_tutorials/megatron.md +++ b/docs/_tutorials/megatron.md @@ -275,7 +275,7 @@ DeepSpeed's `save_checkpoint()`. sd['random_rng_state'] = random.getstate() sd['np_rng_state'] = np.random.get_state() sd['torch_rng_state'] = torch.get_rng_state() - sd['cuda_rng_state'] = torch.cuda.get_rng_state() + sd['cuda_rng_state'] = get_accelerator().get_rng_state() sd['rng_tracker_states'] = mpu.get_cuda_rng_tracker().get_states() model.save_checkpoint(args.save, iteration, client_state = sd) diff --git a/docs/_tutorials/mixture-of-experts.md b/docs/_tutorials/mixture-of-experts.md index 23d807ab3eb15cdc94d7fe026baeff40a609000c..e7739a6a5051ac3758218d3c5d9ebff3ab1225e2 100644 --- a/docs/_tutorials/mixture-of-experts.md +++ b/docs/_tutorials/mixture-of-experts.md @@ -63,7 +63,7 @@ Updated with MoE Layers ### Pyramid-Residual MoE -Recently, we proposed a novel [Pyramid-Residual MoE](https://arxiv.org/abs/2201.05596]) (PR-MoE) model architecture. To create such an MoE model, the users need to do two additional things: 1) To make a pyramid structure, pass num_experts as a list e.g. [4, 8] and 2) Use the ```use_residual``` flag to indicate that the MoE layer is now a Residual MoE layer. +Recently, we proposed a novel [Pyramid-Residual MoE](https://arxiv.org/abs/2201.05596) (PR-MoE) model architecture. To create such an MoE model, the users need to do two additional things: 1) To make a pyramid structure, pass num_experts as a list e.g. [4, 8] and 2) Use the ```use_residual``` flag to indicate that the MoE layer is now a Residual MoE layer. ```python self.experts = deepspeed.moe.layer.MoE(hidden_size=input_dim, expert=ExpertModule(), num_experts=[..], ep_size=ep_size, use_residual=True) @@ -165,4 +165,4 @@ We have devised a new technique called “Random Token Selection” that greatly ## Advanced MoE usage -We have added an example of applying MoE to NLG models. Please read more in this [newsletter](https://www.deepspeed.ai/news/2021/12/09/deepspeed-moe-nlg.html) and [tutorial](/tutorials/mixture-of-experts-nlg/). +We have added an example of applying MoE to NLG models. Please read more in this [newsletter](https://www.deepspeed.ai/2021/12/09/deepspeed-moe-nlg.html) and [tutorial](/tutorials/mixture-of-experts-nlg/). diff --git a/docs/_tutorials/onebit-adam.md b/docs/_tutorials/onebit-adam.md index 5166869ebe996447b3e5ec7fddc3440fa02e26b0..a64439018db4d60bccffd674b5d7217d1269bf94 100644 --- a/docs/_tutorials/onebit-adam.md +++ b/docs/_tutorials/onebit-adam.md @@ -16,7 +16,7 @@ This tutorial is updated on 03/04/2021 to reflect the 1-bit Adam v2. Changes inc 1) The NCCL-based implementation requires PyTorch >= 1.8 (and NCCL >= 2.8.3 when you have 64 or more GPUs). See details below. 2) Although 1-bit Adam is compatible with both FP16 and FP32, currently we only verified the convergence under mixed precision/FP16 training. 3) Currently the MPI-based implementation is not compatible with pipeline parallelism. 4) Frequent checkpoint loading could hurt 1-bit Adam's convergence. See details below. {: .notice--warning} -In this tutorial, we are going to introduce the 1-bit Adam optimizer in DeepSpeed. 
1-bit Adam can improve model training speed on communication-constrained clusters, especially for communication-intensive large models by reducing the overall communication volume by up to 5x. Detailed description of the 1-bit Adam algorithm, its implementation in DeepSpeed, and performance evaluation is available from our [blog post](https://www.deepspeed.ai/news/2020/09/08/onebit-adam-blog-post.html). We also have a [paper](https://arxiv.org/abs/2102.02888) which provides the most complete details including algorithm, system implementation, theoretical analysis, and more evaluations. +In this tutorial, we are going to introduce the 1-bit Adam optimizer in DeepSpeed. 1-bit Adam can improve model training speed on communication-constrained clusters, especially for communication-intensive large models by reducing the overall communication volume by up to 5x. Detailed description of the 1-bit Adam algorithm, its implementation in DeepSpeed, and performance evaluation is available from our [blog post](https://www.deepspeed.ai/2020/09/08/onebit-adam-blog-post.html). We also have a [paper](https://arxiv.org/abs/2102.02888) which provides the most complete details including algorithm, system implementation, theoretical analysis, and more evaluations. To illustrate the benefits and usage of 1-bit Adam optimizer in DeepSpeed, we use the following two training tasks as examples: @@ -77,7 +77,7 @@ mpirun -np [#processes] -ppn [#GPUs on each node] -hostfile [hostfile] [MPI flag ### 1.3 1-bit Algorithm -The detailed description of the 1-bit Algorithm can be seen from our [blog post](https://www.deepspeed.ai/news/2020/09/08/onebit-adam-blog-post.html) and our [paper](https://arxiv.org/abs/2102.02888). +The detailed description of the 1-bit Algorithm can be seen from our [blog post](https://www.deepspeed.ai/2020/09/08/onebit-adam-blog-post.html) and our [paper](https://arxiv.org/abs/2102.02888). ### 1.4 Configuration of 1-bit Adam The 1-bit Adam feature can be used by setting the optimizer configuration options as follows. An example json config file is shown below. @@ -215,7 +215,7 @@ We fixed the learning rate to 3e-5. The table below shows the F1 and the EM scor Figure 1: Scalability of 1-bit Adam for SQuAD Finetuning on V100 GPUs with batch size of 3/GPU. --> -Performance results of SQuAD Fine-tuning can be seen from our [blog post](https://www.deepspeed.ai/news/2020/09/08/onebit-adam-blog-post.html) and our [paper](https://arxiv.org/abs/2102.02888). +Performance results of SQuAD Fine-tuning can be seen from our [blog post](https://www.deepspeed.ai/2020/09/08/onebit-adam-blog-post.html) and our [paper](https://arxiv.org/abs/2102.02888). @@ -295,4 +295,4 @@ The above file is for BERT-large. For BERT-base training (sequence length 128), ### 3.3 Performance Results for BERT Pre-training -Performance results of BERT Pre-training can be seen from our [blog post](https://www.deepspeed.ai/news/2020/09/08/onebit-adam-blog-post.html) and our [paper](https://arxiv.org/abs/2102.02888). +Performance results of BERT Pre-training can be seen from our [blog post](https://www.deepspeed.ai/2020/09/08/onebit-adam-blog-post.html) and our [paper](https://arxiv.org/abs/2102.02888). 
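The example JSON config referenced in section 1.4 is not included in this hunk; the sketch below shows, with illustrative values and written as a Python dict rather than the tutorial's JSON file, what a 1-bit Adam optimizer section typically contains.

```python
# Hedged sketch of a DeepSpeed config enabling 1-bit Adam; all values are illustrative.
ds_config = {
    "train_batch_size": 4096,
    "optimizer": {
        "type": "OneBitAdam",
        "params": {
            "lr": 4e-4,
            "freeze_step": 23000,        # uncompressed Adam steps before 1-bit compression starts
            "cuda_aware": False,         # set True only with a CUDA-aware MPI stack
            "comm_backend_name": "nccl"  # the NCCL backend requires PyTorch >= 1.8
        }
    },
    "fp16": {"enabled": True}
}
```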
diff --git a/docs/_tutorials/progressive_layer_dropping.md b/docs/_tutorials/progressive_layer_dropping.md index b7b868bf29d35d30e736adacefdcfdbaa32159f2..b3da9ba807a372e60b5f5cf8ca40e1d746158899 100755 --- a/docs/_tutorials/progressive_layer_dropping.md +++ b/docs/_tutorials/progressive_layer_dropping.md @@ -95,7 +95,7 @@ Note that the above configuration assumes training on 64 X 32GB V100 GPUs. Each Table 1. Pre-training hyperparameters -**Note:** DeepSpeed now supports PreLayerNorm as the default way for training BERT, because of its ability to avoid vanishing gradient, stabilize optimization, and performance gains, as described in our fastest BERT training [blog post](https://www.deepspeed.ai/news/2020/05/27/fastest-bert-training.html). We therefore support the switchable Transformer block directly on the the BERT with PreLayerNorm. The implementation can be found at "example\bing_bert\nvidia\modelingpreln_layerdrop.py". +**Note:** DeepSpeed now supports PreLayerNorm as the default way for training BERT, because of its ability to avoid vanishing gradient, stabilize optimization, and performance gains, as described in our fastest BERT training [blog post](https://www.deepspeed.ai/2020/05/27/fastest-bert-training.html). We therefore support the switchable Transformer block directly on the BERT with PreLayerNorm. The implementation can be found at "example\bing_bert\nvidia\modelingpreln_layerdrop.py". ## Fine-tuning with DeepSpeed on GLUE Tasks diff --git a/docs/_tutorials/sparse-attention.md b/docs/_tutorials/sparse-attention.md index d28b2d1ff33ca1be6895993ec3435d9bb785cf36..bbcf0c510c6bfb5512c82af3a0f50e4a8511b44d 100644 --- a/docs/_tutorials/sparse-attention.md +++ b/docs/_tutorials/sparse-attention.md @@ -9,7 +9,7 @@ In this tutorial we describe how to use DeepSpeed Sparse Attention (SA) and its {: .notice--warning} ## Sparse attention modules -* **MatMul**: This module handles block-sparse matrix-matrix multiplication. Currently it supports SDD, DSD, and DDS as described in [DeepSpeed Sparse Attention](https://www.deepspeed.ai/news/2020/09/08/sparse-attention.html) section. +* **MatMul**: This module handles block-sparse matrix-matrix multiplication. Currently it supports SDD, DSD, and DDS as described in [DeepSpeed Sparse Attention](https://www.deepspeed.ai/2020/09/08/sparse-attention.html) section. * **Softmax**: This module applies block sparse softmax. It handles both forward and backward pass. * **SparseSelfAttention**: This module uses MatMul and Softmax kernels and generates Context Layer output given Query, Keys and Values. It is a simplified version of common operations in any self-attention layer. It can also apply: * `Relative position embedding` diff --git a/docs/_tutorials/transformer_kernel.md b/docs/_tutorials/transformer_kernel.md index 915117fc3af987e2edca9ca5d4d0be8ccf1f30d7..959f5b7d41f9d0d7b03e173b22db820d7cf4f2e8 100755 --- a/docs/_tutorials/transformer_kernel.md +++ b/docs/_tutorials/transformer_kernel.md @@ -14,7 +14,7 @@ To this end, we have developed a new kernel for transformer networks which inclu optimizations specific to these layers, which boost the training throughput on single GPU and scales well as we increase the number of GPUs. For more information on the details of transformer kernel, please visit our recent blog post on the [fastest BERT -training](https://www.deepspeed.ai/news/2020/05/27/fastest-bert-training.html). +training](https://www.deepspeed.ai/2020/05/27/fastest-bert-training.html). 
## Prerequisites @@ -96,7 +96,7 @@ By setting the `normalize_invertible` flag, we force the kernel to drop the inpu The `attn_dropout_checkpoint` and `gelu_checkpoint` flags refer to the checkpointing approach, in which we drop the inputs to some parts of the transformer layer, attention dropout and Gelu, in order to save an important part of the activation memory. Based on our performance profiling, the performance cost of rematerializing these two are negligible and finally the performance benefit that we gain from running larger batch size compensate for that. -The following table shows which memory optimization flags need to be turned on when running BERT-Large on NVIDIA V100 GPU with 32GB of memory, considering different micro-batch sizes and sequence lengths. For the two sequence lengths, 128 and 512, used in our experiments, we have seen that larger batch size improves the overall training performance for both. Please see our [blog post](https://www.deepspeed.ai/news/2020/05/27/fastest-bert-training.html) for more information regarding the performance evaluation of these configurations. +The following table shows which memory optimization flags need to be turned on when running BERT-Large on NVIDIA V100 GPU with 32GB of memory, considering different micro-batch sizes and sequence lengths. For the two sequence lengths, 128 and 512, used in our experiments, we have seen that larger batch size improves the overall training performance for both. Please see our [blog post](https://www.deepspeed.ai/2020/05/27/fastest-bert-training.html) for more information regarding the performance evaluation of these configurations. | Micro-batch size | 128 sequence-length | 512 sequence-length | | :--------------: | :-----------------------: | :--------------------------------------: | diff --git a/docs/assets/images/auto-tp-chart-latency.png b/docs/assets/images/auto-tp-chart-latency.png new file mode 100644 index 0000000000000000000000000000000000000000..7c883a2a450340627b5ad7262f9fae8d95bce75f Binary files /dev/null and b/docs/assets/images/auto-tp-chart-latency.png differ diff --git a/docs/assets/images/auto-tp-chart-opt-throughput.png b/docs/assets/images/auto-tp-chart-opt-throughput.png new file mode 100644 index 0000000000000000000000000000000000000000..4180a6b063d9ae171fc7d38b0f8f374a0723f365 Binary files /dev/null and b/docs/assets/images/auto-tp-chart-opt-throughput.png differ diff --git a/docs/assets/images/auto-tp-chart-throughput.png b/docs/assets/images/auto-tp-chart-throughput.png new file mode 100644 index 0000000000000000000000000000000000000000..518bcbd8e4390ae44cd2a6aa26f0f36756239103 Binary files /dev/null and b/docs/assets/images/auto-tp-chart-throughput.png differ diff --git a/docs/assets/images/vl_moe.png b/docs/assets/images/vl_moe.png new file mode 100644 index 0000000000000000000000000000000000000000..5c58e89b54ce5a80dbb700df8f3b8ca74c807c01 Binary files /dev/null and b/docs/assets/images/vl_moe.png differ diff --git a/docs/code-docs/source/conf.py b/docs/code-docs/source/conf.py index 059aa7c0b67c60a6fd2f579563eabd9a7399f24f..64bfc272225214eb4fe10302a0bf181ab5ba62fc 100644 --- a/docs/code-docs/source/conf.py +++ b/docs/code-docs/source/conf.py @@ -1,4 +1,7 @@ -'''Copyright The Microsoft DeepSpeed Team''' +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team # Configuration file for the Sphinx documentation builder. 
# diff --git a/docs/code-docs/source/zero3.rst b/docs/code-docs/source/zero3.rst index 60c3470185896371e85041d23399bca731f8b143..8f43f9faa827feb4746a047e5d2eab1291bc1eab 100644 --- a/docs/code-docs/source/zero3.rst +++ b/docs/code-docs/source/zero3.rst @@ -155,7 +155,39 @@ Example ZeRO-3 Configurations ... } +MiCS Configurations +=================== + +All MiCS configurations are set with `DeepSpeedZeroConfig`. MiCS assumes ZeRO +stage 3 optimization is enabled. For now, there are two configuration fields of +MiCS, `mics_shard_size` and `mics_hierarchical_params_gather`. `mics_shard_size` +controls how many devices are used for partitioning the model states. +`mics_hierarchical_params_gather` controls whether we use a two-stage +hierarchical way to gather parameters in the forward computation. +`mics_hierarchical_params_gather` is useful when model states are partitioned +across multiple nodes and the cross-node bandwidth is slow. By default this is +turned off. + + +Example MiCS Configurations +=========================== +#. Use MiCS to partition the model states (including optimizer states, + gradients, and parameters). The following config example partitions the model + states to eight devices, and assumes the eight devices are located within a + single node (`mics_hierarchical_params_gather` is `False`). + + .. code-block:: python + :emphasize-lines: 3 + + { + "zero_optimization": { + "stage": 3, + "mics_shard_size": 8, + "mics_hierarchical_params_gather": False, + }, + ... + } Assumptions =========== @@ -331,3 +363,29 @@ These routines can be used in a training loop as shown in the following snippet. [...] optimizer.step() + + +GPU Memory Management +--------------------- + +By default, at the end of training with ZeRO stage 3, some parameters could remain unpartitioned and use up some GPU memory. +This is done on purpose as an optimization should you resume training again. If you'd like to clear out the cached +parameters that use up GPU memory, you can call the ``empty_partition_cache`` method of a DeepSpeed engine. + +.. autofunction:: deepspeed.DeepSpeedEngine.empty_partition_cache + +The following code snippet illustrates this functionality. + +.. code-block:: python + + with zero.Init(): + model = MyLargeModel() + + ds_engine, _, _, _ = deepspeed.initialize(model, ...) + for batch in ...: + loss = ds_engine(batch) + ds_engine.backward(loss) + ds_engine.step() + + # Free GPU memory consumed by model parameters + ds_engine.empty_partition_cache() diff --git a/docs/index.md b/docs/index.md index 79c5ad21f058c9249c4345bc9330f6f58eb930b5..e35affae5dccc9a12cb06de0fc6423a2c05cb612 100755 --- a/docs/index.md +++ b/docs/index.md @@ -5,9 +5,11 @@ toc_label: "Contents" title: "Latest News" --- - DeepSpeed trained the world's most powerful language models ([MT-530B](https://www.microsoft.com/en-us/research/blog/using-deepspeed-and-megatron-to-train-megatron-turing-nlg-530b-the-worlds-largest-and-most-powerful-generative-language-model/), [BLOOM](https://huggingface.co/blog/bloom-megatron-deepspeed)); [learn how](https://www.deepspeed.ai/tutorials/large-models-w-deepspeed/). + DeepSpeed empowers ChatGPT-like model training with a single click, offering 15x speedup over SOTA RLHF systems with unprecedented cost reduction at all scales; [learn how](https://github.com/microsoft/DeepSpeed/tree/master/blogs/deepspeed-chat). 
-* [2023/02] [Automatic Tensor Parallelism: Enables tensor parallelism by default without providing an injection policy](https://www.deepspeed.ai/tutorials/automatic-tensor-parallelism/) +* [2023/04] 🚀 [DeepSpeed Chat: Easy, Fast and Affordable RLHF Training of ChatGPT-like Models at All Scales](https://github.com/microsoft/DeepSpeed/tree/master/blogs/deepspeed-chat) [[English](https://github.com/microsoft/DeepSpeed/tree/master/blogs/deepspeed-chat/README.md)] [[中文](https://github.com/microsoft/DeepSpeed/tree/master/blogs/deepspeed-chat/chinese/README.md)] [[日本語](https://github.com/microsoft/DeepSpeed/tree/master/blogs/deepspeed-chat/japanese/README.md)]🚀 +* [2023/03] [Scaling Large-Scale Generative Mixture-of-Expert Multimodal Model With VL-MoE](https://www.deepspeed.ai/2023/03/30/multi-modal.html) +* [2023/02] [Automatic Tensor Parallelism: Enables tensor parallelism by default without an injection policy](https://www.deepspeed.ai/tutorials/automatic-tensor-parallelism/) * [2022/12] [DeepSpeed Data Efficiency: A composable library that makes better use of data, increases training efficiency, and improves model quality](https://www.deepspeed.ai/2022/12/11/data-efficiency.html) * [2022/11] [Stable Diffusion Image Generation under 1 second w. DeepSpeed MII](https://github.com/microsoft/DeepSpeed-MII/tree/main/examples/benchmark/txt2img) * [2022/10] [DeepSpeed-MII: instant speedup on 24,000+ open-source DL models with up to 40x cheaper inference](https://www.deepspeed.ai/2022/10/10/mii.html) @@ -17,7 +19,7 @@ title: "Latest News" # Extreme Speed and Scale for DL Training and Inference - DeepSpeed is an easy-to-use deep learning optimization software suite that enables unprecedented scale and speed for Deep Learning Training and Inference. With DeepSpeed you can: + ***[DeepSpeed](https://www.deepspeed.ai/) enables world's most powerful language models like [MT-530B](https://www.microsoft.com/en-us/research/blog/using-deepspeed-and-megatron-to-train-megatron-turing-nlg-530b-the-worlds-largest-and-most-powerful-generative-language-model/) and [BLOOM](https://huggingface.co/blog/bloom-megatron-deepspeed)***. It is an easy-to-use deep learning optimization software suite that powers unprecedented scale and speed for both training and inference. With DeepSpeed you can: * Train/Inference dense or sparse models with billions or trillions of parameters * Achieve excellent system throughput and efficiently scale to thousands of GPUs @@ -123,6 +125,10 @@ comments. 14. Reza Yazdani Aminabadi, Samyam Rajbhandari, Minjia Zhang, Ammar Ahmad Awan, Cheng Li, Du Li, Elton Zheng, Jeff Rasley, Shaden Smith, Olatunji Ruwase, Yuxiong He. (2022) DeepSpeed Inference: Enabling Efficient Inference of Transformer Models at Unprecedented Scale. [arXiv:2207.00032](https://arxiv.org/abs/2207.00032) and [SC 2022](https://dl.acm.org/doi/abs/10.5555/3571885.3571946). 15. Zhewei Yao, Xiaoxia Wu, Conglong Li, Connor Holmes, Minjia Zhang, Cheng Li, Yuxiong He. (2022) Random-LTD: Random and Layerwise Token Dropping Brings Efficient Training for Large-scale Transformers. [arXiv:2211.11586](https://arxiv.org/abs/2211.11586). 16. Conglong Li, Zhewei Yao, Xiaoxia Wu, Minjia Zhang, Yuxiong He. (2022) DeepSpeed Data Efficiency: Improving Deep Learning Model Quality and Training Efficiency via Efficient Data Sampling and Routing. [arXiv:2212.03597](https://arxiv.org/abs/2212.03597). +17. Xiaoxia Wu, Cheng Li, Reza Yazdani Aminabadi, Zhewei Yao, Yuxiong He. 
(2023) Understanding INT4 Quantization for Transformer Models: Latency Speedup, Composability, and Failure Cases. [arXiv:2301.12017](https://arxiv.org/abs/2301.12017). +18. Syed Zawad, Cheng Li, Zhewei Yao, Elton Zheng, Yuxiong He, Feng Yan. (2023) DySR: Adaptive Super-Resolution via Algorithm and System Co-design. [ICLR:2023](https://openreview.net/forum?id=Pgtn4l6eKjv). +19. Sheng Shen, Zhewei Yao, Chunyuan Li, Trevor Darrell, Kurt Keutzer, Yuxiong He. (2023) Scaling Vision-Language Models with Sparse Mixture of Experts. [arXiv:2303.07226](https://arxiv.org/abs/2303.07226). +20. Quentin Anthony, Ammar Ahmad Awan, Jeff Rasley, Yuxiong He, Aamir Shafi, Mustafa Abduljabbar, Hari Subramoni, Dhabaleswar Panda. (2023) MCR-DL: Mix-and-Match Communication Runtime for Deep Learning [arXiv:2303.08374](https://arxiv.org/abs/2303.08374) and will appear at IPDPS 2023. # Videos 1. DeepSpeed KDD 2020 Tutorial diff --git a/docs/news/index.html b/docs/news/index.html deleted file mode 100644 index 95e7974b5050239a6581f83d8ce6f8ab57f40ca7..0000000000000000000000000000000000000000 --- a/docs/news/index.html +++ /dev/null @@ -1,3 +0,0 @@ ---- -layout: news-home ---- diff --git a/env.sh b/env.sh deleted file mode 100644 index a632541512d7a009605a680e9265d501812fb244..0000000000000000000000000000000000000000 --- a/env.sh +++ /dev/null @@ -1,3 +0,0 @@ -#!/bin/bash -export LD_LIBRARY_PATH=/public/home/aishsh/env/anaconda3/envs/oneflow_test/lib/:$LD_LIBRARY_PATH -export LIBRARY_PATH=/public/home/aishsh/env/anaconda3/envs/oneflow_test/lib/:$LIBRARY_PATH diff --git a/examples/README.md b/examples/README.md index c61ee047df9aed1391d98fbc665aae41c75469f8..5dfc26c1761353bf18992067fb04192fe944e820 100644 --- a/examples/README.md +++ b/examples/README.md @@ -4,6 +4,6 @@ If you are looking for examples using DeepSpeed please see the following resourc 1. [DeepSpeedExamples](https://github.com/microsoft/DeepSpeedExamples) 2. [Megatron-DeepSpeed](https://github.com/microsoft/Megatron-DeepSpeed) -3. [DeepSpeed + AzureML](https://github.com/Azure/azureml-examples/tree/main/python-sdk/workflows/train/deepspeed) +3. [DeepSpeed + AzureML](https://github.com/Azure/azureml-examples/tree/main/v1/python-sdk/workflows/train/deepspeed) 4. [DeepSpeed + Hugging Face Transformers Integration](https://huggingface.co/docs/transformers/main_classes/deepspeed) -5. [DeepSpeed + PyTorch Lightning](https://pytorch-lightning.readthedocs.io/en/latest/api/pytorch_lightning.utilities.deepspeed.html) +5. 
[DeepSpeed + PyTorch Lightning](https://lightning.ai/docs/pytorch/stable/api/lightning.pytorch.utilities.deepspeed.html) diff --git a/hipify_20230511113250.png b/hipify_20230511113250.png deleted file mode 100755 index f3787d84b6d7469b826fcf6b45ae446fe0edc076..0000000000000000000000000000000000000000 Binary files a/hipify_20230511113250.png and /dev/null differ diff --git a/install.sh b/install.sh index 6770924d1ef8b5784d77b0f950acba9856377c27..8d56afe404939efca9b8c7750909ea2d41a95947 100755 --- a/install.sh +++ b/install.sh @@ -1,4 +1,4 @@ -#!/bin/bash +#!/usr/bin/env bash set -e err_report() { @@ -121,7 +121,7 @@ rm_if_exist() { if [ -f $1 ]; then rm $VERBOSE $1 elif [ -d $1 ]; then - rm -r $VERBOSE $1 + rm -rf $VERBOSE $1 fi } diff --git a/op_builder/__init__.py b/op_builder/__init__.py index 51546be6ccaf7b39e969f416b28bb472b8d6442c..2c55662df8ce2f65af88125514a0460fac31f845 100644 --- a/op_builder/__init__.py +++ b/op_builder/__init__.py @@ -1,6 +1,8 @@ -""" -Copyright 2020 The Microsoft DeepSpeed Team -""" +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + import sys import os import pkgutil @@ -8,6 +10,9 @@ import importlib from .builder import get_default_compute_capabilities, OpBuilder +# Do not remove, required for abstract accelerator to detect if we have a deepspeed or 3p op_builder +__deepspeed__ = True + # List of all available op builders from deepspeed op_builder try: import deepspeed.ops.op_builder # noqa: F401 @@ -42,9 +47,7 @@ for _, module_name, _ in pkgutil.iter_modules([os.path.dirname(this_module.__fil if module_name != 'all_ops' and module_name != 'builder': module = importlib.import_module(f".{module_name}", package=op_builder_dir) for member_name in module.__dir__(): - if member_name.endswith( - 'Builder' - ) and member_name != "OpBuilder" and member_name != "CUDAOpBuilder": + if member_name.endswith('Builder') and member_name != "OpBuilder" and member_name != "CUDAOpBuilder": # assign builder name to variable with same name # the following is equivalent to i.e. TransformerBuilder = "TransformerBuilder" this_module.__dict__[member_name] = builder_closure(member_name) diff --git a/op_builder/all_ops.py b/op_builder/all_ops.py index a6010addb60bb5c13aec5a7686297bd709e276dd..9c41f35eaf1b8c94f5d313e8f8b381bed48536f2 100644 --- a/op_builder/all_ops.py +++ b/op_builder/all_ops.py @@ -1,6 +1,8 @@ -""" -Copyright 2020 The Microsoft DeepSpeed Team -""" +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + import os import pkgutil import importlib diff --git a/op_builder/async_io.py b/op_builder/async_io.py index 95cecdbede453d4009594cdae0772492ec98905c..810d758f15e0229d94f7387f36bca351ebbbddab 100644 --- a/op_builder/async_io.py +++ b/op_builder/async_io.py @@ -1,6 +1,8 @@ -""" -Copyright 2020 The Microsoft DeepSpeed Team -""" +# Copyright (c) Microsoft Corporation. 
+# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + import distutils.spawn import subprocess @@ -19,14 +21,10 @@ class AsyncIOBuilder(OpBuilder): def sources(self): return [ - 'csrc/aio/py_lib/deepspeed_py_copy.cpp', - 'csrc/aio/py_lib/py_ds_aio.cpp', - 'csrc/aio/py_lib/deepspeed_py_aio.cpp', - 'csrc/aio/py_lib/deepspeed_py_aio_handle.cpp', - 'csrc/aio/py_lib/deepspeed_aio_thread.cpp', - 'csrc/aio/common/deepspeed_aio_utils.cpp', - 'csrc/aio/common/deepspeed_aio_common.cpp', - 'csrc/aio/common/deepspeed_aio_types.cpp', + 'csrc/aio/py_lib/deepspeed_py_copy.cpp', 'csrc/aio/py_lib/py_ds_aio.cpp', + 'csrc/aio/py_lib/deepspeed_py_aio.cpp', 'csrc/aio/py_lib/deepspeed_py_aio_handle.cpp', + 'csrc/aio/py_lib/deepspeed_aio_thread.cpp', 'csrc/aio/common/deepspeed_aio_utils.cpp', + 'csrc/aio/common/deepspeed_aio_common.cpp', 'csrc/aio/common/deepspeed_aio_types.cpp', 'csrc/aio/py_lib/deepspeed_pin_tensor.cpp' ] @@ -52,21 +50,14 @@ class AsyncIOBuilder(OpBuilder): ] def extra_ldflags(self): - #aiss #return ['-laio'] return ['-laio', '-liomp5'] def check_for_libaio_pkg(self): libs = dict( - dpkg=["-l", - "libaio-dev", - "apt"], - pacman=["-Q", - "libaio", - "pacman"], - rpm=["-q", - "libaio-devel", - "yum"], + dpkg=["-l", "libaio-dev", "apt"], + pacman=["-Q", "libaio", "pacman"], + rpm=["-q", "libaio-devel", "yum"], ) found = False @@ -75,15 +66,11 @@ class AsyncIOBuilder(OpBuilder): path = distutils.spawn.find_executable(pkgmgr) if path is not None: cmd = f"{pkgmgr} {flag} {lib}" - result = subprocess.Popen(cmd, - stdout=subprocess.PIPE, - stderr=subprocess.PIPE, - shell=True) + result = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=True) if result.wait() == 0: found = True else: - self.warning( - f"{self.NAME}: please install the {lib} package with {tool}") + self.warning(f"{self.NAME}: please install the {lib} package with {tool}") break return found @@ -95,9 +82,7 @@ class AsyncIOBuilder(OpBuilder): # respectively to specify the directories for libaio.h and libaio.so. aio_compatible = self.has_function('io_submit', ('aio', )) if verbose and not aio_compatible: - self.warning( - f"{self.NAME} requires the dev libaio .so object and headers but these were not found." - ) + self.warning(f"{self.NAME} requires the dev libaio .so object and headers but these were not found.") # Check for the libaio package via known package managers # to print suggestions on which package to install. diff --git a/op_builder/builder.py b/op_builder/builder.py index 96bf723c49259ad9d10238b570396b33c5773004..1ffd43012180ebdf9ff1273a6da7d43afec4dabc 100644 --- a/op_builder/builder.py +++ b/op_builder/builder.py @@ -1,6 +1,8 @@ -""" -Copyright 2020 The Microsoft DeepSpeed Team -""" +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + import os import sys import time @@ -27,25 +29,18 @@ DEFAULT_COMPUTE_CAPABILITIES = "6.0;6.1;7.0" try: import torch except ImportError: - print( - f"{WARNING} unable to import torch, please install it if you want to pre-compile any deepspeed ops." 
- ) + print(f"{WARNING} unable to import torch, please install it if you want to pre-compile any deepspeed ops.") else: TORCH_MAJOR = int(torch.__version__.split('.')[0]) TORCH_MINOR = int(torch.__version__.split('.')[1]) def installed_cuda_version(name=""): - import torch.cuda - if not torch.cuda.is_available(): - return 0, 0 import torch.utils.cpp_extension cuda_home = torch.utils.cpp_extension.CUDA_HOME assert cuda_home is not None, "CUDA_HOME does not exist, unable to compile CUDA op(s)" # Ensure there is not a cuda version mismatch between torch and nvcc compiler - output = subprocess.check_output([cuda_home + "/bin/nvcc", - "-V"], - universal_newlines=True) + output = subprocess.check_output([cuda_home + "/bin/nvcc", "-V"], universal_newlines=True) output_split = output.split() release_idx = output_split.index("release") release = output_split[release_idx + 1].replace(',', '').split(".") @@ -57,8 +52,7 @@ def installed_cuda_version(name=""): def get_default_compute_capabilities(): compute_caps = DEFAULT_COMPUTE_CAPABILITIES import torch.utils.cpp_extension - if torch.utils.cpp_extension.CUDA_HOME is not None and installed_cuda_version( - )[0] >= 11: + if torch.utils.cpp_extension.CUDA_HOME is not None and installed_cuda_version()[0] >= 11: if installed_cuda_version()[0] == 11 and installed_cuda_version()[1] == 0: # Special treatment of CUDA 11.0 because compute_86 is not supported. compute_caps += ";8.0" @@ -75,37 +69,25 @@ cuda_minor_mismatch_ok = { "10.1", "10.2", ], - 11: ["11.0", - "11.1", - "11.2", - "11.3", - "11.4", - "11.5", - "11.6", - "11.7", - "11.8"], + 11: ["11.0", "11.1", "11.2", "11.3", "11.4", "11.5", "11.6", "11.7", "11.8"], } def assert_no_cuda_mismatch(name=""): cuda_major, cuda_minor = installed_cuda_version(name) - if cuda_minor == 0 and cuda_major == 0: - return False sys_cuda_version = f'{cuda_major}.{cuda_minor}' torch_cuda_version = ".".join(torch.version.cuda.split('.')[:2]) # This is a show-stopping error, should probably not proceed past this if sys_cuda_version != torch_cuda_version: - if (cuda_major in cuda_minor_mismatch_ok - and sys_cuda_version in cuda_minor_mismatch_ok[cuda_major] + if (cuda_major in cuda_minor_mismatch_ok and sys_cuda_version in cuda_minor_mismatch_ok[cuda_major] and torch_cuda_version in cuda_minor_mismatch_ok[cuda_major]): print(f"Installed CUDA version {sys_cuda_version} does not match the " f"version torch was compiled with {torch.version.cuda} " "but since the APIs are compatible, accepting this combination") return True - raise Exception( - f">- DeepSpeed Op Builder: Installed CUDA version {sys_cuda_version} does not match the " - f"version torch was compiled with {torch.version.cuda}, unable to compile " - "cuda/cpp extensions without a matching cuda version.") + raise Exception(f">- DeepSpeed Op Builder: Installed CUDA version {sys_cuda_version} does not match the " + f"version torch was compiled with {torch.version.cuda}, unable to compile " + "cuda/cpp extensions without a matching cuda version.") return True @@ -142,12 +124,11 @@ class OpBuilder(ABC): install_torch_version = torch_info['version'] current_torch_version = ".".join(torch.__version__.split('.')[:2]) if install_torch_version != current_torch_version: - raise RuntimeError( - "PyTorch version mismatch! DeepSpeed ops were compiled and installed " - "with a different version than what is being used at runtime. " - f"Please re-install DeepSpeed or switch torch versions. 
" - f"Install torch version={install_torch_version}, " - f"Runtime torch version={current_torch_version}") + raise RuntimeError("PyTorch version mismatch! DeepSpeed ops were compiled and installed " + "with a different version than what is being used at runtime. " + f"Please re-install DeepSpeed or switch torch versions. " + f"Install torch version={install_torch_version}, " + f"Runtime torch version={current_torch_version}") @staticmethod def validate_torch_op_version(torch_info): @@ -155,22 +136,20 @@ class OpBuilder(ABC): current_cuda_version = ".".join(torch.version.cuda.split('.')[:2]) install_cuda_version = torch_info['cuda_version'] if install_cuda_version != current_cuda_version: - raise RuntimeError( - "CUDA version mismatch! DeepSpeed ops were compiled and installed " - "with a different version than what is being used at runtime. " - f"Please re-install DeepSpeed or switch torch versions. " - f"Install CUDA version={install_cuda_version}, " - f"Runtime CUDA version={current_cuda_version}") + raise RuntimeError("CUDA version mismatch! DeepSpeed ops were compiled and installed " + "with a different version than what is being used at runtime. " + f"Please re-install DeepSpeed or switch torch versions. " + f"Install CUDA version={install_cuda_version}, " + f"Runtime CUDA version={current_cuda_version}") else: current_hip_version = ".".join(torch.version.hip.split('.')[:2]) install_hip_version = torch_info['hip_version'] if install_hip_version != current_hip_version: - raise RuntimeError( - "HIP version mismatch! DeepSpeed ops were compiled and installed " - "with a different version than what is being used at runtime. " - f"Please re-install DeepSpeed or switch torch versions. " - f"Install HIP version={install_hip_version}, " - f"Runtime HIP version={current_hip_version}") + raise RuntimeError("HIP version mismatch! DeepSpeed ops were compiled and installed " + "with a different version than what is being used at runtime. " + f"Please re-install DeepSpeed or switch torch versions. " + f"Install HIP version={install_hip_version}, " + f"Runtime HIP version={current_hip_version}") @staticmethod def is_rocm_pytorch(): @@ -184,8 +163,7 @@ class OpBuilder(ABC): pass else: if TORCH_MAJOR > 1 or (TORCH_MAJOR == 1 and TORCH_MINOR >= 5): - _is_rocm_pytorch = hasattr(torch.version, - 'hip') and torch.version.hip is not None + _is_rocm_pytorch = hasattr(torch.version, 'hip') and torch.version.hip is not None if _is_rocm_pytorch: from torch.utils.cpp_extension import ROCM_HOME _is_rocm_pytorch = ROCM_HOME is not None @@ -240,7 +218,6 @@ class OpBuilder(ABC): return True def extra_ldflags(self): - #aiss #return [] return ['-liomp5'] @@ -248,10 +225,7 @@ class OpBuilder(ABC): valid = False check_cmd = 'dpkg -l' for lib in libraries: - result = subprocess.Popen(f'dpkg -l {lib}', - stdout=subprocess.PIPE, - stderr=subprocess.PIPE, - shell=True) + result = subprocess.Popen(f'dpkg -l {lib}', stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=True) valid = valid or result.wait() == 0 return valid @@ -282,9 +256,7 @@ class OpBuilder(ABC): tempdir = tempfile.mkdtemp() # Define a simple C program that calls the function in question - prog = "void %s(void); int main(int argc, char** argv) { %s(); return 0; }" % ( - funcname, - funcname) + prog = "void %s(void); int main(int argc, char** argv) { %s(); return 0; }" % (funcname, funcname) # Write the test program to a file. 
filename = os.path.join(tempdir, 'test.c') @@ -305,16 +277,13 @@ class OpBuilder(ABC): # Attempt to compile the C program into an object file. cflags = shlex.split(os.environ.get('CFLAGS', "")) - objs = compiler.compile([filename], - output_dir=output_dir, - extra_preargs=self.strip_empty_entries(cflags)) + objs = compiler.compile([filename], output_dir=output_dir, extra_preargs=self.strip_empty_entries(cflags)) # Attempt to link the object file into an executable. # Be sure to tack on any libraries that have been specified. ldflags = shlex.split(os.environ.get('LDFLAGS', "")) compiler.link_executable(objs, - os.path.join(tempdir, - 'a.out'), + os.path.join(tempdir, 'a.out'), extra_preargs=self.strip_empty_entries(ldflags), libraries=libraries) @@ -358,9 +327,8 @@ class OpBuilder(ABC): try: cpu_info = get_cpu_info() except Exception as e: - self.warning( - f"{self.name} attempted to use `py-cpuinfo` but failed (exception type: {type(e)}, {e}), " - "falling back to `lscpu` to get this information.") + self.warning(f"{self.name} attempted to use `py-cpuinfo` but failed (exception type: {type(e)}, {e}), " + "falling back to `lscpu` to get this information.") cpu_info = self._backup_cpuinfo() if cpu_info is None: return "-march=native" @@ -372,23 +340,23 @@ class OpBuilder(ABC): def is_cuda_enable(self): try: + #assert_no_cuda_mismatch(self.name) + #return '-D__ENABLE_CUDA__' + #aiss if torch.cuda.is_available(): return '-D__ENABLE_CUDA__' - except: - print( - f"{WARNING} {self.name} torch.cuda is missing, only cpu ops can be compiled!" - ) + except BaseException: + print(f"{WARNING} {self.name} cuda is missing or is incompatible with installed torch, " + "only cpu ops can be compiled!") return '-D__DISABLE_CUDA__' return '-D__DISABLE_CUDA__' def _backup_cpuinfo(self): # Construct cpu_info dict from lscpu that is similar to what py-cpuinfo provides if not self.command_exists('lscpu'): - self.warning( - f"{self.name} attempted to query 'lscpu' after failing to use py-cpuinfo " - "to detect the CPU architecture. 'lscpu' does not appear to exist on " - "your system, will fall back to use -march=native and non-vectorized execution." - ) + self.warning(f"{self.name} attempted to query 'lscpu' after failing to use py-cpuinfo " + "to detect the CPU architecture. 'lscpu' does not appear to exist on " + "your system, will fall back to use -march=native and non-vectorized execution.") return None result = subprocess.check_output('lscpu', shell=True) result = result.decode('utf-8').strip().lower() @@ -420,9 +388,8 @@ class OpBuilder(ABC): try: cpu_info = get_cpu_info() except Exception as e: - self.warning( - f"{self.name} attempted to use `py-cpuinfo` but failed (exception type: {type(e)}, {e}), " - "falling back to `lscpu` to get this information.") + self.warning(f"{self.name} attempted to use `py-cpuinfo` but failed (exception type: {type(e)}, {e}), " + "falling back to `lscpu` to get this information.") cpu_info = self._backup_cpuinfo() if cpu_info is None: return '-D__SCALAR__' @@ -445,13 +412,9 @@ class OpBuilder(ABC): valid = valid or result.wait() == 0 if not valid and len(cmds) > 1: - print( - f"{WARNING} {self.name} requires one of the following commands '{cmds}', but it does not exist!" - ) + print(f"{WARNING} {self.name} requires one of the following commands '{cmds}', but it does not exist!") elif not valid and len(cmds) == 1: - print( - f"{WARNING} {self.name} requires the '{cmd}' command, but it does not exist!" 
- ) + print(f"{WARNING} {self.name} requires the '{cmd}' command, but it does not exist!") return valid def warning(self, msg): @@ -466,12 +429,11 @@ class OpBuilder(ABC): def builder(self): from torch.utils.cpp_extension import CppExtension - return CppExtension( - name=self.absolute_name(), - sources=self.strip_empty_entries(self.sources()), - include_dirs=self.strip_empty_entries(self.include_paths()), - extra_compile_args={'cxx': self.strip_empty_entries(self.cxx_args())}, - extra_link_args=self.strip_empty_entries(self.extra_ldflags())) + return CppExtension(name=self.absolute_name(), + sources=self.strip_empty_entries(self.sources()), + include_dirs=self.strip_empty_entries(self.include_paths()), + extra_compile_args={'cxx': self.strip_empty_entries(self.cxx_args())}, + extra_link_args=self.strip_empty_entries(self.extra_ldflags())) def load(self, verbose=True): from deepspeed.git_version_info import installed_ops, torch_info @@ -480,9 +442,8 @@ class OpBuilder(ABC): # torch/cuda versions we are currently using at runtime. self.validate_torch_version(torch_info) if torch.cuda.is_available() and isinstance(self, CUDAOpBuilder): -#aiss HIP version mismatch error - #self.validate_torch_op_version(torch_info) - pass + self.validate_torch_op_version(torch_info) + return importlib.import_module(self.absolute_name()) else: return self.jit_load(verbose) @@ -495,21 +456,21 @@ class OpBuilder(ABC): try: import ninja # noqa: F401 except ImportError: - raise RuntimeError( - f"Unable to JIT load the {self.name} op due to ninja not being installed." - ) + raise RuntimeError(f"Unable to JIT load the {self.name} op due to ninja not being installed.") if isinstance(self, CUDAOpBuilder) and not self.is_rocm_pytorch(): - self.build_for_cpu = not assert_no_cuda_mismatch(self.name) + try: + assert_no_cuda_mismatch(self.name) + self.build_for_cpu = False + except BaseException: + self.build_for_cpu = True self.jit_mode = True from torch.utils.cpp_extension import load start_build = time.time() sources = [self.deepspeed_src_path(path) for path in self.sources()] - extra_include_paths = [ - self.deepspeed_src_path(path) for path in self.include_paths() - ] + extra_include_paths = [self.deepspeed_src_path(path) for path in self.include_paths()] # Torch will try and apply whatever CCs are in the arch list at compile time, # we have already set the intended targets ourselves we know that will be @@ -520,14 +481,13 @@ class OpBuilder(ABC): torch_arch_list = os.environ.get("TORCH_CUDA_ARCH_LIST") os.environ["TORCH_CUDA_ARCH_LIST"] = "" - op_module = load( - name=self.name, - sources=self.strip_empty_entries(sources), - extra_include_paths=self.strip_empty_entries(extra_include_paths), - extra_cflags=self.strip_empty_entries(self.cxx_args()), - extra_cuda_cflags=self.strip_empty_entries(self.nvcc_args()), - extra_ldflags=self.strip_empty_entries(self.extra_ldflags()), - verbose=verbose) + op_module = load(name=self.name, + sources=self.strip_empty_entries(sources), + extra_include_paths=self.strip_empty_entries(extra_include_paths), + extra_cflags=self.strip_empty_entries(self.cxx_args()), + extra_cuda_cflags=self.strip_empty_entries(self.nvcc_args()), + extra_ldflags=self.strip_empty_entries(self.extra_ldflags()), + verbose=verbose) build_duration = time.time() - start_build if verbose: @@ -541,6 +501,7 @@ class OpBuilder(ABC): class CUDAOpBuilder(OpBuilder): + def compute_capability_args(self, cross_compile_archs=None): """ Returns nvcc compute capability compile flags. 
@@ -587,8 +548,7 @@ class CUDAOpBuilder(OpBuilder): ccs = self.filter_ccs(ccs) if len(ccs) == 0: raise RuntimeError( - f"Unable to load {self.name} op due to no compute capabilities remaining after filtering" - ) + f"Unable to load {self.name} op due to no compute capabilities remaining after filtering") args = [] for cc in ccs: @@ -623,7 +583,16 @@ class CUDAOpBuilder(OpBuilder): return super().is_compatible(verbose) def builder(self): - #self.build_for_cpu = not assert_no_cuda_mismatch(self.name) + #try: + # assert_no_cuda_mismatch(self.name) + # self.build_for_cpu = False + #except BaseException: + # self.build_for_cpu = True + + #if self.build_for_cpu: + # from torch.utils.cpp_extension import CppExtension as ExtensionBuilder + #else: + # from torch.utils.cpp_extension import CUDAExtension as ExtensionBuilder #aiss if not self.is_rocm_pytorch(): self.build_for_cpu = not assert_no_cuda_mismatch(self.name) @@ -632,23 +601,17 @@ class CUDAOpBuilder(OpBuilder): else: from torch.utils.cpp_extension import CUDAExtension as ExtensionBuilder else: - from torch.utils.cpp_extension import CUDAExtension as ExtensionBuilder - #self.build_for_cpu = not assert_no_cuda_mismatch(self.name) - #if self.build_for_cpu: - # from torch.utils.cpp_extension import CppExtension as ExtensionBuilder - #else: - # from torch.utils.cpp_extension import CUDAExtension as ExtensionBuilder + from torch.utils.cpp_extension import CUDAExtension as ExtensionBuilder compile_args = {'cxx': self.strip_empty_entries(self.cxx_args())} if self.build_for_cpu else \ {'cxx': self.strip_empty_entries(self.cxx_args()), \ 'nvcc': self.strip_empty_entries(self.nvcc_args())} - cuda_ext = ExtensionBuilder( - name=self.absolute_name(), - sources=self.strip_empty_entries(self.sources()), - include_dirs=self.strip_empty_entries(self.include_paths()), - libraries=self.strip_empty_entries(self.libraries_args()), - extra_compile_args=compile_args) + cuda_ext = ExtensionBuilder(name=self.absolute_name(), + sources=self.strip_empty_entries(self.sources()), + include_dirs=self.strip_empty_entries(self.include_paths()), + libraries=self.strip_empty_entries(self.libraries_args()), + extra_compile_args=compile_args) if self.is_rocm_pytorch(): # hip converts paths to absolute, this converts back to relative @@ -656,7 +619,10 @@ class CUDAOpBuilder(OpBuilder): curr_file = Path(__file__).parent.parent # ds root for i in range(len(sources)): src = Path(sources[i]) - sources[i] = str(src.relative_to(curr_file)) + if src.is_absolute(): + sources[i] = str(src.relative_to(curr_file)) + else: + sources[i] = str(src) cuda_ext.sources = sources return cuda_ext @@ -711,9 +677,7 @@ class CUDAOpBuilder(OpBuilder): if self.is_rocm_pytorch(): ROCM_MAJOR, ROCM_MINOR = self.installed_rocm_version() args += [ - '-std=c++14', - '-U__HIP_NO_HALF_OPERATORS__', - '-U__HIP_NO_HALF_CONVERSIONS__', + '-std=c++14', '-U__HIP_NO_HALF_OPERATORS__', '-U__HIP_NO_HALF_CONVERSIONS__', '-U__HIP_NO_HALF2_OPERATORS__', '-DROCM_VERSION_MAJOR=%s' % ROCM_MAJOR, '-DROCM_VERSION_MINOR=%s' % ROCM_MINOR, @@ -722,13 +686,9 @@ class CUDAOpBuilder(OpBuilder): else: cuda_major, _ = installed_cuda_version() args += [ - '-allow-unsupported-compiler' if sys.platform == "win32" else '', - '--use_fast_math', - '-std=c++17' - if sys.platform == "win32" and cuda_major > 10 else '-std=c++14', - '-U__CUDA_NO_HALF_OPERATORS__', - '-U__CUDA_NO_HALF_CONVERSIONS__', - '-U__CUDA_NO_HALF2_OPERATORS__' + '-allow-unsupported-compiler' if sys.platform == "win32" else '', '--use_fast_math', + '-std=c++17' if 
sys.platform == "win32" and cuda_major > 10 else '-std=c++14', + '-U__CUDA_NO_HALF_OPERATORS__', '-U__CUDA_NO_HALF_CONVERSIONS__', '-U__CUDA_NO_HALF2_OPERATORS__' ] if os.environ.get('DS_DEBUG_CUDA_BUILD', '0') == '1': args.append('--ptxas-options=-v') @@ -742,10 +702,12 @@ class CUDAOpBuilder(OpBuilder): if sys.platform == "win32": return ['cublas', 'curand'] else: + #return [] return ['iomp5'] class TorchCPUOpBuilder(CUDAOpBuilder): + def extra_ldflags(self): if self.build_for_cpu: return ['-fopenmp'] diff --git a/op_builder/cpu_adagrad.py b/op_builder/cpu_adagrad.py index bf7c98052fc541f2de5a6c331d414d522bc9e821..6d70c93faac21a79062abf54fc9a1e96e607993b 100644 --- a/op_builder/cpu_adagrad.py +++ b/op_builder/cpu_adagrad.py @@ -1,6 +1,8 @@ -""" -Copyright 2020 The Microsoft DeepSpeed Team -""" +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + import os from .builder import TorchCPUOpBuilder @@ -38,13 +40,8 @@ class CPUAdagradBuilder(TorchCPUOpBuilder): CUDA_INCLUDE = [os.path.join(torch.utils.cpp_extension.CUDA_HOME, "include")] else: CUDA_INCLUDE = [ - os.path.join(torch.utils.cpp_extension.ROCM_HOME, - "include"), - os.path.join(torch.utils.cpp_extension.ROCM_HOME, - "include", - "rocrand"), - os.path.join(torch.utils.cpp_extension.ROCM_HOME, - "include", - "hiprand"), + os.path.join(torch.utils.cpp_extension.ROCM_HOME, "include"), + os.path.join(torch.utils.cpp_extension.ROCM_HOME, "include", "rocrand"), + os.path.join(torch.utils.cpp_extension.ROCM_HOME, "include", "hiprand"), ] return ['csrc/includes'] + CUDA_INCLUDE diff --git a/op_builder/cpu_adam.py b/op_builder/cpu_adam.py index ed5f32929a22cd754c0466450fd0306ee040bbf6..29cdced0d9f2010c6e5c5f24e05941b6ee94c6d3 100644 --- a/op_builder/cpu_adam.py +++ b/op_builder/cpu_adam.py @@ -1,6 +1,8 @@ -""" -Copyright 2020 The Microsoft DeepSpeed Team -""" +# Copyright (c) Microsoft Corporation. 
+# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + import os from .builder import TorchCPUOpBuilder @@ -8,7 +10,7 @@ from .builder import TorchCPUOpBuilder class CPUAdamBuilder(TorchCPUOpBuilder): BUILD_VAR = "DS_BUILD_CPU_ADAM" NAME = "cpu_adam" - + def __init__(self): super().__init__(name=self.NAME) @@ -18,7 +20,7 @@ class CPUAdamBuilder(TorchCPUOpBuilder): def sources(self): if self.build_for_cpu: return ['csrc/adam/cpu_adam.cpp'] - + return ['csrc/adam/cpu_adam.cpp', 'csrc/common/custom_cuda_kernel.cu'] def libraries_args(self): @@ -28,6 +30,7 @@ class CPUAdamBuilder(TorchCPUOpBuilder): if not self.is_rocm_pytorch(): args += ['curand'] + return args def include_paths(self): @@ -38,13 +41,8 @@ class CPUAdamBuilder(TorchCPUOpBuilder): CUDA_INCLUDE = [os.path.join(torch.utils.cpp_extension.CUDA_HOME, "include")] else: CUDA_INCLUDE = [ - os.path.join(torch.utils.cpp_extension.ROCM_HOME, - "include"), - os.path.join(torch.utils.cpp_extension.ROCM_HOME, - "include", - "rocrand"), - os.path.join(torch.utils.cpp_extension.ROCM_HOME, - "include", - "hiprand"), + os.path.join(torch.utils.cpp_extension.ROCM_HOME, "include"), + os.path.join(torch.utils.cpp_extension.ROCM_HOME, "include", "rocrand"), + os.path.join(torch.utils.cpp_extension.ROCM_HOME, "include", "hiprand"), ] return ['csrc/includes'] + CUDA_INCLUDE diff --git a/op_builder/fused_adam.py b/op_builder/fused_adam.py index 2883d417ede9e0e66b356ff26669e0f80a20419c..ac6e4eeaaea5d9b2c2ee70de3d4261c6348abe94 100644 --- a/op_builder/fused_adam.py +++ b/op_builder/fused_adam.py @@ -1,6 +1,8 @@ -""" -Copyright 2020 The Microsoft DeepSpeed Team -""" +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + from .builder import CUDAOpBuilder import sys @@ -29,9 +31,7 @@ class FusedAdamBuilder(CUDAOpBuilder): def nvcc_args(self): nvcc_flags = ['-O3'] + self.version_dependent_macros() if not self.is_rocm_pytorch(): - nvcc_flags.extend([ - '-allow-unsupported-compiler' if sys.platform == "win32" else '', - '-lineinfo', - '--use_fast_math' - ] + self.compute_capability_args()) + nvcc_flags.extend( + ['-allow-unsupported-compiler' if sys.platform == "win32" else '', '-lineinfo', '--use_fast_math'] + + self.compute_capability_args()) return nvcc_flags diff --git a/op_builder/fused_lamb.py b/op_builder/fused_lamb.py index d5f88d0b1ad1630950e319bc941b04d91b13338a..f0cb557706b3f7e6d4ab73d7a999a70aebecbeb3 100644 --- a/op_builder/fused_lamb.py +++ b/op_builder/fused_lamb.py @@ -1,6 +1,8 @@ -""" -Copyright 2020 The Microsoft DeepSpeed Team -""" +# Copyright (c) Microsoft Corporation. 
+# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + from .builder import CUDAOpBuilder import sys @@ -30,14 +32,9 @@ class FusedLambBuilder(CUDAOpBuilder): nvcc_flags = ['-O3'] + self.version_dependent_macros() if self.is_rocm_pytorch(): ROCM_MAJOR, ROCM_MINOR = self.installed_rocm_version() - nvcc_flags += [ - '-DROCM_VERSION_MAJOR=%s' % ROCM_MAJOR, - '-DROCM_VERSION_MINOR=%s' % ROCM_MINOR - ] + nvcc_flags += ['-DROCM_VERSION_MAJOR=%s' % ROCM_MAJOR, '-DROCM_VERSION_MINOR=%s' % ROCM_MINOR] else: - nvcc_flags.extend([ - '-allow-unsupported-compiler' if sys.platform == "win32" else '', - '-lineinfo', - '--use_fast_math' - ] + self.compute_capability_args()) + nvcc_flags.extend( + ['-allow-unsupported-compiler' if sys.platform == "win32" else '', '-lineinfo', '--use_fast_math'] + + self.compute_capability_args()) return nvcc_flags diff --git a/op_builder/quantizer.py b/op_builder/quantizer.py index e2c2c9564a29d194ce28b6107e90b7db4da6ba95..a64d1603d1e5eb530a65c65f7cdee00d30bf7628 100644 --- a/op_builder/quantizer.py +++ b/op_builder/quantizer.py @@ -1,4 +1,7 @@ -'''Copyright The Microsoft DeepSpeed Team''' +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team from .builder import CUDAOpBuilder diff --git a/op_builder/random_ltd.py b/op_builder/random_ltd.py index 79c86c1346c5bf89f87dba5758017999c2b9faae..3fdc777215daf5cde158eecd9621a1edcd856421 100644 --- a/op_builder/random_ltd.py +++ b/op_builder/random_ltd.py @@ -1,6 +1,8 @@ -""" -Copyright 2022 The Microsoft DeepSpeed Team -""" +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + from .builder import CUDAOpBuilder @@ -23,18 +25,13 @@ class RandomLTDBuilder(CUDAOpBuilder): def sources(self): return [ - 'csrc/random_ltd/pt_binding.cpp', - 'csrc/random_ltd/gather_scatter.cu', - 'csrc/random_ltd/slice_attn_masks.cu', - 'csrc/random_ltd/token_sort.cu' + 'csrc/random_ltd/pt_binding.cpp', 'csrc/random_ltd/gather_scatter.cu', + 'csrc/random_ltd/slice_attn_masks.cu', 'csrc/random_ltd/token_sort.cu' ] def include_paths(self): includes = ['csrc/includes'] if self.is_rocm_pytorch(): from torch.utils.cpp_extension import ROCM_HOME - includes += [ - '{}/hiprand/include'.format(ROCM_HOME), - '{}/rocrand/include'.format(ROCM_HOME) - ] + includes += ['{}/hiprand/include'.format(ROCM_HOME), '{}/rocrand/include'.format(ROCM_HOME)] return includes diff --git a/op_builder/sparse_attn.py b/op_builder/sparse_attn.py index 25d5f63a01e97df553656d096cf4544b6b66c9f2..4257ac404e94e0ea148a08d9d51c4cf8a7467a89 100644 --- a/op_builder/sparse_attn.py +++ b/op_builder/sparse_attn.py @@ -1,6 +1,8 @@ -""" -Copyright 2020 The Microsoft DeepSpeed Team -""" +# Copyright (c) Microsoft Corporation. 
+# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + from .builder import OpBuilder try: @@ -32,9 +34,8 @@ class SparseAttnBuilder(OpBuilder): #deps_compatible = all(command_status) if self.is_rocm_pytorch(): - #aiss debug #self.warning(f'{self.NAME} is not compatible with ROCM') - #return False +#aiss debug return True try: @@ -49,26 +50,23 @@ class SparseAttnBuilder(OpBuilder): self.warning(f"{self.NAME} cuda is not available from torch") else: major, minor = torch.version.cuda.split('.')[:2] - cuda_compatible = (int(major) == 10 - and int(minor) >= 1) or (int(major) >= 11) + cuda_compatible = (int(major) == 10 and int(minor) >= 1) or (int(major) >= 11) if not cuda_compatible: self.warning(f"{self.NAME} requires CUDA version 10.1+") TORCH_MAJOR = int(torch.__version__.split('.')[0]) TORCH_MINOR = int(torch.__version__.split('.')[1]) - torch_compatible = TORCH_MAJOR == 1 and TORCH_MINOR >= 5 + torch_compatible = (TORCH_MAJOR == 1 and TORCH_MINOR >= 5) if not torch_compatible: self.warning( - f'{self.NAME} requires a torch version >= 1.5 but detected {TORCH_MAJOR}.{TORCH_MINOR}' - ) + f'{self.NAME} requires a torch version >= 1.5 and < 2.0 but detected {TORCH_MAJOR}.{TORCH_MINOR}') try: import triton except ImportError: # auto-install of triton is broken on some systems, reverting to manual install for now # see this issue: https://github.com/microsoft/DeepSpeed/issues/1710 - self.warning( - f"please install triton==1.0.0 if you want to use sparse attention") + self.warning(f"please install triton==1.0.0 if you want to use sparse attention") return False if pkg_version: @@ -79,9 +77,7 @@ class SparseAttnBuilder(OpBuilder): triton_mismatch = installed_triton != "1.0.0" if triton_mismatch: - self.warning( - f"using untested triton version ({installed_triton}), only 1.0.0 is known to be compatible" - ) + self.warning(f"using untested triton version ({installed_triton}), only 1.0.0 is known to be compatible") return False return super().is_compatible(verbose) and torch_compatible and cuda_compatible diff --git a/op_builder/spatial_inference.py b/op_builder/spatial_inference.py index 18d19d40e86f1c964012da3183e62ab00603b09a..59caf57f938db217c52fcfa483e47fc3e451e6ea 100644 --- a/op_builder/spatial_inference.py +++ b/op_builder/spatial_inference.py @@ -1,6 +1,8 @@ -''' -Copyright 2022 The Microsoft DeepSpeed Team -''' +# Copyright (c) Microsoft Corporation. 
+# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + from .builder import CUDAOpBuilder, installed_cuda_version @@ -19,8 +21,7 @@ class SpatialInferenceBuilder(CUDAOpBuilder): try: import torch except ImportError: - self.warning( - "Please install torch if trying to pre-compile inference kernels") + self.warning("Please install torch if trying to pre-compile inference kernels") return False cuda_okay = True @@ -30,8 +31,7 @@ class SpatialInferenceBuilder(CUDAOpBuilder): cuda_capability = torch.cuda.get_device_properties(0).major if cuda_capability >= 8: if torch_cuda_major < 11 or sys_cuda_major < 11: - self.warning( - "On Ampere and higher architectures please use CUDA 11+") + self.warning("On Ampere and higher architectures please use CUDA 11+") cuda_okay = False return super().is_compatible(verbose) and cuda_okay diff --git a/op_builder/stochastic_transformer.py b/op_builder/stochastic_transformer.py index aa47c13c49e4b64a0a30c089825104ae075bbbbb..52b02a3c629e9d537b8e22139f48f5335396e4ff 100644 --- a/op_builder/stochastic_transformer.py +++ b/op_builder/stochastic_transformer.py @@ -1,6 +1,8 @@ -""" -Copyright 2020 The Microsoft DeepSpeed Team -""" +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + from .transformer import TransformerBuilder diff --git a/op_builder/transformer.py b/op_builder/transformer.py index 239f29552d980984dae4884e5a3272e6a30b68ce..893145d44d94b9e71e9cbdfec3829df7818f6de7 100644 --- a/op_builder/transformer.py +++ b/op_builder/transformer.py @@ -1,6 +1,8 @@ -""" -Copyright 2020 The Microsoft DeepSpeed Team -""" +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + from .builder import CUDAOpBuilder @@ -23,22 +25,15 @@ class TransformerBuilder(CUDAOpBuilder): def sources(self): return [ - 'csrc/transformer/ds_transformer_cuda.cpp', - 'csrc/transformer/cublas_wrappers.cu', - 'csrc/transformer/transform_kernels.cu', - 'csrc/transformer/gelu_kernels.cu', - 'csrc/transformer/dropout_kernels.cu', - 'csrc/transformer/normalize_kernels.cu', - 'csrc/transformer/softmax_kernels.cu', - 'csrc/transformer/general_kernels.cu' + 'csrc/transformer/ds_transformer_cuda.cpp', 'csrc/transformer/cublas_wrappers.cu', + 'csrc/transformer/transform_kernels.cu', 'csrc/transformer/gelu_kernels.cu', + 'csrc/transformer/dropout_kernels.cu', 'csrc/transformer/normalize_kernels.cu', + 'csrc/transformer/softmax_kernels.cu', 'csrc/transformer/general_kernels.cu' ] def include_paths(self): includes = ['csrc/includes'] if self.is_rocm_pytorch(): from torch.utils.cpp_extension import ROCM_HOME - includes += [ - '{}/hiprand/include'.format(ROCM_HOME), - '{}/rocrand/include'.format(ROCM_HOME) - ] + includes += ['{}/hiprand/include'.format(ROCM_HOME), '{}/rocrand/include'.format(ROCM_HOME)] return includes diff --git a/op_builder/transformer_inference.py b/op_builder/transformer_inference.py index 9bb9bbb956b3e181c1b044df6eb422489bb97386..c7b95883cebf6b76ea0f2cb788301647324cb215 100755 --- a/op_builder/transformer_inference.py +++ b/op_builder/transformer_inference.py @@ -1,4 +1,7 @@ -'''Copyright The Microsoft DeepSpeed Team''' +# Copyright (c) Microsoft Corporation. 
+# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team from .builder import CUDAOpBuilder, installed_cuda_version @@ -18,8 +21,7 @@ class InferenceBuilder(CUDAOpBuilder): try: import torch except ImportError: - self.warning( - "Please install torch if trying to pre-compile inference kernels") + self.warning("Please install torch if trying to pre-compile inference kernels") return False cuda_okay = True @@ -28,14 +30,11 @@ class InferenceBuilder(CUDAOpBuilder): torch_cuda_major = int(torch.version.cuda.split('.')[0]) cuda_capability = torch.cuda.get_device_properties(0).major if cuda_capability < 6: - self.warning( - "NVIDIA Inference is only supported on Pascal and newer architectures" - ) + self.warning("NVIDIA Inference is only supported on Pascal and newer architectures") cuda_okay = False if cuda_capability >= 8: if torch_cuda_major < 11 or sys_cuda_major < 11: - self.warning( - "On Ampere and higher architectures please use CUDA 11+") + self.warning("On Ampere and higher architectures please use CUDA 11+") cuda_okay = False return super().is_compatible(verbose) and cuda_okay diff --git a/op_builder/utils.py b/op_builder/utils.py index 02d4daa41680aaeaa9bac923b9c79e704b2c2b17..c1095339777515c7f4da34159017a9d833ffd277 100644 --- a/op_builder/utils.py +++ b/op_builder/utils.py @@ -1,6 +1,8 @@ -""" -Copyright 2020 The Microsoft DeepSpeed Team -""" +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + from .builder import OpBuilder diff --git a/release/bump_patch_version.py b/release/bump_patch_version.py index 40d9badf09ecf9099b6227e198e4830f949c9ee6..84cb45a8eac80d8ceda503784da26aa4e076177f 100644 --- a/release/bump_patch_version.py +++ b/release/bump_patch_version.py @@ -1,4 +1,7 @@ -'''Copyright The Microsoft DeepSpeed Team''' +# Copyright (c) Microsoft Corporation. 
+# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team from packaging import version as pkg_version diff --git a/requirements/requirements-1bit-adam.txt b/requirements/requirements-1bit-adam.txt deleted file mode 100644 index 66c5ba0468f857f33547a9cabf6c928808e38393..0000000000000000000000000000000000000000 --- a/requirements/requirements-1bit-adam.txt +++ /dev/null @@ -1 +0,0 @@ -mpi4py diff --git a/requirements/requirements-dev.txt b/requirements/requirements-dev.txt index a5ee03c61b555264716f36269462d00766626fce..2b93141c22b1c2d48d04ea9d7c891c1afc9d9fcc 100644 --- a/requirements/requirements-dev.txt +++ b/requirements/requirements-dev.txt @@ -1,8 +1,7 @@ -clang-format>=14.0.6 +clang-format==16.0.2 docutils<0.18 future importlib-metadata>=4 -megatron-lm==1.1.5 pre-commit>=2.20.0 pytest pytest-forked diff --git a/requirements/requirements-readthedocs.txt b/requirements/requirements-readthedocs.txt index fcd0ec5a9a6a4846ce7066f21953bb0af695af29..a6d7915e0ea55f42b116024dd10c5b566a0b377d 100644 --- a/requirements/requirements-readthedocs.txt +++ b/requirements/requirements-readthedocs.txt @@ -1,9 +1,9 @@ -autodoc_pydantic +autodoc_pydantic<2.0.0 docutils<0.18 hjson packaging psutil py-cpuinfo -pydantic +pydantic<2.0.0 torch tqdm diff --git a/requirements/requirements-sd.txt b/requirements/requirements-sd.txt index 5151bc1328bc53f130fd127ad3612a2ab0c26e5b..1d96a83c78a0644e2ff3e81ac12024f3ad69bae3 100644 --- a/requirements/requirements-sd.txt +++ b/requirements/requirements-sd.txt @@ -1,2 +1,2 @@ diffusers -#triton==2.0.0.dev20221005 +#triton==2.0.0.dev20221202 diff --git a/requirements/requirements.txt b/requirements/requirements.txt index 080812208bc23eb46faf6e951750404efc45e018..fe9321bd0fba9225d5a525d763dec20fd5fdf01c 100755 --- a/requirements/requirements.txt +++ b/requirements/requirements.txt @@ -4,6 +4,6 @@ numpy packaging>=20.0 psutil py-cpuinfo -pydantic +pydantic<2.0.0 #torch tqdm diff --git a/requirements/run_pip.sh b/requirements/run_pip.sh index 4cfcfe7ed3a70391c5ee28f8518bd54f2dc17054..24aac22395637cbec6881b1c80c773269242096d 100644 --- a/requirements/run_pip.sh +++ b/requirements/run_pip.sh @@ -8,4 +8,3 @@ pip3 install -r requirements-inf.txt pip3 install -r requirements-sd.txt pip3 install -r requirements.txt - diff --git a/run.sh b/run.sh index 193ba16b024776bf2571a62de7e2b4aa8e48ddb1..04d9e963962621ed3c56012f87b4c1d35816ff77 100644 --- a/run.sh +++ b/run.sh @@ -1,4 +1,5 @@ #!/bin/bash -source `pwd`/env.sh +export LIBRARY_PATH=/usr/local/lib:$LIBRARY_PATH export DS_BUILD_VERSION=dtk22.10.1 +export LD_LIBRARY_PATH=/usr/local/lib/python3.7/site-packages/torch/lib:$LD_LIBRARY_PATH DS_BUILD_RANDOM_LTD=0 DS_BUILD_QUANTIZER=0 DS_BUILD_TRANSFORMER_INFERENCE=0 DS_BUILD_OPS=1 verbose=1 CXX=hipcc CC=hipcc python3 setup.py install bdist_wheel diff --git a/scripts/check-license.py b/scripts/check-license.py index 519827d7df670c06bd6aedd1817d3c49f3101f6a..67caa30a3e3fc2606635e97fcf88e829f77a514a 100755 --- a/scripts/check-license.py +++ b/scripts/check-license.py @@ -1,4 +1,9 @@ #!/usr/bin/env python3 +# Copyright (c) Microsoft Corporation. 
+# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + from __future__ import annotations '''Copyright The Microsoft DeepSpeed Team''' """ @@ -13,24 +18,25 @@ def err(s: str) -> None: print(s, file=sys.stderr) +COPYRIGHT = [ + r"^\(\/\/\|#\) Copyright (c) Microsoft Corporation.$", r"^\(\/\/\|#\) SPDX-License-Identifier: Apache-2.0$", + r"^\(\/\/\|#\) DeepSpeed Team$" +] + success = True failures = [] for f in sys.argv[1:]: - res = subprocess.run( - ["git", - "grep", - "--quiet", - "-e", - r"Copyright .* DeepSpeed Team", - f], - capture_output=True) - if res.returncode == 1: - success = False - failures.append(f) - elif res.returncode == 2: - err(f"Error invoking grep on {', '.join(sys.argv[1:])}:") - err(res.stderr.decode("utf-8")) - sys.exit(2) + for copyright_line in COPYRIGHT: + if not success: + break + res = subprocess.run(["git", "grep", "--quiet", "-e", copyright_line, f], capture_output=True) + if res.returncode == 1: + success = False + failures.append(f) + elif res.returncode == 2: + err(f"Error invoking grep on {', '.join(sys.argv[1:])}:") + err(res.stderr.decode("utf-8")) + sys.exit(2) if not success: err(f'{failures}: Missing license at top of file') diff --git a/scripts/check-torchcuda.py b/scripts/check-torchcuda.py new file mode 100755 index 0000000000000000000000000000000000000000..04207173e227f0ef0daa7ec5f7ed1451ee195a98 --- /dev/null +++ b/scripts/check-torchcuda.py @@ -0,0 +1,54 @@ +#!/usr/bin/env python3 +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +from __future__ import annotations +'''Copyright The Microsoft DeepSpeed Team''' +""" +Checks each file in sys.argv for the string "torch.cuda". +Modified from https://github.com/jlebar/pre-commit-hooks/blob/master/check_do_not_submit.py +""" + +import subprocess +import sys + + +def err(s: str) -> None: + print(s, file=sys.stderr) + + +# There are many ways we could search for the string "torch.cuda", but `git +# grep --no-index` is nice because +# - it's very fast (as compared to iterating over the file in Python) +# - we can reasonably assume it's available on all machines +# - unlike plain grep, which is slower and has different flags on MacOS versus +# Linux, git grep is always the same. +res = subprocess.run( + ["git", "grep", "-Hn", "--no-index", "-e", r"torch\.cuda", "--and", "--not", "-e", "#ignore-cuda", *sys.argv[1:]], + capture_output=True, +) +if res.returncode == 0: + err('Error: The string "torch.cuda" was found.\nPlease replace all calls to torch.cuda with "get_accelerator()" and add the following import line:\n\n from deepspeed.accelerator import get_accelerator\n\nIf your code is meant to be cuda specific, please add the following comment in the line with torch.cuda:\n\n #ignore-cuda\n' + ) + err(res.stdout.decode("utf-8")) + sys.exit(1) +elif res.returncode == 2: + err(f"Error invoking grep on {', '.join(sys.argv[1:])}:") + err(res.stderr.decode("utf-8")) + sys.exit(2) + +res = subprocess.run( + ["git", "grep", "-Hn", "--no-index", r"\.cuda()", *sys.argv[1:]], + capture_output=True, +) +if res.returncode == 0: + err('Error: The string ".cuda()" was found. This implies converting a tensor to a cuda tensor. 
Please replace all calls to tensor.cuda() with "tensor.to(get_accelerator().device_name())" and add the following import line:\nfrom deepspeed.accelerator import get_accelerator' + ) + err(res.stdout.decode("utf-8")) + sys.exit(1) +elif res.returncode == 2: + err(f"Error invoking grep on {', '.join(sys.argv[1:])}:") + err(res.stderr.decode("utf-8")) + sys.exit(2) diff --git a/scripts/check-torchdist.py b/scripts/check-torchdist.py index d655b7b9008e2b47ee1b07fe3b4b44a3fc5d5c56..f0328aca6469bf34f3896eb704527bcfedb11534 100755 --- a/scripts/check-torchdist.py +++ b/scripts/check-torchdist.py @@ -1,4 +1,9 @@ #!/usr/bin/env python3 +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + from __future__ import annotations '''Copyright The Microsoft DeepSpeed Team''' """ @@ -21,12 +26,7 @@ def err(s: str) -> None: # - unlike plain grep, which is slower and has different flags on MacOS versus # Linux, git grep is always the same. res = subprocess.run( - ["git", - "grep", - "-Hn", - "--no-index", - r"torch\.distributed", - *sys.argv[1:]], + ["git", "grep", "-Hn", "--no-index", r"torch\.distributed", *sys.argv[1:]], capture_output=True, ) if res.returncode == 0: diff --git a/scripts/replace_copyright.py b/scripts/replace_copyright.py new file mode 100644 index 0000000000000000000000000000000000000000..c0697509d29b927c04e88186edb5ba0ae3bf7b8c --- /dev/null +++ b/scripts/replace_copyright.py @@ -0,0 +1,235 @@ +#!/usr/bin/env python3 +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team +""" +USAGE: +$ python3 script/replace_copyright.py --repo_dir ./ +""" + +import os +import argparse + +NEW_COPYRIGHT = ("Copyright (c) Microsoft Corporation.", "SPDX-License-Identifier: Apache-2.0", "", "DeepSpeed Team") + +PY_SL_COMMENT = "#" +PY_ML_SINGLE = "'''" +PY_ML_DOUBLE = '"""' +PY_COMMENTS = (PY_SL_COMMENT, PY_ML_SINGLE, PY_ML_DOUBLE) + +C_SL_COMMENT = "//" +C_ML_OPEN = "/*" +C_ML_CLOSE = "*/" +C_COMMENTS = (C_SL_COMMENT, C_ML_OPEN, C_ML_CLOSE) + +BASH_SL_COMMENT = "#" +BASH_COMMENTS = (BASH_SL_COMMENT, ) + +DELIM = "|/-\|/-\|BARRIER|/-\|/-\|" # noqa: W605 + + +def parser_args(): + parser = argparse.ArgumentParser() + parser.add_argument("--repo_dir", type=str, help="Repository directory") + parser.add_argument("--python_style_ext", + type=str, + nargs="+", + default=[".py"], + help="File types to process with python-style comments") + parser.add_argument("--bash_style_ext", + type=str, + nargs="+", + default=[".sh"], + help="File types to process with bash-style comments") + parser.add_argument("--c_style_ext", + type=str, + nargs="+", + default=[ + ".c", + ".cpp", + ".cu", + ".h", + ".hpp", + ".cuh", + ".cc", + ".hip", + ".tr", + ], + help="File types to process with C-style comments") + args = parser.parse_args() + return args + + +# These get_header_* functions are ugly, but they work :) +def get_header_py(fp): + with open(fp, "r") as f: + lines = iter(l for l in f.readlines()) + + header = [] + rest = [] + in_multiline = False + multiline_type = None + + while (l := next(lines, None)) is not None: + l = l.strip() + if l.startswith(PY_ML_SINGLE) or l.startswith(PY_ML_DOUBLE): + # Detected multiline comment + if in_multiline and multiline_type == l[:3]: + # Ended a multiline comment + in_multiline = False + else: + # Started a multiline comment + in_multiline = True + multiline_type = l[:3] + if l.endswith(multiline_type) and len(l) >= 6: + # Opened and closed multiline comment on single line + in_multiline = False + 
elif in_multiline and l.endswith(multiline_type): + # Ended a multiline comment + in_multiline = False + elif not (in_multiline or l.startswith(PY_SL_COMMENT) or l == ""): + # Not in a comment + rest += [l + "\n"] + break + header.append(l) + + rest += list(lines) + + return header, rest + + +def get_header_c(fp): + with open(fp, "r") as f: + lines = iter(l for l in f.readlines()) + + header = [] + rest = [] + in_multiline = False + + while (l := next(lines, None)) is not None: + l = l.strip() + if l.startswith(C_ML_OPEN): + # Detected multiline comment + if not l.endswith(C_ML_CLOSE): + # multiline comment not closed on same line + in_multiline = True + elif l.endswith(C_ML_CLOSE): + # Ended a multiline comment + in_multiline = False + elif not (in_multiline or l.startswith(C_SL_COMMENT) or l.isspace()): + # Not in a comment + rest += [l + "\n"] + break + header.append(l) + + rest += list(lines) + + return header, rest + + +def get_header_bash(fp): + with open(fp, "r") as f: + lines = iter(l for l in f.readlines()) + + header = [] + rest = [] + + while (l := next(lines, None)) is not None: + l = l.strip() + if not l.startswith(BASH_SL_COMMENT) or l.isspace(): + # Not in a comment + rest += [l + "\n"] + break + header.append(l) + + rest += list(lines) + + return header, rest + + +def remove_comments(line, comment_strs): + for cstr in comment_strs: + line = line.replace(cstr, "") + return line + + +def format_multiline_comment(text, comment_type): + if comment_type == PY_COMMENTS: + text = f"\n{comment_type[2]}\n" + "\n".join(text) + f"{comment_type[2]}" + if comment_type == C_COMMENTS: + text = f"\n{comment_type[1]}\n" + "\n".join(text) + f"{comment_type[2]}" + if comment_type == BASH_COMMENTS: + text = "\n".join([f"{comment_type[0]}{l}" for l in text]) + return text + + +def modify_file_header(fp, file_header, rest_of_file, preserve_text_store, comment_type): + header_text = "\n".join(file_header) + if not (header_text.strip() == "" or header_text in preserve_text_store): + # Unique header, need to get user input + print("\n", DELIM, "\n") + for idx, line in enumerate(file_header): + print(f"{idx}: {line}") + print("\n", DELIM, "\n") + print("\nIndicate the FIRST line of the Header to KEEP") + print("(shebang #! 
lines will be automatically processed and should not be included).") + keep_idx = input("Enter number (or leave blank if no lines should be preserved): ") + preserve_text_store[header_text] = file_header[int(keep_idx):] if keep_idx != "" else "" + + # Identify any shebang lines in the file + shebang = "\n".join([l for l in file_header if l.startswith("#!")]) + if shebang != "": + shebang += "\n" + + # Get the text we should preserve in this file and process to remove comment characters + text_to_preserve = preserve_text_store.get(header_text, [""]) + text_to_preserve = [remove_comments(l, comment_type) for l in text_to_preserve] + + # Format the text we want to keep into a new multiline comment + if "".join(text_to_preserve) == "": + text_to_preserve = "" + else: + text_to_preserve = format_multiline_comment(text_to_preserve, comment_type) + + # Generate the copyright text we will be adding + copyright_text = "\n".join([f"{comment_type[0]} {l}" if l != "" else l for l in NEW_COPYRIGHT]) + + # Assemble the new header + new_header = shebang + copyright_text + text_to_preserve + + # Write out the new file + new_file_contents = new_header + "\n" + "".join(rest_of_file) + with open(fp, "w") as f: + f.write(new_file_contents) + + return preserve_text_store # Return so we can reuse for future files + + +def main(args): + preserve_text_store = {} # Used to track header comments we should preserve + for root, dirs, fnames in os.walk(args.repo_dir): + # Walk across directory looking for all files with extensions we want to modify + for ext in args.python_style_ext: + fpaths = [os.path.join(root, fn) for fn in fnames if fn.endswith(ext)] + for fp in fpaths: + file_header, rest_of_file = get_header_py(fp) + preserve_text_store = modify_file_header(fp, file_header, rest_of_file, preserve_text_store, + PY_COMMENTS) + for ext in args.c_style_ext: + fpaths = [os.path.join(root, fn) for fn in fnames if fn.endswith(ext)] + for fp in fpaths: + file_header, rest_of_file = get_header_c(fp) + preserve_text_store = modify_file_header(fp, file_header, rest_of_file, preserve_text_store, + C_COMMENTS) + for ext in args.bash_style_ext: + fpaths = [os.path.join(root, fn) for fn in fnames if fn.endswith(ext)] + for fp in fpaths: + file_header, rest_of_file = get_header_bash(fp) + preserve_text_store = modify_file_header(fp, file_header, rest_of_file, preserve_text_store, + BASH_COMMENTS) + + +if __name__ == "__main__": + args = parser_args() + main(args) diff --git a/setup.py b/setup.py index 2c81467e9802092d919569103cd99d19aaa8a543..8b80c8b1599cce6e95a6ccd0fab4813dd22a6d46 100755 --- a/setup.py +++ b/setup.py @@ -1,16 +1,19 @@ -""" -Copyright 2020 The Microsoft DeepSpeed Team +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 +# DeepSpeed Team +""" DeepSpeed library To build wheel on Windows: - 1. Install pytorch, such as pytorch 1.12 + cuda 11.6 - 2. Install visual cpp build tool - 3. Include cuda toolkit - 4. Launch cmd console with Administrator privilege for creating required symlink folders +1. Install pytorch, such as pytorch 1.12 + cuda 11.6. +2. Install visual cpp build tool. +3. Include cuda toolkit. +4. Launch cmd console with Administrator privilege for creating required symlink folders. 
+ Create a new wheel via the following command: - build_win.bat +build_win.bat The wheel will be located at: dist/*.whl """ @@ -34,7 +37,7 @@ from op_builder import get_default_compute_capabilities, OpBuilder from op_builder.all_ops import ALL_OPS from op_builder.builder import installed_cuda_version -# fetch rocm state +# Fetch rocm state. is_rocm_pytorch = OpBuilder.is_rocm_pytorch() rocm_version = OpBuilder.installed_rocm_version() @@ -55,7 +58,7 @@ def fetch_requirements(path): install_requires = fetch_requirements('requirements/requirements.txt') extras_require = { - '1bit': [], # add cupy based on cuda/rocm version + '1bit': [], # add cupy based on cuda/rocm version '1bit_mpi': fetch_requirements('requirements/requirements-1bit-mpi.txt'), 'readthedocs': fetch_requirements('requirements/requirements-readthedocs.txt'), 'dev': fetch_requirements('requirements/requirements-dev.txt'), @@ -66,21 +69,26 @@ extras_require = { 'sd': fetch_requirements('requirements/requirements-sd.txt') } -# Add specific cupy version to both onebit extension variants +# Add specific cupy version to both onebit extension variants. if torch_available and torch.cuda.is_available(): cupy = None if is_rocm_pytorch: rocm_major, rocm_minor = rocm_version - # XXX cupy support for rocm 5 is not available yet + # XXX cupy support for rocm 5 is not available yet. if rocm_major <= 4: cupy = f"cupy-rocm-{rocm_major}-{rocm_minor}" else: - cupy = f"cupy-cuda{''.join(map(str,installed_cuda_version()))}" + cuda_major_ver, cuda_minor_ver = installed_cuda_version() + if (cuda_major_ver < 11) or ((cuda_major_ver == 11) and (cuda_minor_ver < 3)): + cupy = f"cupy-cuda{cuda_major_ver}{cuda_minor_ver}" + else: + cupy = f"cupy-cuda{cuda_major_ver}x" + if cupy: extras_require['1bit'].append(cupy) extras_require['1bit_mpi'].append(cupy) -# Make an [all] extra that installs all needed dependencies +# Make an [all] extra that installs all needed dependencies. all_extras = set() for extra in extras_require.items(): for req in extra[1]: @@ -89,11 +97,10 @@ extras_require['all'] = list(all_extras) cmdclass = {} -# For any pre-installed ops force disable ninja +# For any pre-installed ops force disable ninja. if torch_available: from accelerator import get_accelerator - cmdclass['build_ext'] = get_accelerator().build_extension().with_options( - use_ninja=False) + cmdclass['build_ext'] = get_accelerator().build_extension().with_options(use_ninja=False) if torch_available: TORCH_MAJOR = torch.__version__.split('.')[0] @@ -103,11 +110,10 @@ else: TORCH_MINOR = "0" if torch_available and not torch.cuda.is_available(): - # Fix to allow docker builds, similar to https://github.com/NVIDIA/apex/issues/486 - print( - "[WARNING] Torch did not find cuda available, if cross-compiling or running with cpu only " - "you can ignore this message. Adding compute capability for Pascal, Volta, and Turing " - "(compute capabilities 6.0, 6.1, 6.2)") + # Fix to allow docker builds, similar to https://github.com/NVIDIA/apex/issues/486. + print("[WARNING] Torch did not find cuda available, if cross-compiling or running with cpu only " + "you can ignore this message. 
Adding compute capability for Pascal, Volta, and Turing " + "(compute capabilities 6.0, 6.1, 6.2)") if os.environ.get("TORCH_CUDA_ARCH_LIST", None) is None: os.environ["TORCH_CUDA_ARCH_LIST"] = get_default_compute_capabilities() @@ -150,18 +156,19 @@ for op_name, builder in ALL_OPS.items(): print("op_name: ", op_name) print("op_enabled: ", op_enabled(op_name)) print("op_compatible: ", op_compatible) - # If op is requested but not available, throw an error + + # If op is requested but not available, throw an error. if op_enabled(op_name) and not op_compatible: env_var = op_envvar(op_name) if env_var not in os.environ: builder.warning(f"One can disable {op_name} with {env_var}=0") abort(f"Unable to pre-compile {op_name}") - # if op is compatible but install is not enabled (JIT mode) + # If op is compatible but install is not enabled (JIT mode). if is_rocm_pytorch and op_compatible and not op_enabled(op_name): builder.hipify_extension() - # If op install enabled, add builder to extensions + # If op install enabled, add builder to extensions. if op_enabled(op_name) and op_compatible: assert torch_available, f"Unable to pre-compile {op_name}, please first install torch" install_ops[op_name] = op_enabled(op_name) @@ -169,7 +176,7 @@ for op_name, builder in ALL_OPS.items(): print(f'Install Ops={install_ops}') -# Write out version/git info +# Write out version/git info. git_hash_cmd = "git rev-parse --short HEAD" git_branch_cmd = "git rev-parse --abbrev-ref HEAD" if command_exists('git') and 'DS_BUILD_STRING' not in os.environ: @@ -178,7 +185,7 @@ if command_exists('git') and 'DS_BUILD_STRING' not in os.environ: git_hash = result.decode('utf-8').strip() result = subprocess.check_output(git_branch_cmd, shell=True) git_branch = result.decode('utf-8').strip() -#add dtk version + #add dtk version if os.getenv('DS_BUILD_VERSION'): version_dtk = os.getenv('DS_BUILD_VERSION', "") git_hash += "." + version_dtk @@ -207,38 +214,38 @@ if sys.platform == "win32": create_dir_symlink('..\\accelerator', '.\\deepspeed\\accelerator') egg_info.manifest_maker.template = 'MANIFEST_win.in' -# Parse the DeepSpeed version string from version.txt +# Parse the DeepSpeed version string from version.txt. version_str = open('version.txt', 'r').read().strip() # Build specifiers like .devX can be added at install time. Otherwise, add the git hash. -# example: DS_BUILD_STRING=".dev20201022" python setup.py sdist bdist_wheel +# Example: DS_BUILD_STRING=".dev20201022" python setup.py sdist bdist_wheel. -# Building wheel for distribution, update version file +# Building wheel for distribution, update version file. if 'DS_BUILD_STRING' in os.environ: - # Build string env specified, probably building for distribution + # Build string env specified, probably building for distribution. with open('build.txt', 'w') as fd: fd.write(os.environ.get('DS_BUILD_STRING')) version_str += os.environ.get('DS_BUILD_STRING') elif os.path.isfile('build.txt'): - # build.txt exists, probably installing from distribution + # build.txt exists, probably installing from distribution. with open('build.txt', 'r') as fd: version_str += fd.read().strip() else: - # None of the above, probably installing from source + # None of the above, probably installing from source. version_str += f'+{git_hash}' torch_version = ".".join([TORCH_MAJOR, TORCH_MINOR]) bf16_support = False -# Set cuda_version to 0.0 if cpu-only +# Set cuda_version to 0.0 if cpu-only. 
cuda_version = "0.0" nccl_version = "0.0" -# Set hip_version to 0.0 if cpu-only +# Set hip_version to 0.0 if cpu-only. hip_version = "0.0" if torch_available and torch.version.cuda is not None: cuda_version = ".".join(torch.version.cuda.split('.')[:2]) if sys.platform != "win32": if isinstance(torch.cuda.nccl.version(), int): - # This will break if minor version > 9 + # This will break if minor version > 9. nccl_version = ".".join(str(torch.cuda.nccl.version())[:2]) else: nccl_version = ".".join(map(str, torch.cuda.nccl.version()[:2])) @@ -280,7 +287,7 @@ setup(name='deepspeed', long_description=readme_text, long_description_content_type='text/markdown', author='DeepSpeed Team', - author_email='deepspeed@microsoft.com', + author_email='deepspeed-info@microsoft.com', url='http://deepspeed.ai', project_urls={ 'Documentation': 'https://deepspeed.readthedocs.io', @@ -288,26 +295,17 @@ setup(name='deepspeed', }, install_requires=install_requires, extras_require=extras_require, - #packages=find_packages(include=['deepspeed', - # 'deepspeed.*']), + #packages=find_packages(include=['deepspeed', 'deepspeed.*']), packages=find_namespace_packages(include=['deepspeed', 'deepspeed.*']), include_package_data=True, scripts=[ - 'bin/deepspeed', - 'bin/deepspeed.pt', - 'bin/ds', - 'bin/ds_ssh', - 'bin/ds_report', - 'bin/ds_bench', - 'bin/dsr', + 'bin/deepspeed', 'bin/deepspeed.pt', 'bin/ds', 'bin/ds_ssh', 'bin/ds_report', 'bin/ds_bench', 'bin/dsr', 'bin/ds_elastic' ], classifiers=[ - 'Programming Language :: Python :: 3.6', - 'Programming Language :: Python :: 3.7', - 'Programming Language :: Python :: 3.8', - 'Programming Language :: Python :: 3.9', + 'Programming Language :: Python :: 3.6', 'Programming Language :: Python :: 3.7', + 'Programming Language :: Python :: 3.8', 'Programming Language :: Python :: 3.9', 'Programming Language :: Python :: 3.10' ], license='MIT', diff --git a/tests/accelerator/test_ds_init.py b/tests/accelerator/test_ds_init.py index d535e9918052a9168faa811b56d69b3c8e447e90..9594a6f5ea58a384d182573523c78b01ccec1db9 100644 --- a/tests/accelerator/test_ds_init.py +++ b/tests/accelerator/test_ds_init.py @@ -1,4 +1,8 @@ -'''Copyright The Microsoft DeepSpeed Team''' +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + import os import torch import deepspeed @@ -6,6 +10,7 @@ from deepspeed.accelerator import get_accelerator class OneLayerNet(torch.nn.Module): + def __init__(self, D_in, D_out): """ In the constructor we instantiate two nn.Linear modules and assign them as @@ -38,10 +43,6 @@ def test_literal_device(): string = get_accelerator().device_name() #'xpu' or 'cuda' string0 = get_accelerator().device_name(0) #'xpu:0' or 'cuda:0' string1 = get_accelerator().device_name(1) #'xpu:1' or 'cuda:1' - #aiss - print(string0) - print(string1) - assert string == 'xpu' or string == 'cuda' assert string0 == 'xpu:0' or string0 == 'cuda:0' assert string1 == 'xpu:1' or string1 == 'cuda:1' diff --git a/tests/benchmarks/flatten_bench.py b/tests/benchmarks/flatten_bench.py index 1082554f81d16a8d389a866af49586d4ed140d5e..d404acd5c3440459fe9873d9f38f47f125e1a6b6 100755 --- a/tests/benchmarks/flatten_bench.py +++ b/tests/benchmarks/flatten_bench.py @@ -1,4 +1,7 @@ -'''Copyright The Microsoft DeepSpeed Team''' +# Copyright (c) Microsoft Corporation. 
+# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team #!/usr/bin/env python # run the benchmark under timeit (-t), cProfile (-c), line_profiler (-l) @@ -26,12 +29,9 @@ unflatten = util_ops.unflatten torch.manual_seed(0) # emulate a small typical model weights x = [ - torch.rand((512, - 512)).to(get_accelerator().device_name()), - torch.rand((512, - 1024)).to(get_accelerator().device_name()), - torch.rand((512, - 30000)).to(get_accelerator().device_name()) + torch.rand((512, 512)).to(get_accelerator().device_name()), + torch.rand((512, 1024)).to(get_accelerator().device_name()), + torch.rand((512, 30000)).to(get_accelerator().device_name()) ] t = x * 30 diff --git a/tests/benchmarks/unflatten_bench.py b/tests/benchmarks/unflatten_bench.py index a4a1b63b3dd0a4e5347d93cac365854e0d711013..dade4574458ab393a70f8e7bf211658d7db19ab4 100755 --- a/tests/benchmarks/unflatten_bench.py +++ b/tests/benchmarks/unflatten_bench.py @@ -1,4 +1,7 @@ -'''Copyright The Microsoft DeepSpeed Team''' +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team #!/usr/bin/env python @@ -26,12 +29,9 @@ unflatten = util_ops.unflatten torch.manual_seed(0) # emulate a small typical model weights x = [ - torch.rand((512, - 512)).to(get_accelerator().device_name()), - torch.rand((512, - 1024)).to(get_accelerator().device_name()), - torch.rand((512, - 30000)).to(get_accelerator().device_name()) + torch.rand((512, 512)).to(get_accelerator().device_name()), + torch.rand((512, 1024)).to(get_accelerator().device_name()), + torch.rand((512, 30000)).to(get_accelerator().device_name()) ] unflat_t = x * 30 diff --git a/tests/conftest.py b/tests/conftest.py index 86662993a4fb4f4ba7cb50f26c9bc78cd0fbc955..e5a8cce45fd95b834571c22472ea00dc211ccf1f 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -1,4 +1,7 @@ -'''Copyright The Microsoft DeepSpeed Team''' +# Copyright (c) Microsoft Corporation. 
+# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team # tests directory-specific settings - this file is run automatically by pytest before any tests are run @@ -18,6 +21,13 @@ git_repo_path = abspath(join(dirname(dirname(__file__)), "src")) sys.path.insert(1, git_repo_path) +def pytest_configure(config): + config.option.color = "yes" + config.option.durations = 0 + config.option.durations_min = 1 + config.option.verbose = True + + def pytest_addoption(parser): parser.addoption("--torch_ver", default=None, type=str) parser.addoption("--cuda_ver", default=None, type=str) @@ -35,16 +45,14 @@ def check_environment(pytestconfig): expected_cuda_version = pytestconfig.getoption("cuda_ver") if expected_torch_version is None: warnings.warn( - "Running test without verifying torch version, please provide an expected torch version with --torch_ver" - ) + "Running test without verifying torch version, please provide an expected torch version with --torch_ver") elif not validate_version(expected_torch_version, torch.__version__): pytest.exit( f"expected torch version {expected_torch_version} did not match found torch version {torch.__version__}", returncode=2) if expected_cuda_version is None: warnings.warn( - "Running test without verifying cuda version, please provide an expected cuda version with --cuda_ver" - ) + "Running test without verifying cuda version, please provide an expected cuda version with --cuda_ver") elif not validate_version(expected_cuda_version, torch.version.cuda): pytest.exit( f"expected cuda version {expected_cuda_version} did not match found cuda version {torch.version.cuda}", diff --git a/tests/hybrid_engine/hybrid_engine_config.json b/tests/hybrid_engine/hybrid_engine_config.json new file mode 100644 index 0000000000000000000000000000000000000000..1d418ae8e019ea365572d4dde6384f35c98962fe --- /dev/null +++ b/tests/hybrid_engine/hybrid_engine_config.json @@ -0,0 +1,19 @@ +{ + "train_batch_size" : 32, + "train_micro_batch_size_per_gpu": 2, + "steps_per_print": 10, + "zero_optimization": { + "stage": 0, + "offload_param": { + "device": "cpu" + }, + "stage3_param_persistence_threshold": 0 + }, + "fp16":{ + "enabled": true, + "loss_scale_window": 100 + }, + "gradient_clipping": 1.0, + "prescale_gradients": false, + "wall_clock_breakdown" : false +} diff --git a/tests/hybrid_engine/hybrid_engine_test.py b/tests/hybrid_engine/hybrid_engine_test.py new file mode 100644 index 0000000000000000000000000000000000000000..1b8958a387e741615caa82cb411cb0f89ab67d0f --- /dev/null +++ b/tests/hybrid_engine/hybrid_engine_test.py @@ -0,0 +1,30 @@ +# Copyright (c) Microsoft Corporation. 
+# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +import torch + +from transformers import AutoModelForCausalLM +import deepspeed +import argparse +from deepspeed.accelerator import get_accelerator + +deepspeed.runtime.utils.see_memory_usage('pre test', force=True) + +model = AutoModelForCausalLM.from_pretrained('facebook/opt-350M').half().to(get_accelerator().device_name()) +parser = argparse.ArgumentParser() +parser = deepspeed.add_config_arguments(parser) +args = parser.parse_args() + +deepspeed.runtime.utils.see_memory_usage('post test', force=True) + +m, _, _, _ = deepspeed.initialize(model=model, args=args, enable_hybrid_engine=True) + +m.eval() +input = torch.ones(1, 16, device='cuda', dtype=torch.long) +out = m(input) + +m.train() +out = m(input) +print(out['logits'], out['logits'].norm()) diff --git a/tests/lightning/test_simple.py b/tests/lightning/test_simple.py index c78768bc707a3ee6723e4f17fd8ce98dbe91a7ba..01f3f4e96571786972c6f24bf15464f0ec9da613 100644 --- a/tests/lightning/test_simple.py +++ b/tests/lightning/test_simple.py @@ -1,4 +1,7 @@ -'''Copyright The Microsoft DeepSpeed Team''' +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team import torch from pytorch_lightning import LightningModule, Trainer @@ -7,6 +10,7 @@ from torch.utils.data import DataLoader, Dataset class RandomDataset(Dataset): + def __init__(self, size, length): self.len = length self.data = torch.randn(length, size) @@ -19,6 +23,7 @@ class RandomDataset(Dataset): class BoringModel(LightningModule): + def __init__(self): super().__init__() self.layer = torch.nn.Linear(32, 2) @@ -53,5 +58,5 @@ def test_lightning_model(): """Test that DeepSpeed works with a simple LightningModule and LightningDataModule.""" model = BoringModel() - trainer = Trainer(strategy=DeepSpeedStrategy(), max_epochs=1, precision=16, gpus=1) + trainer = Trainer(strategy=DeepSpeedStrategy(), max_epochs=1, precision=16, accelerator="gpu", devices=1) trainer.fit(model) diff --git a/tests/lightning_logs/version_0/checkpoints/epoch=0-step=32.ckpt/checkpoint/mp_rank_00_model_states.pt b/tests/lightning_logs/version_0/checkpoints/epoch=0-step=32.ckpt/checkpoint/mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..a7c499e0f4327329e7b2139b72b70347fef4aece Binary files /dev/null and b/tests/lightning_logs/version_0/checkpoints/epoch=0-step=32.ckpt/checkpoint/mp_rank_00_model_states.pt differ diff --git a/tests/lightning_logs/version_0/checkpoints/epoch=0-step=32.ckpt/checkpoint/zero_pp_rank_0_mp_rank_00_optim_states.pt b/tests/lightning_logs/version_0/checkpoints/epoch=0-step=32.ckpt/checkpoint/zero_pp_rank_0_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..a9e787e252292b183231fde18c32a8973231648d Binary files /dev/null and b/tests/lightning_logs/version_0/checkpoints/epoch=0-step=32.ckpt/checkpoint/zero_pp_rank_0_mp_rank_00_optim_states.pt differ diff --git a/tests/lightning_logs/version_0/checkpoints/epoch=0-step=32.ckpt/latest b/tests/lightning_logs/version_0/checkpoints/epoch=0-step=32.ckpt/latest new file mode 100644 index 0000000000000000000000000000000000000000..61f429046c2578bdcf22fb3530be24f8c2bca3c4 --- /dev/null +++ b/tests/lightning_logs/version_0/checkpoints/epoch=0-step=32.ckpt/latest @@ -0,0 +1 @@ +checkpoint \ No newline at end of file diff --git a/tests/lightning_logs/version_0/checkpoints/epoch=0-step=32.ckpt/zero_to_fp32.py 
b/tests/lightning_logs/version_0/checkpoints/epoch=0-step=32.ckpt/zero_to_fp32.py new file mode 100755 index 0000000000000000000000000000000000000000..c5246ff52274e1d6142001ccf085186d3545ce57 --- /dev/null +++ b/tests/lightning_logs/version_0/checkpoints/epoch=0-step=32.ckpt/zero_to_fp32.py @@ -0,0 +1,578 @@ +#!/usr/bin/env python + +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +# This script extracts fp32 consolidated weights from a zero 2 and 3 DeepSpeed checkpoints. It gets +# copied into the top level checkpoint dir, so the user can easily do the conversion at any point in +# the future. Once extracted, the weights don't require DeepSpeed and can be used in any +# application. +# +# example: python zero_to_fp32.py . pytorch_model.bin + +import argparse +import torch +import glob +import math +import os +import re +from collections import OrderedDict +from dataclasses import dataclass + +# while this script doesn't use deepspeed to recover data, since the checkpoints are pickled with +# DeepSpeed data structures it has to be available in the current python environment. +from deepspeed.utils import logger +from deepspeed.checkpoint.constants import (DS_VERSION, OPTIMIZER_STATE_DICT, SINGLE_PARTITION_OF_FP32_GROUPS, + FP32_FLAT_GROUPS, ZERO_STAGE, PARTITION_COUNT, PARAM_SHAPES, BUFFER_NAMES, + FROZEN_PARAM_SHAPES, FROZEN_PARAM_FRAGMENTS) + + +@dataclass +class zero_model_state: + buffers: dict() + param_shapes: dict() + shared_params: list + ds_version: int + frozen_param_shapes: dict() + frozen_param_fragments: dict() + + +debug = 0 + +# load to cpu +device = torch.device('cpu') + + +def atoi(text): + return int(text) if text.isdigit() else text + + +def natural_keys(text): + ''' + alist.sort(key=natural_keys) sorts in human order + http://nedbatchelder.com/blog/200712/human_sorting.html + (See Toothy's implementation in the comments) + ''' + return [atoi(c) for c in re.split(r'(\d+)', text)] + + +def get_model_state_file(checkpoint_dir, zero_stage): + if not os.path.isdir(checkpoint_dir): + raise FileNotFoundError(f"Directory '{checkpoint_dir}' doesn't exist") + + # there should be only one file + if zero_stage == 2: + file = os.path.join(checkpoint_dir, "mp_rank_00_model_states.pt") + elif zero_stage == 3: + file = os.path.join(checkpoint_dir, "zero_pp_rank_0_mp_rank_00_model_states.pt") + + if not os.path.exists(file): + raise FileNotFoundError(f"can't find model states file at '{file}'") + + return file + + +def get_checkpoint_files(checkpoint_dir, glob_pattern): + # XXX: need to test that this simple glob rule works for multi-node setup too + ckpt_files = sorted(glob.glob(os.path.join(checkpoint_dir, glob_pattern)), key=natural_keys) + + if len(ckpt_files) == 0: + raise FileNotFoundError(f"can't find {glob_pattern} files in directory '{checkpoint_dir}'") + + return ckpt_files + + +def get_optim_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_optim_states.pt") + + +def get_model_state_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_model_states.pt") + + +def parse_model_states(files): + zero_model_states = [] + for file in files: + state_dict = torch.load(file, map_location=device) + + if BUFFER_NAMES not in state_dict: + raise ValueError(f"{file} is not a model state checkpoint") + buffer_names = state_dict[BUFFER_NAMES] + if debug: + print("Found buffers:", buffer_names) + + # recover just the buffers while restoring them to fp32 if they were saved in fp16 + buffers = {k: 
v.float() for k, v in state_dict["module"].items() if k in buffer_names} + param_shapes = state_dict[PARAM_SHAPES] + + # collect parameters that are included in param_shapes + param_names = [] + for s in param_shapes: + for name in s.keys(): + param_names.append(name) + + # update with frozen parameters + frozen_param_shapes = state_dict.get(FROZEN_PARAM_SHAPES, None) + if frozen_param_shapes is not None: + if debug: + print(f"Found frozen_param_shapes: {frozen_param_shapes}") + param_names += list(frozen_param_shapes.keys()) + + # handle shared params + shared_params = [[k, v] for k, v in state_dict["shared_params"].items()] + + ds_version = state_dict.get(DS_VERSION, None) + + frozen_param_fragments = state_dict.get(FROZEN_PARAM_FRAGMENTS, None) + + z_model_state = zero_model_state(buffers=buffers, + param_shapes=param_shapes, + shared_params=shared_params, + ds_version=ds_version, + frozen_param_shapes=frozen_param_shapes, + frozen_param_fragments=frozen_param_fragments) + zero_model_states.append(z_model_state) + + return zero_model_states + + +def parse_optim_states(files, ds_checkpoint_dir): + + total_files = len(files) + state_dicts = [] + for f in files: + state_dicts.append(torch.load(f, map_location=device)) + + if not ZERO_STAGE in state_dicts[0][OPTIMIZER_STATE_DICT]: + raise ValueError(f"{files[0]} is not a zero checkpoint") + zero_stage = state_dicts[0][OPTIMIZER_STATE_DICT][ZERO_STAGE] + world_size = state_dicts[0][OPTIMIZER_STATE_DICT][PARTITION_COUNT] + + # For ZeRO-2 each param group can have different partition_count as data parallelism for expert + # parameters can be different from data parallelism for non-expert parameters. So we can just + # use the max of the partition_count to get the dp world_size. + + if type(world_size) is list: + world_size = max(world_size) + + if world_size != total_files: + raise ValueError( + f"Expected {world_size} of '*_optim_states.pt' under '{ds_checkpoint_dir}' but found {total_files} files. " + "Possibly due to an overwrite of an old checkpoint, or a checkpoint didn't get saved by one or more processes." 
+ ) + + # the groups are named differently in each stage + if zero_stage == 2: + fp32_groups_key = SINGLE_PARTITION_OF_FP32_GROUPS + elif zero_stage == 3: + fp32_groups_key = FP32_FLAT_GROUPS + else: + raise ValueError(f"unknown zero stage {zero_stage}") + + if zero_stage == 2: + fp32_flat_groups = [state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key] for i in range(len(state_dicts))] + elif zero_stage == 3: + # if there is more than one param group, there will be multiple flattened tensors - one + # flattened tensor per group - for simplicity merge them into a single tensor + # + # XXX: could make the script more memory efficient for when there are multiple groups - it + # will require matching the sub-lists of param_shapes for each param group flattened tensor + + fp32_flat_groups = [ + torch.cat(state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key], 0) for i in range(len(state_dicts)) + ] + + return zero_stage, world_size, fp32_flat_groups + + +def _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir): + """ + Returns fp32 state_dict reconstructed from ds checkpoint + + Args: + - ``ds_checkpoint_dir``: path to the deepspeed checkpoint folder (where the optimizer files are) + + """ + print(f"Processing zero checkpoint '{ds_checkpoint_dir}'") + + optim_files = get_optim_files(ds_checkpoint_dir) + zero_stage, world_size, fp32_flat_groups = parse_optim_states(optim_files, ds_checkpoint_dir) + print(f"Detected checkpoint of type zero stage {zero_stage}, world_size: {world_size}") + + model_files = get_model_state_files(ds_checkpoint_dir) + + zero_model_states = parse_model_states(model_files) + print(f'Parsing checkpoint created by deepspeed=={zero_model_states[0].ds_version}') + + if zero_stage == 2: + return _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states) + elif zero_stage == 3: + return _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states) + + +def _zero2_merge_frozen_params(state_dict, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + frozen_param_fragments = zero_model_states[0].frozen_param_fragments + + if debug: + num_elem = sum(s.numel() for s in frozen_param_shapes.values()) + print(f'rank 0: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in frozen_param_fragments.values()]) + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + state_dict[name] = frozen_param_fragments[name] + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +def _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + + # Reconstruction protocol: + # + # XXX: document this + + if debug: + for i in range(world_size): + for j in range(len(fp32_flat_groups[0])): + 
print(f"{FP32_FLAT_GROUPS}[{i}][{j}].shape={fp32_flat_groups[i][j].shape}") + + # XXX: memory usage doubles here (zero2) + num_param_groups = len(fp32_flat_groups[0]) + merged_single_partition_of_fp32_groups = [] + for i in range(num_param_groups): + merged_partitions = [sd[i] for sd in fp32_flat_groups] + full_single_fp32_vector = torch.cat(merged_partitions, 0) + merged_single_partition_of_fp32_groups.append(full_single_fp32_vector) + avail_numel = sum( + [full_single_fp32_vector.numel() for full_single_fp32_vector in merged_single_partition_of_fp32_groups]) + + if debug: + wanted_params = sum([len(shapes) for shapes in param_shapes]) + wanted_numel = sum([sum(shape.numel() for shape in shapes.values()) for shapes in param_shapes]) + # not asserting if there is a mismatch due to possible padding + print(f"Have {avail_numel} numels to process.") + print(f"Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + total_numel = 0 + total_params = 0 + for shapes, full_single_fp32_vector in zip(param_shapes, merged_single_partition_of_fp32_groups): + offset = 0 + avail_numel = full_single_fp32_vector.numel() + for name, shape in shapes.items(): + + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + total_params += 1 + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + state_dict[name] = full_single_fp32_vector.narrow(0, offset, unpartitioned_numel).view(shape) + offset += unpartitioned_numel + + # Z2 started to align to 2*world_size to improve nccl performance. Therefore both offset and + # avail_numel can differ by anywhere between 0..2*world_size. 
Due to two unrelated complex + # paddings performed in the code it's almost impossible to predict the exact numbers w/o the + # live optimizer object, so we are checking that the numbers are within the right range + align_to = 2 * world_size + + def zero2_align(x): + return align_to * math.ceil(x / align_to) + + if debug: + print(f"original offset={offset}, avail_numel={avail_numel}") + + offset = zero2_align(offset) + avail_numel = zero2_align(avail_numel) + + if debug: + print(f"aligned offset={offset}, avail_numel={avail_numel}") + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + _zero2_merge_frozen_params(state_dict, zero_model_states) + + _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def zero3_partitioned_param_info(unpartitioned_numel, world_size): + remainder = unpartitioned_numel % world_size + padding_numel = (world_size - remainder) if remainder else 0 + partitioned_numel = math.ceil(unpartitioned_numel / world_size) + return partitioned_numel, padding_numel + + +def _zero3_merge_frozen_params(state_dict, world_size, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + if debug: + for i in range(world_size): + num_elem = sum(s.numel() for s in zero_model_states[i].frozen_param_fragments.values()) + print(f'rank {i}: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in zero_model_states[0].frozen_param_fragments.values()]) * world_size + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in zero_model_states[0].frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + param_frags = tuple(model_state.frozen_param_fragments[name] for model_state in zero_model_states) + state_dict[name] = torch.cat(param_frags, 0).narrow(0, 0, unpartitioned_numel).view(shape) + + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Frozen params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +def _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + avail_numel = fp32_flat_groups[0].numel() * world_size + # Reconstruction protocol: For zero3 we need to zip the 
partitions together at boundary of each + # param, re-consolidating each param, while dealing with padding if any + + # merge list of dicts, preserving order + param_shapes = {k: v for d in param_shapes for k, v in d.items()} + + if debug: + for i in range(world_size): + print(f"{FP32_FLAT_GROUPS}[{i}].shape={fp32_flat_groups[i].shape}") + + wanted_params = len(param_shapes) + wanted_numel = sum(shape.numel() for shape in param_shapes.values()) + # not asserting if there is a mismatch due to possible padding + avail_numel = fp32_flat_groups[0].numel() * world_size + print(f"Trainable params: Have {avail_numel} numels to process.") + print(f"Trainable params: Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + offset = 0 + total_numel = 0 + total_params = 0 + for name, shape in param_shapes.items(): + + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + total_params += 1 + + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Trainable params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + # XXX: memory usage doubles here + state_dict[name] = torch.cat( + tuple(fp32_flat_groups[i].narrow(0, offset, partitioned_numel) for i in range(world_size)), + 0).narrow(0, 0, unpartitioned_numel).view(shape) + offset += partitioned_numel + + offset *= world_size + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed Trainable fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + _zero3_merge_frozen_params(state_dict, world_size, zero_model_states) + + _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag=None): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated state_dict that can be loaded with + ``load_state_dict()`` and used for training without DeepSpeed or shared with others, for example + via a model hub. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in 'latest' file. e.g., ``global_step14`` + + Returns: + - pytorch ``state_dict`` + + Note: this approach may not work if your application doesn't have sufficient free CPU memory and + you may need to use the offline approach using the ``zero_to_fp32.py`` script that is saved with + the checkpoint. 
+ + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + # do the training and checkpoint saving + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir) # already on cpu + model = model.cpu() # move to cpu + model.load_state_dict(state_dict) + # submit to model hub or save the model to share with others + + In this example the ``model`` will no longer be usable in the deepspeed context of the same + application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + If you want it all done for you, use ``load_state_dict_from_zero_checkpoint`` instead. + + """ + if tag is None: + latest_path = os.path.join(checkpoint_dir, 'latest') + if os.path.isfile(latest_path): + with open(latest_path, 'r') as fd: + tag = fd.read().strip() + else: + raise ValueError(f"Unable to find 'latest' file at {latest_path}") + + ds_checkpoint_dir = os.path.join(checkpoint_dir, tag) + + if not os.path.isdir(ds_checkpoint_dir): + raise FileNotFoundError(f"Directory '{ds_checkpoint_dir}' doesn't exist") + + return _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir) + + +def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir, output_file, tag=None): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` file that can be + loaded with ``torch.load(file)`` + ``load_state_dict()`` and used for training without DeepSpeed. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``output_file``: path to the pytorch fp32 state_dict output file (e.g. path/pytorch_model.bin) + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + """ + + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag) + print(f"Saving fp32 state dict to {output_file}") + torch.save(state_dict, output_file) + + +def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None): + """ + 1. Put the provided model to cpu + 2. Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` + 3. Load it into the provided model + + Args: + - ``model``: the model object to update + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + + Returns: + - ``model`: modified model + + Make sure you have plenty of CPU memory available before you call this function. If you don't + have enough use the ``zero_to_fp32.py`` utility to do the conversion. You will find it + conveniently placed for you in the checkpoint folder. + + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint + model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir) + # submit to model hub or save the model to share with others + + Note, that once this was run, the ``model`` will no longer be usable in the deepspeed context + of the same application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. 
+ + """ + logger.info(f"Extracting fp32 weights") + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag) + + logger.info(f"Overwriting model with fp32 weights") + model = model.cpu() + model.load_state_dict(state_dict, strict=False) + + return model + + +if __name__ == "__main__": + + parser = argparse.ArgumentParser() + parser.add_argument("checkpoint_dir", + type=str, + help="path to the desired checkpoint folder, e.g., path/checkpoint-12") + parser.add_argument( + "output_file", + type=str, + help="path to the pytorch fp32 state_dict output file (e.g. path/checkpoint-12/pytorch_model.bin)") + parser.add_argument("-d", "--debug", action='store_true', help="enable debug") + args = parser.parse_args() + + debug = args.debug + + convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir, args.output_file) diff --git a/tests/lightning_logs/version_0/events.out.tfevents.1685169908.26388537c721.608516.0 b/tests/lightning_logs/version_0/events.out.tfevents.1685169908.26388537c721.608516.0 new file mode 100644 index 0000000000000000000000000000000000000000..ba70ec781aa8fcfb557c52c3f71008a116d12f86 Binary files /dev/null and b/tests/lightning_logs/version_0/events.out.tfevents.1685169908.26388537c721.608516.0 differ diff --git a/tests/lightning_logs/version_0/hparams.yaml b/tests/lightning_logs/version_0/hparams.yaml new file mode 100644 index 0000000000000000000000000000000000000000..0967ef424bce6791893e9a57bb952f80fd536e93 --- /dev/null +++ b/tests/lightning_logs/version_0/hparams.yaml @@ -0,0 +1 @@ +{} diff --git a/tests/model/BingBertSquad/BingBertSquad_run_func_test.py b/tests/model/BingBertSquad/BingBertSquad_run_func_test.py index 828771cd324b1902f80fd4f9b5cb368760920b30..d98091a8bdf5cd2e40fd0d43c2d166061dd301ac 100755 --- a/tests/model/BingBertSquad/BingBertSquad_run_func_test.py +++ b/tests/model/BingBertSquad/BingBertSquad_run_func_test.py @@ -1,6 +1,10 @@ -# Copyright (c) 2019, The Microsoft DeepSpeed Team. All rights reserved. -# -# Note: please copy webtext data to "Megatron-LM" folder, before running this script. +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team +""" +Note: please copy webtext data to "Megatron-LM" folder, before running this script. +""" import unittest import os @@ -28,6 +32,7 @@ def grep_loss_from_file(file_name): class BingBertSquadFuncTestCase(BaseTestCase): + def __init__(self, methodName="DeepSpeed function test on BingBertSquad model"): super(BingBertSquadFuncTestCase, self).__init__(methodName) @@ -112,8 +117,7 @@ class BingBertSquadFuncTestCase(BaseTestCase): prefix = "BingBertSquad_func" test_config['other_args'] += f" --max_steps {test_config['max_steps']}" - test_config[ - 'other_args'] += f" --max_steps_per_epoch {test_config['max_epoch_steps']}" + test_config['other_args'] += f" --max_steps_per_epoch {test_config['max_epoch_steps']}" # baseline run... test_config["deepspeed"] = False diff --git a/tests/model/BingBertSquad/BingBertSquad_test_common.py b/tests/model/BingBertSquad/BingBertSquad_test_common.py index b6069d76e69a6518e5abe99afb38f4e8cc10426e..ef42f85cc945601a48fb6bc6c1df6b6c7d46a6b6 100755 --- a/tests/model/BingBertSquad/BingBertSquad_test_common.py +++ b/tests/model/BingBertSquad/BingBertSquad_test_common.py @@ -1,5 +1,7 @@ -# Copyright (c) 2019, The Microsoft DeepSpeed Team. All rights reserved. -# +# Copyright (c) Microsoft Corporation. 
+# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team import unittest import subprocess @@ -8,6 +10,7 @@ import time class BaseTestCase(unittest.TestCase): + def __init__(self, methodName="DeepSpeed performance test"): super(BaseTestCase, self).__init__(methodName) self.test_dir = "./test" @@ -23,10 +26,7 @@ class BaseTestCase(unittest.TestCase): other_args = "_" + other_args if test_config["deepspeed"]: - file_name = "_gpu{0}_{1}_ds{2}-{3}.log".format(test_config["gpus"], - other_args, - zero_args, - self.timestr) + file_name = "_gpu{0}_{1}_ds{2}-{3}.log".format(test_config["gpus"], other_args, zero_args, self.timestr) save_dir = self.test_dir else: file_name = "_gpu{0}_{1}.log".format(test_config["gpus"], other_args) @@ -46,22 +46,12 @@ class BaseTestCase(unittest.TestCase): time.sleep(20) def run_BingBertSquad_test(self, test_config, output): - ds_flag = " -d --deepspeed_config " + test_config["json"] if test_config[ - "deepspeed"] else " " - other_args = " " + test_config[ - "other_args"] if "other_args" in test_config else " " + ds_flag = " -d --deepspeed_config " + test_config["json"] if test_config["deepspeed"] else " " + other_args = " " + test_config["other_args"] if "other_args" in test_config else " " - cmd = "./run_BingBertSquad_sanity.sh -e 1 -g {0} {1} {2}".format( - test_config["gpus"], - other_args, - ds_flag) + cmd = "./run_BingBertSquad_sanity.sh -e 1 -g {0} {1} {2}".format(test_config["gpus"], other_args, ds_flag) self.ensure_directory_exists(output) with open(output, "w") as f: print(cmd) - subprocess.run(cmd, - shell=True, - check=False, - executable='/bin/bash', - stdout=f, - stderr=f) + subprocess.run(cmd, shell=True, check=False, executable='/bin/bash', stdout=f, stderr=f) diff --git a/tests/model/BingBertSquad/__init__.py b/tests/model/BingBertSquad/__init__.py index e122adbdfddecd2663c881a24bdc5b775fae1f75..3b0e64cad41fc60d0d0ebf680e1db659b2a993e2 100755 --- a/tests/model/BingBertSquad/__init__.py +++ b/tests/model/BingBertSquad/__init__.py @@ -1,4 +1,7 @@ -# Copyright (c) 2019, The Microsoft DeepSpeed Team. All rights reserved. +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team from .BingBertSquad_run_func_test import BingBertSquadFuncTestCase from .BingBertSquad_run_func_test import suite diff --git a/tests/model/BingBertSquad/test_e2e_squad.py b/tests/model/BingBertSquad/test_e2e_squad.py index 7dfd718bc6bd8cae0fb9428d05f18fb6d0c92204..9312dc67a19319244b1f54d26f6b4a451abf7d1b 100644 --- a/tests/model/BingBertSquad/test_e2e_squad.py +++ b/tests/model/BingBertSquad/test_e2e_squad.py @@ -1,4 +1,7 @@ -'''Copyright The Microsoft DeepSpeed Team''' +# Copyright (c) Microsoft Corporation. 
+# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team import subprocess as sp import os @@ -63,16 +66,7 @@ def test_e2e_squad_deepspeed_base(tmpdir): output_dir = os.path.join(tmpdir, "output") pred_file = os.path.join(output_dir, pred_file_name) - proc = sp.Popen([ - "bash", - script_file_name, - num_gpus, - model_file, - squad_dir, - output_dir, - config_file - ], - cwd=base_dir) + proc = sp.Popen(["bash", script_file_name, num_gpus, model_file, squad_dir, output_dir, config_file], cwd=base_dir) try: proc.communicate(timeout=timeout_sec) @@ -82,9 +76,7 @@ def test_e2e_squad_deepspeed_base(tmpdir): print("evaluation result: ", json.dumps(eval_result)) - assert isclose(eval_result["exact_match"], - expected_exact_match, - abs_tol=1e-2) + assert isclose(eval_result["exact_match"], expected_exact_match, abs_tol=1e-2) assert isclose(eval_result["f1"], expected_f1, abs_tol=1e-2) else: @@ -110,16 +102,7 @@ def test_e2e_squad_deepspeed_zero(tmpdir): output_dir = os.path.join(tmpdir, "output") pred_file = os.path.join(output_dir, pred_file_name) - proc = sp.Popen([ - "bash", - script_file_name, - num_gpus, - model_file, - squad_dir, - output_dir, - config_file - ], - cwd=base_dir) + proc = sp.Popen(["bash", script_file_name, num_gpus, model_file, squad_dir, output_dir, config_file], cwd=base_dir) try: proc.communicate(timeout=timeout_sec) @@ -129,9 +112,7 @@ def test_e2e_squad_deepspeed_zero(tmpdir): print("evaluation result: ", json.dumps(eval_result)) - assert isclose(eval_result["exact_match"], - expected_exact_match, - abs_tol=1e-2) + assert isclose(eval_result["exact_match"], expected_exact_match, abs_tol=1e-2) assert isclose(eval_result["f1"], expected_f1, abs_tol=1e-2) else: diff --git a/tests/model/Megatron_GPT2/__init__.py b/tests/model/Megatron_GPT2/__init__.py index 2451ec7ae5bf8768066b6091362dda8dc9efc3b5..4180edf94aa565849adc6628cbd17c53364aa923 100644 --- a/tests/model/Megatron_GPT2/__init__.py +++ b/tests/model/Megatron_GPT2/__init__.py @@ -1,6 +1,10 @@ -# Copyright (c) 2019, The Microsoft DeepSpeed Team. All rights reserved. -# -# Note: please copy webtext data to "Megatron-LM" folder, before running this script. +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team +""" +Note: please copy webtext data to "Megatron-LM" folder, before running this script. +""" from .run_func_test import GPT2FuncTestCase from .run_checkpoint_test import GPT2CheckpointTestCase, checkpoint_suite diff --git a/tests/model/Megatron_GPT2/run_checkpoint_test.py b/tests/model/Megatron_GPT2/run_checkpoint_test.py index 628547ef2f14f8120ddd84fdd90bed765257fed7..d97a28ff1ad5626afbcb246f41f1102fff43d1be 100755 --- a/tests/model/Megatron_GPT2/run_checkpoint_test.py +++ b/tests/model/Megatron_GPT2/run_checkpoint_test.py @@ -1,6 +1,10 @@ -# Copyright (c) 2019, The Microsoft DeepSpeed Team. All rights reserved. -# -# Note: please copy webtext data to "Megatron-LM" folder, before running this script. +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team +""" +Note: please copy webtext data to "Megatron-LM" folder, before running this script. 
+""" import unittest import subprocess @@ -39,6 +43,7 @@ def grep_loss_from_file(file_name): class GPT2CheckpointTestCase(BaseTestCase): + def __init__(self, methodName="DeepSpeed function test on GPT2 model"): super(GPT2CheckpointTestCase, self).__init__(methodName) @@ -480,8 +485,7 @@ class GPT2CheckpointTestCase(BaseTestCase): #-----------------Loading Checkpoint-----------------# # building checkpoint arguments - test_config[ - "other_args"] = f"\"--load {checkpoint_folder} {cpu_optimizer_flag} \"" + test_config["other_args"] = f"\"--load {checkpoint_folder} {cpu_optimizer_flag} \"" # set checkpoint load iteration try: @@ -543,24 +547,20 @@ def checkpoint_suite(): # Shrink DP suite.addTest(GPT2CheckpointTestCase('test_mp1_gpu2_load_gpu1_node1_with_zero1')) suite.addTest(GPT2CheckpointTestCase('test_mp1_gpu2_load_gpu1_node1_with_zero2')) - suite.addTest( - GPT2CheckpointTestCase('test_mp1_gpu2_load_gpu1_node1_with_zero2_offload')) + suite.addTest(GPT2CheckpointTestCase('test_mp1_gpu2_load_gpu1_node1_with_zero2_offload')) suite.addTest(GPT2CheckpointTestCase('test_mp2_gpu4_load_gpu2_node1_with_zero1')) suite.addTest(GPT2CheckpointTestCase('test_mp2_gpu4_load_gpu2_node1_with_zero2')) - suite.addTest( - GPT2CheckpointTestCase('test_mp2_gpu4_load_gpu2_node1_with_zero2_offload')) + suite.addTest(GPT2CheckpointTestCase('test_mp2_gpu4_load_gpu2_node1_with_zero2_offload')) # Expand DP suite.addTest(GPT2CheckpointTestCase('test_mp1_gpu2_load_gpu4_node1_with_zero1')) suite.addTest(GPT2CheckpointTestCase('test_mp1_gpu2_load_gpu4_node1_with_zero2')) - suite.addTest( - GPT2CheckpointTestCase('test_mp1_gpu2_load_gpu4_node1_with_zero2_offload')) + suite.addTest(GPT2CheckpointTestCase('test_mp1_gpu2_load_gpu4_node1_with_zero2_offload')) suite.addTest(GPT2CheckpointTestCase('test_mp2_gpu2_load_gpu4_node1_with_zero1')) suite.addTest(GPT2CheckpointTestCase('test_mp2_gpu2_load_gpu4_node1_with_zero2')) - suite.addTest( - GPT2CheckpointTestCase('test_mp2_gpu2_load_gpu4_node1_with_zero2_offload')) + suite.addTest(GPT2CheckpointTestCase('test_mp2_gpu2_load_gpu4_node1_with_zero2_offload')) suite.addTest(GPT2CheckpointTestCase('test_mp2_gpu4_node1_without_zero')) diff --git a/tests/model/Megatron_GPT2/run_func_test.py b/tests/model/Megatron_GPT2/run_func_test.py index 78a685e0f0e275088c0d86afe28063b5c0454c6f..0f5ad12c7c73e48555181cf490630ead016e1554 100755 --- a/tests/model/Megatron_GPT2/run_func_test.py +++ b/tests/model/Megatron_GPT2/run_func_test.py @@ -1,6 +1,10 @@ -# Copyright (c) 2019, The Microsoft DeepSpeed Team. All rights reserved. -# -# Note: please copy webtext data to "Megatron-LM" folder, before running this script. +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team +""" +Note: please copy webtext data to "Megatron-LM" folder, before running this script. +""" import unittest import os @@ -34,6 +38,7 @@ def grep_loss_from_file(file_name): class GPT2FuncTestCase(BaseTestCase): + def __init__(self, methodName="DeepSpeed function test on GPT2 model"): super(GPT2FuncTestCase, self).__init__(methodName) @@ -454,9 +459,7 @@ class GPT2FuncTestCase(BaseTestCase): baseline_deepspeed_config = True test_config["other_args"] = f"\"{cpu_optimizer_flag}\"" - base_file = self.gen_output_name(test_config, - baseline_prefix, - baseline_config=baseline_deepspeed_config) + base_file = self.gen_output_name(test_config, baseline_prefix, baseline_config=baseline_deepspeed_config) # skip baseline run if it exists. 
if not self.has_loss_data(base_file): @@ -468,8 +471,7 @@ class GPT2FuncTestCase(BaseTestCase): # DeepSpeed run... test_config["deepspeed"] = True cpu_optimizer_flag = self.gen_cpu_optimizer_flag(test_config, False) - test_config[ - "other_args"] = f"\"--deepspeed-activation-checkpointing {cpu_optimizer_flag}\"" + test_config["other_args"] = f"\"--deepspeed-activation-checkpointing {cpu_optimizer_flag}\"" test_config["json"] = deepspeed_config print("{0}: DeepSpeed run.".format(self.id())) @@ -502,9 +504,7 @@ class GPT2FuncTestCase(BaseTestCase): test_config["other_args"] = f"\"{cpu_optimizer_flag}\"" # baseline run... - base_file = self.gen_output_name(test_config, - baseline_prefix, - baseline_config=baseline_deepspeed_config) + base_file = self.gen_output_name(test_config, baseline_prefix, baseline_config=baseline_deepspeed_config) # skip baseline run if it exists. if not self.has_loss_data(base_file): diff --git a/tests/model/Megatron_GPT2/run_perf_baseline.py b/tests/model/Megatron_GPT2/run_perf_baseline.py index 0c7233d5dc8f47d034386023205bd75970b718c7..0958f021045f5d47f04b5aa08988f0c85dc1249f 100755 --- a/tests/model/Megatron_GPT2/run_perf_baseline.py +++ b/tests/model/Megatron_GPT2/run_perf_baseline.py @@ -1,6 +1,10 @@ -# Copyright (c) 2019, The Microsoft DeepSpeed Team. All rights reserved. -# -# Note: please copy webtext data to "Megatron-LM" folder, before running this script. +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team +""" +Note: please copy webtext data to "Megatron-LM" folder, before running this script. +""" import unittest import re @@ -8,6 +12,7 @@ from test_common import BaseTestCase class GPT2PerfBaselineTestCase(BaseTestCase): + def __init__(self, methodName="DeepSpeed performance test on GPT2 model"): super(GPT2PerfBaselineTestCase, self).__init__(methodName) @@ -88,9 +93,7 @@ class GPT2PerfBaselineTestCase(BaseTestCase): if exec_time == 0.0: print("{0}: no latency found in file {1}".format(self.id(), test_file)) else: - print("{0}: execution time per iteration is {1}ms.".format( - self.id(), - exec_time)) + print("{0}: execution time per iteration is {1}ms.".format(self.id(), exec_time)) def grep_latency_from_file(self, file_name): latency = 0.0 @@ -99,9 +102,7 @@ class GPT2PerfBaselineTestCase(BaseTestCase): with open(file_name, 'r') as f: lines = f.readlines() line_filter = "elapsed time per iteration" - match_number = re.compile( - r'elapsed time per iteration \(ms\): ([-+]?[0-9]+\.?[0-9]*(?:[Ee][-+]?[0-9]+)?)' - ) + match_number = re.compile(r'elapsed time per iteration \(ms\): ([-+]?[0-9]+\.?[0-9]*(?:[Ee][-+]?[0-9]+)?)') for line in lines: if line_filter in line: diff --git a/tests/model/Megatron_GPT2/run_perf_test.py b/tests/model/Megatron_GPT2/run_perf_test.py index f24b441291f9bc2afb16f32d892b7dd4269f1a94..3a144ab067ca5b1ea340352cab04d2152bac0085 100755 --- a/tests/model/Megatron_GPT2/run_perf_test.py +++ b/tests/model/Megatron_GPT2/run_perf_test.py @@ -1,14 +1,18 @@ -# Copyright (c) 2019, The Microsoft DeepSpeed Team. All rights reserved. -# -# Note: please copy webtext data to "Megatron-LM" folder, before running this script. +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team +""" +Note: please copy webtext data to "Megatron-LM" folder, before running this script. 
+""" import unittest import re -#from test_common import BaseTestCase -from .test_common import BaseTestCase +from test_common import BaseTestCase class GPT2PerfTestCase(BaseTestCase): + def __init__(self, methodName="DeepSpeed performance test on GPT2 model"): super(GPT2PerfTestCase, self).__init__(methodName) @@ -93,9 +97,7 @@ class GPT2PerfTestCase(BaseTestCase): if exec_time == 0.0: print("{0}: no latency found in file {1}".format(self.id(), test_file)) else: - print("{0}: execution time per iteration is {1}ms.".format( - self.id(), - exec_time)) + print("{0}: execution time per iteration is {1}ms.".format(self.id(), exec_time)) def grep_latency_from_file(self, file_name): latency = 0.0 @@ -104,9 +106,7 @@ class GPT2PerfTestCase(BaseTestCase): with open(file_name, 'r') as f: lines = f.readlines() line_filter = "elapsed time per iteration" - match_number = re.compile( - r'elapsed time per iteration \(ms\): ([-+]?[0-9]+\.?[0-9]*(?:[Ee][-+]?[0-9]+)?)' - ) + match_number = re.compile(r'elapsed time per iteration \(ms\): ([-+]?[0-9]+\.?[0-9]*(?:[Ee][-+]?[0-9]+)?)') for line in lines: if line_filter in line: diff --git a/tests/model/Megatron_GPT2/test_common.py b/tests/model/Megatron_GPT2/test_common.py index 6f9bec89eeb5b08a637ffaf751499db5534f4ff0..1bcd891e31d51579247daa7f5bbc0ce02f9d7e6e 100755 --- a/tests/model/Megatron_GPT2/test_common.py +++ b/tests/model/Megatron_GPT2/test_common.py @@ -1,5 +1,7 @@ -# Copyright (c) 2019, The Microsoft DeepSpeed Team. All rights reserved. -# +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team import unittest import subprocess @@ -8,6 +10,7 @@ import time class BaseTestCase(unittest.TestCase): + def __init__(self, methodName="DeepSpeed performance test"): super(BaseTestCase, self).__init__(methodName) self.test_dir = "./test" @@ -24,30 +27,14 @@ class BaseTestCase(unittest.TestCase): if test_config["deepspeed"] and not baseline_config: file_name = "_mp{0}_gpu{1}_node{2}_bs{3}_step{4}_layer{5}_hidden{6}_seq{7}_head{8}{9}_ds{10}-{11}.log".format( - test_config["mp"], - test_config["gpus"], - test_config["nodes"], - test_config["bs"], - test_config["steps"], - test_config["layers"], - test_config["hidden_size"], - test_config["seq_length"], - test_config["heads"], - other_args, - zero_args, - self.timestr) + test_config["mp"], test_config["gpus"], test_config["nodes"], test_config["bs"], test_config["steps"], + test_config["layers"], test_config["hidden_size"], test_config["seq_length"], test_config["heads"], + other_args, zero_args, self.timestr) save_dir = self.test_dir else: file_name = "_mp{0}_gpu{1}_node{2}_bs{3}_step{4}_layer{5}_hidden{6}_seq{7}_head{8}{9}.log".format( - test_config["mp"], - test_config["gpus"], - test_config["nodes"], - test_config["bs"], - test_config["steps"], - test_config["layers"], - test_config["hidden_size"], - test_config["seq_length"], - test_config["heads"], + test_config["mp"], test_config["gpus"], test_config["nodes"], test_config["bs"], test_config["steps"], + test_config["layers"], test_config["hidden_size"], test_config["seq_length"], test_config["heads"], other_args) save_dir = self.baseline_dir @@ -66,31 +53,15 @@ class BaseTestCase(unittest.TestCase): def run_gpt2_test(self, test_config, output): ds_flag = "-d " + test_config["json"] if test_config["deepspeed"] else "" - ckpt_num = test_config[ - "ckpt_num_layers"] if "ckpt_num_layers" in test_config else 1 - other_args = "-o " + test_config[ - "other_args"] if "other_args" in test_config else "" + ckpt_num = 
test_config["ckpt_num_layers"] if "ckpt_num_layers" in test_config else 1 + other_args = "-o " + test_config["other_args"] if "other_args" in test_config else "" cmd = "./ds_gpt2_test.sh -m {0} -g {1} -n {2} -b {3} -s {4} -l {5} -h {6} -q {7} -e {8} -c {9} {10} {11}".format( - test_config["mp"], - test_config["gpus"], - test_config["nodes"], - test_config["bs"], - test_config["steps"], - test_config["layers"], - test_config["hidden_size"], - test_config["seq_length"], - test_config["heads"], - ckpt_num, - other_args, - ds_flag) + test_config["mp"], test_config["gpus"], test_config["nodes"], test_config["bs"], test_config["steps"], + test_config["layers"], test_config["hidden_size"], test_config["seq_length"], test_config["heads"], + ckpt_num, other_args, ds_flag) self.ensure_directory_exists(output) with open(output, "w") as f: print(cmd) - subprocess.run(cmd, - shell=True, - check=False, - executable='/bin/bash', - stdout=f, - stderr=f) + subprocess.run(cmd, shell=True, check=False, executable='/bin/bash', stdout=f, stderr=f) diff --git a/tests/model/run_sanity_check.py b/tests/model/run_sanity_check.py index a226ccb8ca06c10c1a1f2204981a322213ae1d17..443f12046b27474fc5a202c0163a709b524dd681 100755 --- a/tests/model/run_sanity_check.py +++ b/tests/model/run_sanity_check.py @@ -1,6 +1,10 @@ -# Copyright (c) 2019, The Microsoft DeepSpeed Team. All rights reserved. -# -# Note: please copy webtext data to "Megatron-LM" folder, before running this script. +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team +""" +Note: please copy webtext data to "Megatron-LM" folder, before running this script. +""" import sys import unittest diff --git a/tests/onebit/test_mpi_backend.py b/tests/onebit/test_mpi_backend.py index bb8915f2c00164d24bb3164fb6eb7e828872fe90..5cc36317a8ddd5f985689cb0fa44c370074ed6ca 100644 --- a/tests/onebit/test_mpi_backend.py +++ b/tests/onebit/test_mpi_backend.py @@ -1,4 +1,7 @@ -'''Copyright The Microsoft DeepSpeed Team''' +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team from mpi4py import MPI import torch @@ -35,8 +38,7 @@ def torch_sim(a): a_list = torch.chunk(a_compressed, chunks=dist.get_world_size()) server_scale = [chunk_a.norm() / np.sqrt(chunk_a.numel()) for chunk_a in a_list] a_sign_list = torch.chunk(a_server_sign, dist.get_world_size()) - a_server_compressed = torch.cat( - [server_scale[i] * a_sign_list[i] for i in range(dist.get_world_size())]) + a_server_compressed = torch.cat([server_scale[i] * a_sign_list[i] for i in range(dist.get_world_size())]) rank = dist.get_rank() server_error = a_list[rank] - server_scale[rank] * a_sign_list[rank] get_accelerator().synchronize() diff --git a/tests/onebit/test_mpi_perf.py b/tests/onebit/test_mpi_perf.py index dd67fdb615e8744e2777ace97b88c6ef322e4f05..4a4f6dbe8b4ada41856f0b8ea771d1cb1443dadc 100644 --- a/tests/onebit/test_mpi_perf.py +++ b/tests/onebit/test_mpi_perf.py @@ -1,4 +1,7 @@ -'''Copyright The Microsoft DeepSpeed Team''' +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team from mpi4py import MPI import torch diff --git a/tests/onebit/test_nccl_backend.py b/tests/onebit/test_nccl_backend.py index e544865b7685782616e08c3d1ec031c8876f8e67..57edd4894c18aefff4204a4ef039d4519522c889 100644 --- a/tests/onebit/test_nccl_backend.py +++ b/tests/onebit/test_nccl_backend.py @@ -1,4 +1,7 @@ -'''Copyright The Microsoft DeepSpeed Team''' +# Copyright (c) Microsoft Corporation. 
+# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team import torch import deepspeed.comm as dist @@ -40,8 +43,7 @@ def torch_sim(a): a_list = torch.chunk(a_compressed, chunks=dist.get_world_size()) server_scale = [chunk_a.norm() / np.sqrt(chunk_a.numel()) for chunk_a in a_list] a_sign_list = torch.chunk(a_server_sign, dist.get_world_size()) - a_server_compressed = torch.cat( - [server_scale[i] * a_sign_list[i] for i in range(dist.get_world_size())]) + a_server_compressed = torch.cat([server_scale[i] * a_sign_list[i] for i in range(dist.get_world_size())]) rank = dist.get_rank() server_error = a_list[rank] - server_scale[rank] * a_sign_list[rank] get_accelerator().synchronize() @@ -86,7 +88,6 @@ if test_correctness: else: check_mag_mask = mpi_server[diff_server_mask] > magnitude_threshold if torch.sum(check_mag_mask) == 0: - print( - 'Successfully passed the test for NCCL Backend at Rank {}'.format(rank)) + print('Successfully passed the test for NCCL Backend at Rank {}'.format(rank)) else: print('Fails at {} of positions'.format(torch.sum(check_mag_mask))) diff --git a/tests/onebit/test_nccl_perf.py b/tests/onebit/test_nccl_perf.py index aab93efac85179be569b8d72c28e6a79ba77efaf..d98e06e797a74d64e5c6955783701d70b721d6cc 100644 --- a/tests/onebit/test_nccl_perf.py +++ b/tests/onebit/test_nccl_perf.py @@ -1,4 +1,7 @@ -'''Copyright The Microsoft DeepSpeed Team''' +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team import torch import deepspeed.comm as dist @@ -83,9 +86,7 @@ if rank == 0: minlat = round(min(time_list) * convert) maxlat = round(max(time_list) * convert) meanlat = round(mean(time_list) * convert, places) -print("min, max, and mean = {} ms, {} ms, {} ms".format(minlat, - maxlat, - meanlat)) if rank == 0 else None +print("min, max, and mean = {} ms, {} ms, {} ms".format(minlat, maxlat, meanlat)) if rank == 0 else None #print("tensor shape", a.shape) duration = meanlat / 1e3 tput = ((tensor_size * 4) / duration) diff --git a/tests/perf/adagrad_test.py b/tests/perf/adagrad_test.py index 37ca85ed47d8b62771f01fd52ceef1687d6a4819..0dd7b9d20475ede1f2f6e265f88c11a42fe9ebb7 100755 --- a/tests/perf/adagrad_test.py +++ b/tests/perf/adagrad_test.py @@ -1,4 +1,7 @@ -'''Copyright The Microsoft DeepSpeed Team''' +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team import torch from deepspeed.ops.adagrad import DeepSpeedCPUAdagrad @@ -28,8 +31,7 @@ def _main(): param = [torch.nn.Parameter(torch.ones(size, device=device)) for size in group_size] torch_time = _test_perf(param, torch.optim.Adagrad) ds_time = _test_perf(param, DeepSpeedCPUAdagrad) - #print(f"Step time: {torch_time=} {ds_time=}") - print("Step time: {torch_time=%s} {ds_time=%s}" %(torch_time, ds_time)) + print(f"Step time: {torch_time=} {ds_time=}") _main() diff --git a/tests/perf/adam_test.py b/tests/perf/adam_test.py index 0c83bfa62984ddf398c88f3a49f34703155f9fa5..1a4dfcdab0d3fef5595dd6ede18d70e73268a224 100755 --- a/tests/perf/adam_test.py +++ b/tests/perf/adam_test.py @@ -1,4 +1,7 @@ -'''Copyright The Microsoft DeepSpeed Team''' +# Copyright (c) Microsoft Corporation. 
+# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team import torch from deepspeed.ops.adam import DeepSpeedCPUAdam @@ -28,8 +31,7 @@ def _main(): param = [torch.nn.Parameter(torch.ones(size, device=device)) for size in group_size] torch_time = _test_perf(param, torch.optim.Adam) ds_time = _test_perf(param, DeepSpeedCPUAdam) - #print(f"Step time: {torch_time=} {ds_time=}") - print("Step time: {torch_time=%s} {ds_time=%s}" %(torch_time, ds_time)) + print(f"Step time: {torch_time=} {ds_time=}") _main() diff --git a/tests/perf/adam_test1.py b/tests/perf/adam_test1.py index 13d486d4d855826b22dc05221a8b71be1a977660..b35477afb4fe999ce9a2532b9f78a1a71636e8d2 100755 --- a/tests/perf/adam_test1.py +++ b/tests/perf/adam_test1.py @@ -1,4 +1,7 @@ -'''Copyright The Microsoft DeepSpeed Team''' +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team import torch from deepspeed.ops.adam import DeepSpeedCPUAdam @@ -8,10 +11,7 @@ from deepspeed.accelerator import get_accelerator device = 'cpu' model_size = 1 * 1024**3 param = torch.nn.Parameter(torch.ones(model_size, device=device)) -param_fp16 = torch.nn.Parameter( - torch.ones(model_size, - dtype=torch.half, - device=get_accelerator().device_name(0))) +param_fp16 = torch.nn.Parameter(torch.ones(model_size, dtype=torch.half, device=get_accelerator().device_name(0))) optimizer = DeepSpeedCPUAdam([param]) #torch.set_num_threads(128) diff --git a/tests/small_model_debugging/stage3_test.py b/tests/small_model_debugging/stage3_test.py index ca85c00be486bcd24057a725a97dac72d8d59b46..3a92d31f1b7ad981598997615fc237393940bb8c 100644 --- a/tests/small_model_debugging/stage3_test.py +++ b/tests/small_model_debugging/stage3_test.py @@ -1,4 +1,7 @@ -'''Copyright The Microsoft DeepSpeed Team''' +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team import torch @@ -10,6 +13,7 @@ import deepspeed class VerboseLinear(torch.nn.Linear): + def __init__(self, **kwargs): print(f'Begin VerboseLinear.__init__') super().__init__(**kwargs) @@ -17,21 +21,19 @@ class VerboseLinear(torch.nn.Linear): class LinearStack(torch.nn.Module): + def __init__(self, input_dim=2, hidden_dim=4, output_dim=4, num_layers=2): super().__init__() self.input_dim = input_dim self.output_dim = output_dim self.hidden_dim = hidden_dim - self.input_layer = VerboseLinear(in_features=self.input_dim, - out_features=self.hidden_dim) + self.input_layer = VerboseLinear(in_features=self.input_dim, out_features=self.hidden_dim) self.layers = torch.nn.ModuleList([ - torch.nn.Linear(in_features=self.hidden_dim, - out_features=self.hidden_dim, - bias=False) for x in range(num_layers) + torch.nn.Linear(in_features=self.hidden_dim, out_features=self.hidden_dim, bias=False) + for x in range(num_layers) ]) - self.output_layer = torch.nn.Linear(in_features=self.hidden_dim, - out_features=self.output_dim) + self.output_layer = torch.nn.Linear(in_features=self.hidden_dim, out_features=self.output_dim) self.identity = torch.nn.Identity() def forward(self, x): diff --git a/tests/small_model_debugging/test.py b/tests/small_model_debugging/test.py index a97792df56ac1d51273b57814b988eaa6e8b4df1..993fb6284a03a045f8ff4f863435981210246ce1 100644 --- a/tests/small_model_debugging/test.py +++ b/tests/small_model_debugging/test.py @@ -1,4 +1,7 @@ -'''Copyright The Microsoft DeepSpeed Team''' +# Copyright (c) Microsoft Corporation. 
+# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team import torch from deepspeed.pt.deepspeed_linear import LinearModuleForZeroStage3 @@ -28,10 +31,7 @@ def see_memory_usage(message): ) -tens = torch.rand(1024, - 16384, - dtype=torch.half, - device=torch.device(get_accelerator().device_name())) +tens = torch.rand(1024, 16384, dtype=torch.half, device=torch.device(get_accelerator().device_name())) tens_back = tens.detach().clone() #linear_bk = torch.nn.functional.linear @@ -45,9 +45,7 @@ y = model(tens) see_memory_usage("After forward") -model.weight.data = torch.zeros(1, - dtype=torch.half, - device=torch.device(get_accelerator().device_name())) +model.weight.data = torch.zeros(1, dtype=torch.half, device=torch.device(get_accelerator().device_name())) see_memory_usage("After weight zero") diff --git a/tests/small_model_debugging/test_mics_config.py b/tests/small_model_debugging/test_mics_config.py new file mode 100644 index 0000000000000000000000000000000000000000..e988f9844e3b7861341a5b4fe166419e91d316d4 --- /dev/null +++ b/tests/small_model_debugging/test_mics_config.py @@ -0,0 +1,123 @@ +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +# SPDX-License-Identifier: Apache-2.0 + +import os +import json +import argparse +import torch +import deepspeed +from torch.utils.data.distributed import DistributedSampler +import deepspeed.comm as dist + + +class SimpleModel(torch.nn.Module): + + def __init__(self, hidden_dim, empty_grad=False): + super(SimpleModel, self).__init__() + self.linear = torch.nn.Linear(hidden_dim, hidden_dim) + if empty_grad: + self.layers2 = torch.nn.ModuleList([torch.nn.Linear(hidden_dim, hidden_dim)]) + self.cross_entropy_loss = torch.nn.CrossEntropyLoss() + + def forward(self, x, y): + hidden = x + hidden = self.linear(hidden) + return self.cross_entropy_loss(hidden, y) + + +def create_config_from_dict(tmpdir, config_dict): + config_path = os.path.join(tmpdir, 'temp_config.json') + with open(config_path, 'w') as fd: + json.dump(config_dict, fd) + return config_path + + +def get_data_loader(model, total_samples, hidden_dim, device): + batch_size = model.train_micro_batch_size_per_gpu() + train_data = torch.randn(total_samples, hidden_dim, device=device, dtype=torch.half) + train_label = torch.empty(total_samples, dtype=torch.long, device=device).random_(hidden_dim) + train_dataset = torch.utils.data.TensorDataset(train_data, train_label) + sampler = DistributedSampler(train_dataset) + train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size, sampler=sampler) + return train_loader + + +def get_args(tmpdir, config_dict): + parser = argparse.ArgumentParser() + parser.add_argument("--local_rank", type=int, default=0) + parser.add_argument('--zero', type=int, default=3) + args = parser.parse_args() #args='' + + config_dict["zero_optimization"]["stage"] = args.zero + # print('config_dict["zero_optimization"]', config_dict["zero_optimization"]) + config_path = create_config_from_dict(tmpdir, config_dict) + + args.deepspeed_config = config_path + return args + + +def print0(msg): + if dist.get_rank() == 0: + print(msg, flush=True) + + +rank = int(os.environ['RANK']) +print('seed:', 2222 + rank) +torch.random.manual_seed(2222 + rank) + +config_dict = { + "train_batch_size": 8, + "steps_per_print": 1, + "optimizer": { + "type": "Adam", + "params": { + "lr": 0.00015, + } + }, + "fp16": { + "enabled": True, + "initial_scale_power": 15 
+ }, + "zero_optimization": { + "stage": 3, + "reduce_bucket_size": 20, + "mics_shard_size": 4, + "mics_hierarchical_params_gather": True, + "stage3_model_persistence_threshold": 10 + } +} +# "initial_scale_power": 15 +args = get_args('/tmp/', config_dict) +hidden_dim = 32 + +# with deepspeed.zero.Init(): +model = SimpleModel(hidden_dim, empty_grad=False) +# print('------> init model with deepspeed.zero.Init()') + +model, _, _, _ = deepspeed.initialize(args=args, + model=model, + model_parameters=model.parameters(), + dist_init_required=True) + + +def print_params(tag, model): + if dist.get_rank() == 0: + for n, p in model.named_parameters(): + print0("{} {}:{}".format(tag, n, p)) + + +data_loader = get_data_loader(model=model, total_samples=1000, hidden_dim=hidden_dim, device=model.device) +#print_params('pre-train', model) +for n, batch in enumerate(data_loader): + loss = model(batch[0], batch[1]) + if dist.get_rank() == 0: + print("LOSS:", loss.item()) + model.backward(loss) + model.step() + #print_params('step={}'.format(n), model) + if n == 5: break diff --git a/tests/small_model_debugging/test_model.py b/tests/small_model_debugging/test_model.py index 792d683ce47b7e3e03e8a3cd0941afb2564c6a85..586106140d0b9c6a88e9849ec570b2b992ee7d48 100755 --- a/tests/small_model_debugging/test_model.py +++ b/tests/small_model_debugging/test_model.py @@ -1,4 +1,7 @@ -'''Copyright The Microsoft DeepSpeed Team''' +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team import os import json @@ -10,6 +13,7 @@ import deepspeed.comm as dist class SimpleModel(torch.nn.Module): + def __init__(self, hidden_dim, empty_grad=False): super(SimpleModel, self).__init__() self.linear = torch.nn.Linear(hidden_dim, hidden_dim) @@ -33,14 +37,10 @@ def create_config_from_dict(tmpdir, config_dict): def get_data_loader(model, total_samples, hidden_dim, device): batch_size = model.train_micro_batch_size_per_gpu() train_data = torch.randn(total_samples, hidden_dim, device=device, dtype=torch.half) - train_label = torch.empty(total_samples, - dtype=torch.long, - device=device).random_(hidden_dim) + train_label = torch.empty(total_samples, dtype=torch.long, device=device).random_(hidden_dim) train_dataset = torch.utils.data.TensorDataset(train_data, train_label) sampler = DistributedSampler(train_dataset) - train_loader = torch.utils.data.DataLoader(train_dataset, - batch_size=batch_size, - sampler=sampler) + train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size, sampler=sampler) return train_loader @@ -82,19 +82,20 @@ config_dict = { }, "zero_optimization": { "stage": 0, - "reduce_bucket_size": 20 + "reduce_bucket_size": 20, + "stage3_model_persistence_threshold": 10 } } # "initial_scale_power": 15 args = get_args('/tmp/', config_dict) -hidden_dim = 4 +hidden_dim = 32 model = SimpleModel(hidden_dim, empty_grad=False) -model, _, _,_ = deepspeed.initialize(args=args, - model=model, - model_parameters=model.parameters(), - dist_init_required=True) +model, _, _, _ = deepspeed.initialize(args=args, + model=model, + model_parameters=model.parameters(), + dist_init_required=True) def print_params(tag, model): @@ -103,10 +104,7 @@ def print_params(tag, model): print0("{} {}:{}".format(tag, n, p)) -data_loader = get_data_loader(model=model, - total_samples=1000, - hidden_dim=hidden_dim, - device=model.device) +data_loader = get_data_loader(model=model, total_samples=1000, hidden_dim=hidden_dim, device=model.device) #print_params('pre-train', model) for n, batch in 
enumerate(data_loader):
     loss = model(batch[0], batch[1])
diff --git a/tests/test_deepspeed_v0.8.2.sh b/tests/test_deepspeed_v0.8.2.sh
new file mode 100755
index 0000000000000000000000000000000000000000..117236aa10a746cd30c8de4c0868dace7fb5ab78
--- /dev/null
+++ b/tests/test_deepspeed_v0.8.2.sh
@@ -0,0 +1,97 @@
+#!/bin/bash
+
+### This script is meant to be run from the tests directory, i.e., pwd=path/to/ds/test
+#accelerator
+cd accelerator/
+pytest ./
+
+cd ..
+#unit module
+pytest ./unit/profiling/flops_profiler/test_flops_profiler.py
+pytest unit/test_ds_config.py
+pytest unit/test_cpu_adam.py
+pytest unit/pipe/test_pipe_module.py
+pytest unit/autotuning/test_autotuning.py
+export ROCBLAS_COMPUTETYPE_FP16R=0
+pytest unit/ops/accelerators/test_accelerator_forward.py
+pytest unit/ops/accelerators/test_accelerator_backward.py
+pytest unit/test_cuda_forward.py
+pytest unit/test_cuda_backward.py
+pytest unit/test_get_optim_files.py
+pytest unit/test_autotuning.py
+pytest unit/test_csr.py
+pytest unit/test_run.py
+pytest unit/comm/test_dist.py
+pytest unit/runtime/half_precision/test_fp16.py
+pytest unit/runtime/half_precision/test_bf16.py
+pytest unit/runtime/half_precision/onebit/test_onebit.py
+pytest unit/runtime/half_precision/test_dynamic_loss_scale.py
+pytest unit/runtime/test_ds_config_dict.py
+pytest unit/runtime/test_ds_config_model.py
+pytest unit/runtime/pipe/test_pipe.py
+pytest unit/runtime/pipe/test_topology.py
+pytest unit/runtime/pipe/test_pipe_schedule.py
+pytest unit/runtime/test_lr_schedulers.py
+pytest unit/runtime/activation_checkpointing/test_activation_checkpointing.py
+pytest unit/runtime/test_ds_initialize.py
+pytest unit/runtime/test_pld.py
+pytest unit/runtime/test_runtime_utils.py
+pytest unit/runtime/zero/test_ignore_unused_parameters.py
+pytest unit/runtime/zero/test_zero_context_ancestry.py
+pytest unit/runtime/zero/test_zero_tensor_fragment.py
+pytest unit/runtime/zero/test_zero_context_return.py
+pytest unit/runtime/zero/test_zero.py
+pytest unit/runtime/zero/test_zero_context.py
+pytest unit/runtime/zero/test_zero_config.py
+pytest unit/runtime/zero/test_zero_tiled.py
+pytest unit/runtime/sparse_tensor/test_sparse_grads.py
+pytest unit/runtime/sparse_tensor/test_csr.py
+pytest unit/runtime/sparse_tensor/test_averaging_sparse_gradients.py
+pytest unit/runtime/test_data_efficiency.py
+pytest unit/runtime/test_multi_output_model.py
+pytest unit/runtime/comm/test_coalesced_collectives.py
+pytest unit/runtime/test_autocast.py
+pytest unit/runtime/utils/test_partition.py
+pytest unit/runtime/test_data.py
+pytest unit/test_ds_arguments.py
+pytest unit/test_pipe_schedule.py
+pytest unit/moe/test_moe.py
+pytest unit/moe/test_moe_tp.py
+pytest unit/checkpoint/test_sparse.py
+pytest unit/checkpoint/test_zero_optimizer.py
+pytest unit/checkpoint/test_other_optimizer.py
+pytest unit/checkpoint/test_pipeline.py
+pytest unit/checkpoint/test_latest_checkpoint.py
+pytest unit/checkpoint/test_tag_validation.py
+pytest unit/checkpoint/test_moe_checkpoint.py
+pytest unit/checkpoint/test_reshape_checkpoint.py
+pytest unit/checkpoint/test_lr_scheduler.py
+pytest unit/test_autocast.py
+pytest unit/monitor/test_monitor.py
+pytest unit/utils/test_get_optim_files.py
+pytest unit/utils/test_init_on_device.py
+pytest unit/utils/test_groups.py
+pytest unit/test_cpu_adagrad.py
+pytest unit/ops/adagrad/test_cpu_adagrad.py
+pytest unit/ops/spatial/test_nhwc_bias_add.py
+pytest unit/ops/quantizer/test_quantize.py
+pytest unit/ops/quantizer/test_dequantize.py
+pytest unit/ops/quantizer/test_fake_quantization.py
+pytest unit/ops/aio/test_aio.py
+pytest unit/ops/adam/test_cpu_adam.py +pytest unit/ops/adam/test_adamw.py +pytest unit/ops/transformer/inference/test_bias_relu.py +pytest unit/ops/transformer/inference/test_bias_add.py +pytest unit/ops/transformer/inference/test_residual_add.py +pytest unit/ops/transformer/inference/test_moe_res_matmult.py +pytest unit/ops/transformer/inference/test_bias_gelu.py +pytest unit/ops/transformer/inference/test_layer_norm.py +pytest unit/ops/transformer/inference/test_bias_geglu.py +pytest unit/launcher/test_multinode_runner.py +pytest unit/launcher/test_run.py +pytest unit/launcher/test_ds_arguments.py +pytest unit/elasticity/test_elastic.py +pytest unit/test_zero_tiled.py +pytest unit/test_groups.py + + diff --git a/tests/test_deepspeed_v0.9.2.sh b/tests/test_deepspeed_v0.9.2.sh new file mode 100644 index 0000000000000000000000000000000000000000..4e9235bb10de2dde476a0f367317cfa5d90427a8 --- /dev/null +++ b/tests/test_deepspeed_v0.9.2.sh @@ -0,0 +1,91 @@ +#!/bin/bash +pytest ./lightning/test_simple.py +pytest ./model/BingBertSquad/test_e2e_squad.py +pytest ./model/Megatron_GPT2/test_common.py +pytest ./onebit/test_nccl_backend.py +pytest ./onebit/test_mpi_backend.py +pytest ./onebit/test_mpi_perf.py +pytest ./onebit/test_nccl_perf.py +pytest ./small_model_debugging/test_mics_config.py +pytest ./small_model_debugging/test_model.py +pytest ./unit/autotuning/test_autotuning.py +pytest ./unit/comm/test_dist.py +pytest ./unit/compression/test_compression.py +pytest ./unit/moe/test_moe_tp.py +pytest ./unit/moe/test_moe.py +pytest ./unit/monitor/test_monitor.py +pytest ./unit/pipe/test_pipe_module.py +pytest ./unit/profiling/flops_profiler/test_flops_profiler.py +pytest ./unit/checkpoint/test_latest_checkpoint.py +pytest ./unit/checkpoint/test_lr_scheduler.py +pytest ./unit/checkpoint/test_moe_checkpoint.py +pytest ./unit/checkpoint/test_other_optimizer.py +pytest ./unit/checkpoint/test_reshape_checkpoint.py +pytest ./unit/checkpoint/test_tag_validation.py +pytest ./unit/checkpoint/test_pipeline.py +pytest ./unit/checkpoint/test_sparse.py +pytest ./unit/checkpoint/test_zero_optimizer.py +pytest ./unit/elasticity/test_elastic.py +pytest ./unit/inference/test_inference_config.py +pytest ./unit/inference/test_checkpoint_sharding.py +pytest ./unit/inference/test_inference.py +pytest ./unit/inference/test_model_profiling.py +pytest ./unit/launcher/test_ds_arguments.py +pytest ./unit/launcher/test_multinode_runner.py +pytest ./unit/launcher/test_run.py +pytest ./unit/model_parallelism/test_configurable_parallel_mp.py +pytest ./unit/model_parallelism/test_configurable_parallel_pp.py +pytest ./unit/ops/adam/test_adamw.py +pytest ./unit/ops/adam/test_cpu_adam.py +pytest ./unit/ops/aio/test_aio.py +pytest ./unit/ops/quantizer/test_fake_quantization.py +pytest ./unit/ops/quantizer/test_quantize.py +pytest ./unit/ops/spatial/test_nhwc_bias_add.py +pytest ./unit/ops/transformer/inference/test_bias_add.py +pytest ./unit/ops/transformer/inference/test_bias_geglu.py +pytest ./unit/ops/transformer/inference/test_residual_add.py +pytest ./unit/ops/transformer/inference/test_bias_gelu.py +pytest ./unit/ops/transformer/inference/test_bias_relu.py +pytest ./unit/ops/transformer/inference/test_layer_norm.py +pytest ./unit/ops/transformer/inference/test_moe_res_matmult.py +pytest ./unit/ops/accelerators/test_accelerator_backward.py +pytest ./unit/ops/accelerators/test_accelerator_forward.py +pytest ./unit/ops/adagrad/test_cpu_adagrad.py +pytest ./unit/ops/sparse_attention/test_sparse_attention.py +pytest 
./unit/runtime/half_precision/onebit/test_onebit.py +pytest ./unit/runtime/half_precision/test_dynamic_loss_scale.py +pytest ./unit/runtime/half_precision/test_bf16.py +pytest ./unit/runtime/half_precision/test_fp16.py +pytest ./unit/runtime/pipe/test_pipe.py +pytest ./unit/runtime/pipe/test_topology.py +pytest ./unit/runtime/pipe/test_pipe_schedule.py +pytest ./unit/runtime/test_data_efficiency.py +pytest ./unit/runtime/test_ds_config_dict.py +pytest ./unit/runtime/test_ds_initialize.py +pytest ./unit/runtime/test_multi_output_model.py +pytest ./unit/runtime/test_pld.py +pytest ./unit/runtime/utils/test_partition.py +pytest ./unit/runtime/zero/test_ignore_unused_parameters.py +pytest ./unit/runtime/zero/test_zero.py +pytest ./unit/runtime/zero/test_zero_config.py +pytest ./unit/runtime/zero/test_zero_context_ancestry.py +pytest ./unit/runtime/zero/test_zero_tensor_fragment.py +pytest ./unit/runtime/zero/test_zero_context.py +pytest ./unit/runtime/zero/test_zero_context_return.py +pytest ./unit/runtime/zero/test_zero_dynamic_class.py +pytest ./unit/runtime/zero/test_zero_nesting_init.py +pytest ./unit/runtime/zero/test_zero_tiled.py +pytest ./unit/runtime/activation_checkpointing/test_activation_checkpointing.py +pytest ./unit/runtime/comm/test_coalesced_collectives.py +pytest ./unit/runtime/sparse_tensor/test_averaging_sparse_gradients.py +pytest ./unit/runtime/sparse_tensor/test_csr.py +pytest ./unit/runtime/sparse_tensor/test_sparse_grads.py +pytest ./unit/runtime/test_autocast.py +pytest ./unit/runtime/test_data.py +pytest ./unit/runtime/test_ds_config_model.py +pytest ./unit/runtime/test_lr_schedulers.py +pytest ./unit/runtime/test_runtime_utils.py +pytest ./unit/utils/test_init_on_device.py +pytest ./unit/utils/test_get_optim_files.py +pytest ./unit/utils/test_groups.py +pytest ./accelerator/test_ds_init.py diff --git a/tests/unit/__init__.py b/tests/unit/__init__.py index fcb45ab2b68516814a4bfbffebf2e01cbfefd527..6c5067f71c8faf166bc78e88f9b62e8627dda7c7 100644 --- a/tests/unit/__init__.py +++ b/tests/unit/__init__.py @@ -1 +1,5 @@ +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team '''Copyright The Microsoft DeepSpeed Team''' diff --git a/tests/unit/alexnet_model.py b/tests/unit/alexnet_model.py index bdbaf02922e210cb47c46f17ef9da046ecf27ff1..7f9e37f289f031be7ed55f24ebd702b37f8ad3cb 100644 --- a/tests/unit/alexnet_model.py +++ b/tests/unit/alexnet_model.py @@ -1,4 +1,7 @@ -'''Copyright The Microsoft DeepSpeed Team''' +# Copyright (c) Microsoft Corporation. 
+# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team import pytest import torch @@ -12,41 +15,23 @@ from deepspeed.runtime.pipe.module import PipelineModule, LayerSpec class AlexNet(nn.Module): + def __init__(self, num_classes=10): super(AlexNet, self).__init__() self.features = nn.Sequential( - nn.Conv2d(3, - 64, - kernel_size=11, - stride=4, - padding=5), + nn.Conv2d(3, 64, kernel_size=11, stride=4, padding=5), nn.ReLU(inplace=True), - nn.MaxPool2d(kernel_size=2, - stride=2), - nn.Conv2d(64, - 192, - kernel_size=5, - padding=2), + nn.MaxPool2d(kernel_size=2, stride=2), + nn.Conv2d(64, 192, kernel_size=5, padding=2), nn.ReLU(inplace=True), - nn.MaxPool2d(kernel_size=2, - stride=2), - nn.Conv2d(192, - 384, - kernel_size=3, - padding=1), + nn.MaxPool2d(kernel_size=2, stride=2), + nn.Conv2d(192, 384, kernel_size=3, padding=1), nn.ReLU(inplace=True), - nn.Conv2d(384, - 256, - kernel_size=3, - padding=1), + nn.Conv2d(384, 256, kernel_size=3, padding=1), nn.ReLU(inplace=True), - nn.Conv2d(256, - 256, - kernel_size=3, - padding=1), + nn.Conv2d(256, 256, kernel_size=3, padding=1), nn.ReLU(inplace=True), - nn.MaxPool2d(kernel_size=2, - stride=2), + nn.MaxPool2d(kernel_size=2, stride=2), ) self.classifier = nn.Linear(256, num_classes) self.loss_fn = nn.CrossEntropyLoss() @@ -59,12 +44,14 @@ class AlexNet(nn.Module): class AlexNetPipe(AlexNet): + def to_layers(self): layers = [*self.features, lambda x: x.view(x.size(0), -1), self.classifier] return layers class AlexNetPipeSpec(PipelineModule): + def __init__(self, num_classes=10, **kwargs): self.num_classes = num_classes specs = [ @@ -81,9 +68,8 @@ class AlexNetPipeSpec(PipelineModule): LayerSpec(nn.Conv2d, 256, 256, kernel_size=3, padding=1), F.relu, LayerSpec(nn.MaxPool2d, kernel_size=2, stride=2), - lambda x: x.view(x.size(0), -1), - LayerSpec(nn.Linear, 256, self.num_classes), # classifier + LayerSpec(nn.Linear, 256, self.num_classes), # classifier ] super().__init__(layers=specs, loss_fn=nn.CrossEntropyLoss(), **kwargs) @@ -99,12 +85,7 @@ def cifar_trainset(fp16=False): transform_list = [ transforms.ToTensor(), - transforms.Normalize((0.5, - 0.5, - 0.5), - (0.5, - 0.5, - 0.5)), + transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5)), ] if fp16: transform_list.append(torchvision.transforms.Lambda(cast_to_half)) @@ -117,23 +98,14 @@ def cifar_trainset(fp16=False): dist.barrier() if local_rank != 0: dist.barrier() - trainset = torchvision.datasets.CIFAR10(root='/blob/cifar10-data', - train=True, - download=True, - transform=transform) + trainset = torchvision.datasets.CIFAR10(root='/blob/cifar10-data', train=True, download=True, transform=transform) if local_rank == 0: dist.barrier() return trainset -def train_cifar(model, - config, - num_steps=400, - average_dp_losses=True, - fp16=True, - seed=123): - with get_accelerator().random().fork_rng( - devices=[get_accelerator().current_device_name()]): +def train_cifar(model, config, num_steps=400, average_dp_losses=True, fp16=True, seed=123): + with get_accelerator().random().fork_rng(devices=[get_accelerator().current_device_name()]): ds_utils.set_random_seed(seed) # disable dropout @@ -142,11 +114,10 @@ def train_cifar(model, trainset = cifar_trainset(fp16=fp16) config['local_rank'] = dist.get_rank() - engine, _, _, _ = deepspeed.initialize( - config=config, - model=model, - model_parameters=[p for p in model.parameters()], - training_data=trainset) + engine, _, _, _ = deepspeed.initialize(config=config, + model=model, + model_parameters=[p for p in model.parameters()], + 
training_data=trainset) losses = [] for step in range(num_steps): diff --git a/tests/unit/autotuning/test_autotuning.py b/tests/unit/autotuning/test_autotuning.py index 90b9c5b3a2c8dca9a7fed7df884cd4bcce389c37..ea32c66624a838723a781a5ccda3520279a767ea 100644 --- a/tests/unit/autotuning/test_autotuning.py +++ b/tests/unit/autotuning/test_autotuning.py @@ -1,4 +1,7 @@ -'''Copyright The Microsoft DeepSpeed Team''' +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team import os import pytest @@ -14,13 +17,11 @@ TUNE_OPTION = 'tune' def test_command_line(): '''Validate handling of command line arguments''' for opt in [RUN_OPTION, TUNE_OPTION]: - dsrun.parse_args( - args=f"--num_nodes 1 --num_gpus 1 --autotuning {opt} foo.py".split()) + dsrun.parse_args(args=f"--num_nodes 1 --num_gpus 1 --autotuning {opt} foo.py".split()) for error_opts in [ "--autotuning --num_nodes 1 --num_gpus 1 foo.py".split(), - "--autotuning test --num_nodes 1 -- num_gpus 1 foo.py".split(), - "--autotuning".split() + "--autotuning test --num_nodes 1 -- num_gpus 1 foo.py".split(), "--autotuning".split() ]: with pytest.raises(SystemExit): dsrun.parse_args(args=error_opts) @@ -65,18 +66,9 @@ def test_resource_manager_arg_mappings(arg_mappings): ] ) # yapf: disable def test_autotuner_resources(tmpdir, active_resources): - config_dict = { - "autotuning": { - "enabled": True, - "exps_dir": os.path.join(tmpdir, - 'exps_dir'), - "arg_mappings": {} - } - } + config_dict = {"autotuning": {"enabled": True, "exps_dir": os.path.join(tmpdir, 'exps_dir'), "arg_mappings": {}}} config_path = create_config_from_dict(tmpdir, config_dict) - args = dsrun.parse_args( - args=f'--autotuning {TUNE_OPTION} foo.py --deepspeed_config {config_path}'.split( - )) + args = dsrun.parse_args(args=f'--autotuning {TUNE_OPTION} foo.py --deepspeed_config {config_path}'.split()) tuner = Autotuner(args=args, active_resources=active_resources) expected_num_nodes = len(list(active_resources.keys())) diff --git a/tests/unit/checkpoint/common.py b/tests/unit/checkpoint/common.py index 5b89d6811b0120aaab2d9a031608c9d62e1aba69..8ade97dd18d3da6c5ea355d9b899b748b6dbceed 100644 --- a/tests/unit/checkpoint/common.py +++ b/tests/unit/checkpoint/common.py @@ -1,4 +1,7 @@ -'''Copyright The Microsoft DeepSpeed Team''' +# Copyright (c) Microsoft Corporation. 
+# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team import os import torch @@ -9,6 +12,7 @@ from deepspeed.runtime.zero.stage_1_and_2 import DeepSpeedZeroOptimizer from deepspeed.runtime.fp16.fused_optimizer import FP16_Optimizer from deepspeed.runtime.fp16.unfused_optimizer import FP16_UnfusedOptimizer from deepspeed.runtime.zero.stage3 import DeepSpeedZeroOptimizer_Stage3 +from deepspeed.runtime.zero.partition_parameters import ZeroParamStatus from unit.simple_model import * @@ -22,38 +26,44 @@ def compare_deepspeed_states(saved_model, loaded_model): assert saved_model.global_steps == loaded_model.global_steps -def compare_model_states(saved_model, - loaded_model, - compare_optimizer=True, - load_module_only=False): +def zero3_params_to_fetch(param_list): + return [p for p in param_list if hasattr(p, 'ds_id') and p.ds_status == ZeroParamStatus.NOT_AVAILABLE] + + +def compare_model_states(saved_model, loaded_model, compare_optimizer=True, load_module_only=False): if not load_module_only: compare_deepspeed_states(saved_model, loaded_model) - for p0, p1 in zip(saved_model.module.named_parameters(), loaded_model.module.named_parameters()): - np0, p0 = p0 - np1, p1 = p1 - if 'deepspeed_moe.gate.wg' in np0: - # these params are converted to float at runtime, cast to half for comparison - p1 = p1.half() - p0 = p0.half() - assert id(p0) != id(p1), f'Comparing fp16 model state tensor against itself : {id(p0)} <====> {id(p1)}' - try: - assert torch.allclose(p0, p1, atol=1e-07), f"FP16 model state {p0} is not equal to {p1}, names:{np0}, {np1}" - except RuntimeError as err: - print(f"FP16 model state {p0} is not equal to {p1}, names:{np0}, {np1}") - raise err + params_to_fetch = zero3_params_to_fetch( + list(saved_model.module.named_parameters()) + list(loaded_model.module.named_parameters())) + enable_gather = len(params_to_fetch) > 0 + with deepspeed.zero.GatheredParameters(params_to_fetch, enabled=enable_gather): + for p0, p1 in zip(saved_model.module.named_parameters(), loaded_model.module.named_parameters()): + np0, p0 = p0 + np1, p1 = p1 + if 'deepspeed_moe.gate.wg' in np0: + # these params are converted to float at runtime, cast to half for comparison + p1 = p1.half() + p0 = p0.half() + assert id(p0) != id(p1), f'Comparing fp16 model state tensor against itself : {id(p0)} <====> {id(p1)}' + try: + assert torch.allclose(p0, p1, + atol=1e-07), f"FP16 model state {p0} is not equal to {p1}, names:{np0}, {np1}" + except RuntimeError as err: + print(f"FP16 model state {p0} is not equal to {p1}, names:{np0}, {np1}") + raise err if not compare_optimizer: return - if DeepSpeedZeroOptimizer_Stage3 is not None and isinstance( - saved_model.optimizer, - DeepSpeedZeroOptimizer_Stage3): - for p0, p1 in zip(saved_model.optimizer.fp32_partitioned_groups_flat, loaded_model.optimizer.fp32_partitioned_groups_flat): + if DeepSpeedZeroOptimizer_Stage3 is not None and isinstance(saved_model.optimizer, DeepSpeedZeroOptimizer_Stage3): + for p0, p1 in zip(saved_model.optimizer.fp32_partitioned_groups_flat, + loaded_model.optimizer.fp32_partitioned_groups_flat): assert torch.allclose(p0, p1, atol=1e-07), f"Fp32 model states {p0} is not equal to {p1}" elif isinstance(saved_model.optimizer, DeepSpeedZeroOptimizer): - for p0, p1 in zip(saved_model.optimizer.single_partition_of_fp32_groups, loaded_model.optimizer.single_partition_of_fp32_groups): + for p0, p1 in zip(saved_model.optimizer.single_partition_of_fp32_groups, + loaded_model.optimizer.single_partition_of_fp32_groups): assert id(p0) != id(p1), f'Comparing 
fp32 model state tensor against itself: {id(p0)} <====> {id(p1)}' assert torch.allclose(p0, p1, atol=1e-07), f"Fp32 model states {p0} is not equal to {p1}" @@ -89,8 +99,7 @@ def compare_optimizer_states(saved_model, loaded_model, hidden_dim, fp16=True): saved_optimizer = saved_model.optimizer.optimizer if fp16 else saved_model.optimizer loaded_optimizer = loaded_model.optimizer.optimizer if fp16 else loaded_model.optimizer - for state0, state1 in zip(saved_optimizer.state.values(), - loaded_optimizer.state.values()): + for state0, state1 in zip(saved_optimizer.state.values(), loaded_optimizer.state.values()): compare_state_dicts(state0, state1) @@ -130,6 +139,7 @@ def create_deepspeed_model(config_dict, model, base_optimizer): model=model, model_parameters=create_moe_param_groups(model), optimizer=base_optimizer) + ds_model.empty_partition_cache() return ds_model @@ -141,15 +151,12 @@ def checkpoint_correctness_verification(config_dict, load_lr_scheduler_states=False, fp16=True, train_batch=False, - base_optimizers=[None, - None], + base_optimizers=[None, None], empty_tag=False, seq_dataloader=False, load_module_only=False): dtype = torch.half if fp16 else torch.float32 - ds_model = create_deepspeed_model(config_dict=config_dict, - model=models[0], - base_optimizer=base_optimizers[0]) + ds_model = create_deepspeed_model(config_dict=config_dict, model=models[0], base_optimizer=base_optimizers[0]) if seq_dataloader: data_loader = sequence_dataloader(model=ds_model, @@ -174,6 +181,9 @@ def checkpoint_correctness_verification(config_dict, ds_model.backward(loss) ds_model.step() + # Flush zero stage 3 cache + ds_model.empty_partition_cache() + trained_model = ds_model save_folder = os.path.join(tmpdir, 'saved_checkpoint') @@ -196,11 +206,8 @@ def checkpoint_correctness_verification(config_dict, stored = sum(v for _, v in storages.items()) assert needed == stored, f"MoE expert checkpoint uses more storage than required: {f}" - loaded_model = create_deepspeed_model(config_dict=config_dict, - model=models[1], - base_optimizer=base_optimizers[1]) - assert list(trained_model.parameters())[0].dtype == list( - loaded_model.parameters())[0].dtype + loaded_model = create_deepspeed_model(config_dict=config_dict, model=models[1], base_optimizer=base_optimizers[1]) + assert list(trained_model.parameters())[0].dtype == list(loaded_model.parameters())[0].dtype loaded_model.load_checkpoint(save_folder, tag=save_tag, diff --git a/tests/unit/checkpoint/test_latest_checkpoint.py b/tests/unit/checkpoint/test_latest_checkpoint.py index 955edfdec3ac31ce63b3104fa1d7ddd4af3bdf91..e2d2f9db804320c06eeb834c2ee5bb0c320a8bb8 100644 --- a/tests/unit/checkpoint/test_latest_checkpoint.py +++ b/tests/unit/checkpoint/test_latest_checkpoint.py @@ -1,4 +1,7 @@ -'''Copyright The Microsoft DeepSpeed Team''' +# Copyright (c) Microsoft Corporation. 
+# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team import deepspeed @@ -46,8 +49,6 @@ class TestLatestCheckpoint(DistributedTest): } hidden_dim = 10 model = SimpleModel(hidden_dim) - model, _, _,_ = deepspeed.initialize(config=config_dict, - model=model, - model_parameters=model.parameters()) + model, _, _, _ = deepspeed.initialize(config=config_dict, model=model, model_parameters=model.parameters()) # should be no-op, since latest doesn't exist model.load_checkpoint(tmpdir) diff --git a/tests/unit/checkpoint/test_lr_scheduler.py b/tests/unit/checkpoint/test_lr_scheduler.py index f6a8f5ebdd4ab21452a4b40bacd0b24e5be5d1fa..c4c6773cd4744d9af0223ab0937efc8aa5579d4f 100644 --- a/tests/unit/checkpoint/test_lr_scheduler.py +++ b/tests/unit/checkpoint/test_lr_scheduler.py @@ -1,4 +1,7 @@ -'''Copyright The Microsoft DeepSpeed Team''' +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team import deepspeed from deepspeed.ops.op_builder import CPUAdamBuilder @@ -11,19 +14,8 @@ from unit.checkpoint.common import checkpoint_correctness_verification import pytest -@pytest.mark.parametrize('zero_stage, use_cpu_offload', - [(0, - False), - (1, - False), - (2, - False), - (2, - True), - (3, - False), - (3, - True)]) +@pytest.mark.parametrize('zero_stage, use_cpu_offload', [(0, False), (1, False), (2, False), (2, True), (3, False), + (3, True)]) class TestLRSchedulerCheckpoint(DistributedTest): world_size = 2 @@ -38,8 +30,7 @@ class TestLRSchedulerCheckpoint(DistributedTest): "type": 'Adam', "params": { "lr": 0.00015, - "betas": [0.8, - 0.999], + "betas": [0.8, 0.999], "eps": 1e-8, "weight_decay": 3e-7 } diff --git a/tests/unit/checkpoint/test_moe_checkpoint.py b/tests/unit/checkpoint/test_moe_checkpoint.py index edce2959aa203485b910a86b2bc452b86f1b078f..470a0d51f579b6a554acf69546f6d34069e16cfe 100644 --- a/tests/unit/checkpoint/test_moe_checkpoint.py +++ b/tests/unit/checkpoint/test_moe_checkpoint.py @@ -1,4 +1,7 @@ -'''Copyright The Microsoft DeepSpeed Team''' +# Copyright (c) Microsoft Corporation. 
+# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team from deepspeed.moe.utils import split_params_into_different_moe_groups_for_optimizer @@ -19,20 +22,10 @@ class TestMoECheckpoint(DistributedTest): if not required_torch_version(): pytest.skip("DeepSpeed MoE tests need torch 1.8 or higher to run correctly") - config_dict = { - "train_batch_size": 8, - "steps_per_print": 1, - "fp16": { - "enabled": True - } - } + config_dict = {"train_batch_size": 8, "steps_per_print": 1, "fp16": {"enabled": True}} hidden_dim = 16 - models = [ - SimpleMoEModel(hidden_dim=hidden_dim, - num_experts=ep_size, - ep_size=ep_size) for _ in range(2) - ] + models = [SimpleMoEModel(hidden_dim=hidden_dim, num_experts=ep_size, ep_size=ep_size) for _ in range(2)] optimizers = [torch.optim.AdamW(params=model.parameters()) for model in models] checkpoint_correctness_verification(config_dict, models=models, @@ -45,15 +38,7 @@ class TestMoECheckpoint(DistributedTest): base_optimizers=optimizers, seq_dataloader=True) - @pytest.mark.parametrize("ep_size, load_optim_states", - [(4, - True), - (4, - False), - (2, - True), - (2, - False)]) + @pytest.mark.parametrize("ep_size, load_optim_states", [(4, True), (4, False), (2, True), (2, False)]) def test_checkpoint_moe_and_zero(self, tmpdir, ep_size, load_optim_states): if not required_torch_version(): pytest.skip("DeepSpeed MoE tests need torch 1.8 or higher to run correctly") @@ -65,8 +50,7 @@ class TestMoECheckpoint(DistributedTest): "type": 'Adam', "params": { "lr": 0.00015, - "betas": [0.8, - 0.999], + "betas": [0.8, 0.999], "eps": 1e-8, "weight_decay": 3e-7 } @@ -81,21 +65,11 @@ class TestMoECheckpoint(DistributedTest): } hidden_dim = 16 - models = [ - SimpleMoEModel(hidden_dim=hidden_dim, - num_experts=ep_size, - ep_size=ep_size) for _ in range(2) - ] + models = [SimpleMoEModel(hidden_dim=hidden_dim, num_experts=ep_size, ep_size=ep_size) for _ in range(2)] # param group must have a random unique name (for now) # TODO: clean-up this requirement, the unique name should not be required here - param_groups = [{ - 'params': [p for p in model.parameters()], - 'name': 'random-unique-name' - } for model in models] - params = [ - split_params_into_different_moe_groups_for_optimizer(group) - for group in param_groups - ] + param_groups = [{'params': [p for p in model.parameters()], 'name': 'random-unique-name'} for model in models] + params = [split_params_into_different_moe_groups_for_optimizer(group) for group in param_groups] optimizers = [torch.optim.AdamW(params=param) for param in params] checkpoint_correctness_verification(config_dict, models=models, diff --git a/tests/unit/checkpoint/test_other_optimizer.py b/tests/unit/checkpoint/test_other_optimizer.py index d09157a2c80d8b4e82a72a886a85c2bafb7fcc55..9cb8c42868801228e097cd4fc603bccb99fd8892 100644 --- a/tests/unit/checkpoint/test_other_optimizer.py +++ b/tests/unit/checkpoint/test_other_optimizer.py @@ -1,4 +1,7 @@ -'''Copyright The Microsoft DeepSpeed Team''' +# Copyright (c) Microsoft Corporation. 
+# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team import deepspeed from deepspeed.ops.op_builder import FusedLambBuilder @@ -14,8 +17,7 @@ import pytest class TestOtherOptimizerCheckpoint(DistributedTest): world_size = 2 - @pytest.mark.skipif(not deepspeed.ops.__compatible_ops__[FusedLambBuilder.NAME], - reason="lamb is not compatible") + @pytest.mark.skipif(not deepspeed.ops.__compatible_ops__[FusedLambBuilder.NAME], reason="lamb is not compatible") def test_checkpoint_unfused_optimizer(self, tmpdir): config_dict = { "train_batch_size": 2, @@ -74,8 +76,7 @@ class TestOtherOptimizerCheckpoint(DistributedTest): "type": "Adam", "params": { "lr": 0.00015, - "betas": [0.8, - 0.999], + "betas": [0.8, 0.999], "eps": 1e-8, "weight_decay": 3e-7 } @@ -111,8 +112,7 @@ class TestOtherOptimizerCheckpoint(DistributedTest): "type": "Adam", "params": { "lr": 0.00015, - "betas": [0.8, - 0.999], + "betas": [0.8, 0.999], "eps": 1e-8, "weight_decay": 3e-7 } diff --git a/tests/unit/checkpoint/test_pipeline.py b/tests/unit/checkpoint/test_pipeline.py index c698798fa96564b5f82dcff937fe2046073aa8ce..99f1ba2ec43388c170cf88f904431abd4b282b84 100644 --- a/tests/unit/checkpoint/test_pipeline.py +++ b/tests/unit/checkpoint/test_pipeline.py @@ -1,10 +1,13 @@ -'''Copyright The Microsoft DeepSpeed Team''' +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team from deepspeed.runtime.checkpoint_engine.torch_checkpoint_engine import TorchCheckpointEngine from unit.common import DistributedTest from unit.simple_model import * - from unit.checkpoint.common import checkpoint_correctness_verification +from unit.util import skip_on_arch import pytest @@ -14,6 +17,8 @@ class TestPipelineCheckpoint(DistributedTest): @pytest.mark.parametrize("zero_stage", [0, 1]) def test_checkpoint_pipe_engine(self, zero_stage, tmpdir): + skip_on_arch(min_arch=7) + config_dict = { "train_batch_size": 2, "train_micro_batch_size_per_gpu": 1, diff --git a/tests/unit/checkpoint/test_reshape_checkpoint.py b/tests/unit/checkpoint/test_reshape_checkpoint.py index c9ae854521bab48e113b8ba90c67d8e5551fe2e7..41ccd37b360261a1bc1f192bcf9777d59288b4de 100644 --- a/tests/unit/checkpoint/test_reshape_checkpoint.py +++ b/tests/unit/checkpoint/test_reshape_checkpoint.py @@ -1,4 +1,7 @@ -'''Copyright The Microsoft DeepSpeed Team''' +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team from deepspeed.checkpoint import model_3d_desc diff --git a/tests/unit/checkpoint/test_sparse.py b/tests/unit/checkpoint/test_sparse.py index 4f07acebc058f24e6d26ff25749190f334c380c9..19fbcd81e4736221ce1604edf91af86eac2d1988 100644 --- a/tests/unit/checkpoint/test_sparse.py +++ b/tests/unit/checkpoint/test_sparse.py @@ -1,4 +1,7 @@ -'''Copyright The Microsoft DeepSpeed Team''' +# Copyright (c) Microsoft Corporation. 
+# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team import deepspeed @@ -11,33 +14,21 @@ import pytest class TestSparseCheckpoint(DistributedTest): world_size = 2 - @pytest.mark.parametrize(["to_save_model_has_embedding", - "to_save_model_sparse"], - [ - [False, - False], - [True, - False], - [True, - True], - ]) - @pytest.mark.parametrize(["destination_has_embedding", - "destination_sparse"], - [ - [False, - False], - [True, - False], - [True, - True], - ]) - def test_non_strict_load_sparse(self, - tmpdir, - to_save_model_has_embedding, - to_save_model_sparse, - destination_has_embedding, - destination_sparse): + @pytest.mark.parametrize(["to_save_model_has_embedding", "to_save_model_sparse"], [ + [False, False], + [True, False], + [True, True], + ]) + @pytest.mark.parametrize(["destination_has_embedding", "destination_sparse"], [ + [False, False], + [True, False], + [True, True], + ]) + def test_non_strict_load_sparse(self, tmpdir, to_save_model_has_embedding, to_save_model_sparse, + destination_has_embedding, destination_sparse): + class ModelNoEmbedding(torch.nn.Module): + def __init__(self): super().__init__() self.linear = torch.nn.Linear(3, 1) @@ -46,6 +37,7 @@ class TestSparseCheckpoint(DistributedTest): return self.linear(x) class ModelEmbedding(torch.nn.Module): + def __init__(self): super().__init__() self.emb = torch.nn.Embedding(10, 3) @@ -63,22 +55,24 @@ class TestSparseCheckpoint(DistributedTest): else: model_destination = ModelNoEmbedding() - engine_to_save, _, _, _ = deepspeed.initialize( - model=model_to_save, config={"train_batch_size": 2, "sparse_gradients": to_save_model_sparse} - ) - engine_destination, _, _, _ = deepspeed.initialize( - model=model_destination, config={"train_batch_size": 2, "sparse_gradients": destination_sparse} - ) + engine_to_save, _, _, _ = deepspeed.initialize(model=model_to_save, + config={ + "train_batch_size": 2, + "sparse_gradients": to_save_model_sparse + }) + engine_destination, _, _, _ = deepspeed.initialize(model=model_destination, + config={ + "train_batch_size": 2, + "sparse_gradients": destination_sparse + }) save_folder = os.path.join(tmpdir, 'saved_checkpoint') save_tag = '1' engine_to_save.save_checkpoint(save_folder, tag=save_tag) - is_sparse_destination = isinstance(model_destination, - ModelEmbedding) and destination_sparse - if isinstance(model_destination, - ModelEmbedding) and model_destination.emb.sparse: + is_sparse_destination = isinstance(model_destination, ModelEmbedding) and destination_sparse + if isinstance(model_destination, ModelEmbedding) and model_destination.emb.sparse: assert "emb.weight" in engine_destination.sparse_tensor_module_names engine_destination.load_checkpoint(save_folder, tag=save_tag, @@ -86,9 +80,7 @@ class TestSparseCheckpoint(DistributedTest): load_optimizer_states=False, load_lr_scheduler_states=False, load_module_only=False) - if isinstance(model_destination, - ModelEmbedding) and isinstance(model_to_save, - ModelEmbedding): + if isinstance(model_destination, ModelEmbedding) and isinstance(model_to_save, ModelEmbedding): assert engine_destination.sparse_tensor_module_names == engine_to_save.sparse_tensor_module_names elif isinstance(model_destination, ModelEmbedding): assert not is_sparse_destination or "emb.weight" in engine_destination.sparse_tensor_module_names diff --git a/tests/unit/checkpoint/test_tag_validation.py b/tests/unit/checkpoint/test_tag_validation.py index d9489622305d60e72a49e355979550e5616004d1..b164c31e52b046d596e493bfc0643a234a081dd7 100644 --- 
a/tests/unit/checkpoint/test_tag_validation.py +++ b/tests/unit/checkpoint/test_tag_validation.py @@ -1,4 +1,7 @@ -'''Copyright The Microsoft DeepSpeed Team''' +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team import deepspeed @@ -29,9 +32,7 @@ class TestCheckpointValidationTag(DistributedTest): hidden_dim = 10 model = SimpleModel(hidden_dim) - model, _, _,_ = deepspeed.initialize(config=config_dict, - model=model, - model_parameters=model.parameters()) + model, _, _, _ = deepspeed.initialize(config=config_dict, model=model, model_parameters=model.parameters()) if valid_mode == "FAIL": with pytest.raises(AssertionError): model.save_checkpoint(save_dir=tmpdir, tag=f"tag-{dist.get_rank()}") @@ -58,6 +59,4 @@ class TestCheckpointValidationTag(DistributedTest): model = SimpleModel(hidden_dim) with pytest.raises(deepspeed.DeepSpeedConfigError): - model, _, _,_ = deepspeed.initialize(config=config_dict, - model=model, - model_parameters=model.parameters()) + model, _, _, _ = deepspeed.initialize(config=config_dict, model=model, model_parameters=model.parameters()) diff --git a/tests/unit/checkpoint/test_zero_optimizer.py b/tests/unit/checkpoint/test_zero_optimizer.py index 7de8e9bff90825d69eb67e5afe6fdfdbdeb800b7..2325f9cf05f70a156a19a873b7d5437bf127cbc0 100644 --- a/tests/unit/checkpoint/test_zero_optimizer.py +++ b/tests/unit/checkpoint/test_zero_optimizer.py @@ -1,4 +1,7 @@ -'''Copyright The Microsoft DeepSpeed Team''' +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team import deepspeed from deepspeed.ops.op_builder import CPUAdamBuilder @@ -15,27 +18,11 @@ import pytest class TestZeROCheckpoint(DistributedTest): world_size = 2 - @pytest.mark.parametrize('zero_stage, use_cpu_offload, adam_optimizer', - [(1, - False, - 'Adam'), - (2, - False, - 'Adam'), - (2, - True, - 'deepspeed_adam'), - (3, - False, - 'Adam'), - (3, - True, - 'deepspeed_adam')]) - def test_load_optimizer_state(self, - tmpdir, - zero_stage, - use_cpu_offload, - adam_optimizer): + @pytest.mark.parametrize('zero_stage, use_cpu_offload, adam_optimizer', [(1, False, 'Adam'), (2, False, 'Adam'), + (2, True, 'deepspeed_adam'), + (3, False, 'Adam'), + (3, True, 'deepspeed_adam')]) + def test_load_optimizer_state(self, tmpdir, zero_stage, use_cpu_offload, adam_optimizer): if use_cpu_offload and not deepspeed.ops.__compatible_ops__[CPUAdamBuilder.NAME]: pytest.skip("cpu-adam is not compatible") @@ -46,8 +33,7 @@ class TestZeROCheckpoint(DistributedTest): "type": 'Adam', "params": { "lr": 0.00015, - "betas": [0.8, - 0.999], + "betas": [0.8, 0.999], "eps": 1e-8, "weight_decay": 3e-7 } @@ -70,33 +56,13 @@ class TestZeROCheckpoint(DistributedTest): else: models = [SimpleModel(hidden_dim, empty_grad=False) for _ in range(2)] - checkpoint_correctness_verification(config_dict, - models, - hidden_dim, - tmpdir, - load_optimizer_states=True) + checkpoint_correctness_verification(config_dict, models, hidden_dim, tmpdir, load_optimizer_states=True) - @pytest.mark.parametrize('zero_stage, use_cpu_offload, adam_optimizer', - [(1, - False, - "Adam"), - (2, - False, - "Adam"), - (2, - True, - 'deepspeed_adam'), - (3, - False, - 'Adam'), - (3, - True, - 'deepspeed_adam')]) - def test_not_load_optimizer_state(self, - tmpdir, - zero_stage, - use_cpu_offload, - adam_optimizer): + @pytest.mark.parametrize('zero_stage, use_cpu_offload, adam_optimizer', [(1, False, "Adam"), (2, False, "Adam"), + (2, True, 'deepspeed_adam'), + (3, False, 'Adam'), + (3, True, 
'deepspeed_adam')]) + def test_not_load_optimizer_state(self, tmpdir, zero_stage, use_cpu_offload, adam_optimizer): if use_cpu_offload and not deepspeed.ops.__compatible_ops__[CPUAdamBuilder.NAME]: pytest.skip("cpu-adam is not compatible") @@ -107,8 +73,7 @@ class TestZeROCheckpoint(DistributedTest): "type": 'Adam', "params": { "lr": 0.00015, - "betas": [0.8, - 0.999], + "betas": [0.8, 0.999], "eps": 1e-8, "weight_decay": 3e-7 } @@ -131,11 +96,7 @@ class TestZeROCheckpoint(DistributedTest): else: models = [SimpleModel(hidden_dim, empty_grad=False) for _ in range(2)] - checkpoint_correctness_verification(config_dict, - models, - hidden_dim, - tmpdir, - load_optimizer_states=False) + checkpoint_correctness_verification(config_dict, models, hidden_dim, tmpdir, load_optimizer_states=False) @pytest.mark.parametrize('zero_stage', [1, 2]) def test_hybrid_optimizer_state(self, tmpdir, zero_stage): @@ -186,11 +147,7 @@ class TestZeROCheckpoint(DistributedTest): else: models = [SimpleModel(hidden_dim, empty_grad=False) for _ in range(2)] - checkpoint_correctness_verification(config_dict, - models, - hidden_dim, - tmpdir, - load_module_only=True) + checkpoint_correctness_verification(config_dict, models, hidden_dim, tmpdir, load_module_only=True) class ws4_model_checkpoint(DistributedFixture): @@ -214,22 +171,15 @@ class ws4_model_checkpoint(DistributedFixture): hidden_dim = 10 model = SimpleModel(hidden_dim) - model, _, _, _ = deepspeed.initialize(config=ds_config, - model=model, - model_parameters=model.parameters()) - data_loader = random_dataloader(model=model, - total_samples=8, - hidden_dim=hidden_dim, - device=model.device) + model, _, _, _ = deepspeed.initialize(config=ds_config, model=model, model_parameters=model.parameters()) + data_loader = random_dataloader(model=model, total_samples=8, hidden_dim=hidden_dim, device=model.device) for n, batch in enumerate(data_loader): loss = model(batch[0], batch[1]) model.backward(loss) model.step() if load_optim: - torch.save(model.optimizer.optimizer.state_dict(), - os.path.join(class_tmpdir, - 'opt-state-dict')) + torch.save(model.optimizer.optimizer.state_dict(), os.path.join(class_tmpdir, 'opt-state-dict')) model.save_checkpoint(class_tmpdir) @@ -239,11 +189,7 @@ class ws4_model_checkpoint(DistributedFixture): class TestZeROElasticCheckpoint(DistributedTest): world_size = 2 - def test_elastic_checkpoint_fixed_dp(self, - tmpdir, - elastic_save, - elastic_load, - load_optim): + def test_elastic_checkpoint_fixed_dp(self, tmpdir, elastic_save, elastic_load, load_optim): ds_config = { "train_batch_size": 2, "optimizer": { @@ -263,54 +209,39 @@ class TestZeROElasticCheckpoint(DistributedTest): # torch 1.2.* stores raw tensor id numbers in checkpoint state which leads to # false positive mismatches in checkpoint state comparisons. # Newer torch versions store tensor ids as 0, 1, 2, ... 
- expected_mismatch_keys = [] if required_minimum_torch_version(1, - 4) else ['params'] + expected_mismatch_keys = [] if required_minimum_torch_version(1, 4) else ['params'] models = [SimpleModel(hidden_dim) for _ in range(2)] model, _, _, _ = deepspeed.initialize(config=ds_config, - model=models[0], - model_parameters=models[0].parameters()) - data_loader = random_dataloader(model=model, - total_samples=8, - hidden_dim=hidden_dim, - device=model.device) + model=models[0], + model_parameters=models[0].parameters()) + data_loader = random_dataloader(model=model, total_samples=8, hidden_dim=hidden_dim, device=model.device) for n, batch in enumerate(data_loader): loss = model(batch[0], batch[1]) model.backward(loss) model.step() if load_optim: - torch.save(model.optimizer.optimizer.state_dict(), - os.path.join(tmpdir, - 'opt-state-dict')) + torch.save(model.optimizer.optimizer.state_dict(), os.path.join(tmpdir, 'opt-state-dict')) model.save_checkpoint(tmpdir) ds_config["zero_optimization"]["elastic_checkpoint"] = elastic_load model, _, _, _ = deepspeed.initialize(config=ds_config, - model=models[1], - model_parameters=models[1].parameters()) + model=models[1], + model_parameters=models[1].parameters()) model.load_checkpoint(tmpdir, load_optimizer_states=load_optim) if load_optim: saved_sd = torch.load(os.path.join(tmpdir, 'opt-state-dict')) curr_sd = model.optimizer.optimizer.state_dict() for curr_param_group, saved_param_group in zip(curr_sd['param_groups'], saved_sd['param_groups']): - compare_state_dicts(curr_param_group, - saved_param_group, - expected_mismatch_keys) + compare_state_dicts(curr_param_group, saved_param_group, expected_mismatch_keys) - data_loader = random_dataloader(model=model, - total_samples=8, - hidden_dim=hidden_dim, - device=model.device) + data_loader = random_dataloader(model=model, total_samples=8, hidden_dim=hidden_dim, device=model.device) for n, batch in enumerate(data_loader): loss = model(batch[0], batch[1]) model.backward(loss) model.step() - def test_elastic_checkpoint_change_dp(self, - ws4_model_checkpoint, - class_tmpdir, - elastic_save, - elastic_load, + def test_elastic_checkpoint_change_dp(self, ws4_model_checkpoint, class_tmpdir, elastic_save, elastic_load, load_optim): ds_config = { "train_batch_size": 4, @@ -330,9 +261,7 @@ class TestZeROElasticCheckpoint(DistributedTest): model = SimpleModel(hidden_dim) # Load checkpoint with dp world size = 2 - model, _, _, _ = deepspeed.initialize(config=ds_config, - model=model, - model_parameters=model.parameters()) + model, _, _, _ = deepspeed.initialize(config=ds_config, model=model, model_parameters=model.parameters()) if load_optim: with pytest.raises(deepspeed.runtime.zero.utils.ZeRORuntimeException): model.load_checkpoint(class_tmpdir, load_optimizer_states=load_optim) @@ -361,9 +290,7 @@ class TestZeROSaveLoadEdgeCase(DistributedTest): hidden_dim = 10 model = SimpleModel(hidden_dim) - ds_model = create_deepspeed_model(config_dict=config_dict, - model=model, - base_optimizer=None) + ds_model = create_deepspeed_model(config_dict=config_dict, model=model, base_optimizer=None) ds_model.save_checkpoint(tmpdir) ds_model.load_checkpoint(tmpdir, load_optimizer_states=False, @@ -390,9 +317,7 @@ class TestZeROSaveLoadEdgeCase(DistributedTest): # 1. 
pretrain a model and save it dtype = torch.half - ds_model = create_deepspeed_model(config_dict=config_dict, - model=model, - base_optimizer=None) + ds_model = create_deepspeed_model(config_dict=config_dict, model=model, base_optimizer=None) data_loader = random_dataloader(model=ds_model, total_samples=1, hidden_dim=hidden_dim, @@ -402,12 +327,12 @@ class TestZeROSaveLoadEdgeCase(DistributedTest): loss = ds_model(batch[0], batch[1]) ds_model.backward(loss) ds_model.step() + + ds_model.empty_partition_cache() ds_model.save_checkpoint(tmpdir) # 2. load and immediately save a model with a fresh ds engine - ds_model = create_deepspeed_model(config_dict=config_dict, - model=model, - base_optimizer=None) + ds_model = create_deepspeed_model(config_dict=config_dict, model=model, base_optimizer=None) ds_model.load_checkpoint(tmpdir, load_optimizer_states=False, load_lr_scheduler_states=False, @@ -438,9 +363,7 @@ class TestZeROSaveLoadEdgeCase(DistributedTest): # This test reproduces a bug where one tries to retrieve a 16bit model before grad_accum # cycle was completed. # So we config grad_accum=2 and step only once and save_16bit_model - ds_model = create_deepspeed_model(config_dict=config_dict, - model=model, - base_optimizer=None) + ds_model = create_deepspeed_model(config_dict=config_dict, model=model, base_optimizer=None) data_loader = random_dataloader(model=ds_model, total_samples=2, @@ -453,8 +376,96 @@ class TestZeROSaveLoadEdgeCase(DistributedTest): ds_model.backward(loss) ds_model.step() + ds_model.empty_partition_cache() + # we stepped only once, and now save 16bit model before gradient_accumulation_steps=2 is complete ds_model.save_16bit_model(tmpdir, "model.pt") # let's test just as well that we can save the checkpoint too ds_model.save_checkpoint(tmpdir) + + +class TestZeROCheckpointFrozenWeights(DistributedTest): + world_size = 2 + + @pytest.mark.parametrize('zero_stage', [1, 2, 3]) + def test_load_optimizer_state(self, tmpdir, zero_stage): + + config_dict = { + "train_batch_size": 2, + "steps_per_print": 1, + "optimizer": { + "type": 'Adam', + "params": { + "lr": 0.00015, + "betas": [0.8, 0.999], + "eps": 1e-8, + "weight_decay": 3e-7 + } + }, + "fp16": { + "enabled": True, + "initial_scale_power": 8 + }, + "wall_clock_breakdown": True, + "zero_optimization": { + "stage": zero_stage + } + } + hidden_dim = 10 + + with deepspeed.zero.Init(enabled=zero_stage == 3): + models = [SimpleFrozenModel(hidden_dim, empty_grad=False) for _ in range(2)] + + checkpoint_correctness_verification(config_dict, models, hidden_dim, tmpdir, load_optimizer_states=True) + + @pytest.mark.parametrize('zero_stage', [1, 2, 3]) + def test_not_load_optimizer_state(self, tmpdir, zero_stage): + + config_dict = { + "train_batch_size": 2, + "steps_per_print": 1, + "optimizer": { + "type": 'Adam', + "params": { + "lr": 0.00015, + "betas": [0.8, 0.999], + "eps": 1e-8, + "weight_decay": 3e-7 + } + }, + "fp16": { + "enabled": True + }, + "zero_optimization": { + "stage": zero_stage + } + } + hidden_dim = 10 + + with deepspeed.zero.Init(enabled=zero_stage == 3): + models = [SimpleFrozenModel(hidden_dim, empty_grad=False) for _ in range(2)] + + checkpoint_correctness_verification(config_dict, models, hidden_dim, tmpdir, load_optimizer_states=False) + + @pytest.mark.parametrize('zero_stage', [1, 2, 3]) + def test_load_module_only(self, tmpdir, zero_stage): + config_dict = { + "train_batch_size": 2, + "optimizer": { + "type": 'Adam' + }, + "fp16": { + "enabled": True, + "initial_scale_power": 8 + }, + "zero_optimization": { 
+ "stage": zero_stage, + } + } + hidden_dim = 10 + + with deepspeed.zero.Init(enabled=zero_stage == 3): + models = [SimpleFrozenModel(hidden_dim, empty_grad=False) for _ in range(2)] + + checkpoint_correctness_verification(config_dict, models, hidden_dim, tmpdir, load_module_only=True) diff --git a/tests/unit/comm/test_dist.py b/tests/unit/comm/test_dist.py index 6005c926f793b73ed61219c39e0de194d9755ee6..b98a1f52e874441020ca44ca51f702eb8c9085c7 100644 --- a/tests/unit/comm/test_dist.py +++ b/tests/unit/comm/test_dist.py @@ -1,4 +1,7 @@ -'''Copyright The Microsoft DeepSpeed Team''' +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team import os import torch @@ -31,6 +34,7 @@ def greeting(request): class TestDistArgs(DistributedTest): world_size = 2 """ Classes that use DistributedTest class must define a test* method """ + @pytest.mark.parametrize("shape", ["icosahedron"]) def test(self, number, color, shape, greeting): """Ensure that we can parse args to DistributedTest methods. """ @@ -118,8 +122,7 @@ class TestDistInit(DistributedTest): init_distributed = False def test_already_init(self, dist_init_required): - torch.distributed.init_process_group( - get_accelerator().communication_backend_name()) + torch.distributed.init_process_group(get_accelerator().communication_backend_name()) deepspeed.init_distributed(get_accelerator().communication_backend_name(), dist_init_required=dist_init_required) @@ -130,9 +133,8 @@ class TestDistInit(DistributedTest): else: # torch.dist is not done and for some reason the user says they don't want it done with pytest.raises(Exception): - deepspeed.init_distributed( - get_accelerator().communication_backend_name(), - dist_init_required=dist_init_required) + deepspeed.init_distributed(get_accelerator().communication_backend_name(), + dist_init_required=dist_init_required) class TestDistInitNoEnv(DistributedTest): @@ -141,14 +143,12 @@ class TestDistInitNoEnv(DistributedTest): set_dist_env = False def test(self): - torch.distributed.init_process_group( - backend=get_accelerator().communication_backend_name(), - init_method=f"tcp://127.0.0.1:{get_master_port()}", - world_size=1, - rank=0) + torch.distributed.init_process_group(backend=get_accelerator().communication_backend_name(), + init_method=f"tcp://127.0.0.1:{get_master_port()}", + world_size=1, + rank=0) assert torch.distributed.is_initialized() - deepspeed.init_distributed(get_accelerator().communication_backend_name(), - auto_mpi_discovery=True) + deepspeed.init_distributed(get_accelerator().communication_backend_name(), auto_mpi_discovery=True) @pytest.mark.parametrize("dist_init_required", [True, False]) @@ -156,45 +156,26 @@ class TestDistInitWithModel(DistributedTest): init_distributed = False def test_already_init(self, dist_init_required): - torch.distributed.init_process_group( - get_accelerator().communication_backend_name()) + torch.distributed.init_process_group(get_accelerator().communication_backend_name()) model = SimpleModel(4) - config_dict = { - "train_micro_batch_size_per_gpu": 1, - "optimizer": { - "type": "Adam", - "params": {} - } - } - engine, *_ = deepspeed.initialize( - model=model, - config=config_dict, - model_parameters=model.parameters(), - dist_init_required=dist_init_required - ) + config_dict = {"train_micro_batch_size_per_gpu": 1, "optimizer": {"type": "Adam", "params": {}}} + engine, *_ = deepspeed.initialize(model=model, + config=config_dict, + model_parameters=model.parameters(), + dist_init_required=dist_init_required) def 
test_no_init(self, dist_init_required): model = SimpleModel(4) - config_dict = { - "train_micro_batch_size_per_gpu": 1, - "optimizer": { - "type": "Adam", - "params": {} - } - } + config_dict = {"train_micro_batch_size_per_gpu": 1, "optimizer": {"type": "Adam", "params": {}}} if dist_init_required: - engine, *_ = deepspeed.initialize( - model=model, - config=config_dict, - model_parameters=model.parameters(), - dist_init_required=dist_init_required - ) + engine, *_ = deepspeed.initialize(model=model, + config=config_dict, + model_parameters=model.parameters(), + dist_init_required=dist_init_required) else: # torch.dist is not done and for some reason the user says they don't want it done with pytest.raises(Exception): - engine, *_ = deepspeed.initialize( - model=model, - config=config_dict, - model_parameters=model.parameters(), - dist_init_required=dist_init_required - ) + engine, *_ = deepspeed.initialize(model=model, + config=config_dict, + model_parameters=model.parameters(), + dist_init_required=dist_init_required) diff --git a/tests/unit/common.py b/tests/unit/common.py index 35e8f3983072f0d858b15c855d554035cf0e8be9..5eca38cc83f86e66a8f9ca60f64be2b54a492d8b 100644 --- a/tests/unit/common.py +++ b/tests/unit/common.py @@ -1,4 +1,7 @@ -'''Copyright The Microsoft DeepSpeed Team''' +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team import os import time @@ -49,12 +52,10 @@ def set_accelerator_visible(): # CUDA_VISIBLE_DEVICES is not set, discover it using accelerator specific command instead import subprocess if get_accelerator().device_name() == 'cuda': - is_rocm_pytorch = hasattr(torch.version, - 'hip') and torch.version.hip is not None + is_rocm_pytorch = hasattr(torch.version, 'hip') and torch.version.hip is not None if is_rocm_pytorch: rocm_smi = subprocess.check_output(['rocm-smi', '--showid']) - gpu_ids = filter(lambda s: 'GPU' in s, - rocm_smi.decode('utf-8').strip().split('\n')) + gpu_ids = filter(lambda s: 'GPU' in s, rocm_smi.decode('utf-8').strip().split('\n')) num_gpus = len(list(gpu_ids)) else: nvidia_smi = subprocess.check_output(['nvidia-smi', '--list-gpus']) @@ -124,9 +125,9 @@ class DistributedExec(ABC): return fixture_kwargs def _launch_procs(self, num_procs): - if torch.cuda.is_available() and torch.cuda.device_count() < num_procs: + if get_accelerator().is_available() and get_accelerator().device_count() < num_procs: pytest.skip( - f"Skipping test because not enough GPUs are available: {num_procs} required, {torch.cuda.device_count()} available" + f"Skipping test because not enough GPUs are available: {num_procs} required, {get_accelerator().device_count()} available" ) mp.set_start_method('forkserver', force=True) skip_msg = mp.Queue() # Allows forked processes to share pytest.skip reason @@ -164,11 +165,9 @@ class DistributedExec(ABC): p.terminate() pytest.fail(f'Worker {rank} hung.', pytrace=False) if p.exitcode < 0: - pytest.fail(f'Worker {rank} killed by signal {-p.exitcode}', - pytrace=False) + pytest.fail(f'Worker {rank} killed by signal {-p.exitcode}', pytrace=False) if p.exitcode > 0: - pytest.fail(f'Worker {rank} exited with code {p.exitcode}', - pytrace=False) + pytest.fail(f'Worker {rank} exited with code {p.exitcode}', pytrace=False) if not skip_msg.empty(): # This assumed all skip messages are the same, it may be useful to @@ -272,9 +271,7 @@ class DistributedFixture(DistributedExec): def __init__(self): assert isinstance(self.world_size, int), "Only one world size is allowed for distributed fixtures" 
self.__name__ = type(self).__name__ - _pytestfixturefunction = FixtureFunctionMarker(scope="function", - params=None, - name=self.__name__) + _pytestfixturefunction = FixtureFunctionMarker(scope="function", params=None, name=self.__name__) class DistributedTest(DistributedExec): diff --git a/tests/unit/compression/test_compression.py b/tests/unit/compression/test_compression.py index 7d1f02e771cbdc486803786e7b3f31b4e95ad6f9..5e3097339ee986e195efab8b17c646a6e1ee623a 100644 --- a/tests/unit/compression/test_compression.py +++ b/tests/unit/compression/test_compression.py @@ -1,4 +1,7 @@ -'''Copyright The Microsoft DeepSpeed Team''' +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team import torch import pytest @@ -12,12 +15,10 @@ from deepspeed.compression.basic_layer import LinearLayer_Compress, ColumnParall from deepspeed.compression.helper import convert_conv1d_to_linear from deepspeed.accelerator import get_accelerator from unit.common import DistributedTest +from unit.util import required_minimum_torch_version, required_maximum_torch_version -TORCH_MAJOR = int(torch.__version__.split('.')[0]) -TORCH_MINOR = int(torch.__version__.split('.')[1]) -pytestmark = pytest.mark.skipif( - TORCH_MAJOR < 1 or (TORCH_MAJOR == 1 and TORCH_MINOR < 5), - reason='Megatron-LM package requires Pytorch version 1.5 or above') +pytestmark = pytest.mark.skipif(not required_minimum_torch_version(major_version=1, minor_version=5), + reason='Megatron-LM package requires Pytorch version 1.5 or above') def reset_random(seed=1234): @@ -73,6 +74,7 @@ class Conv1D(torch.nn.Module): nf (`int`): The number of output features. nx (`int`): The number of input features. """ + def __init__(self, nf, nx): super().__init__() self.nf = nf @@ -95,6 +97,7 @@ def create_conv1d_model(): class TestCompression(DistributedTest): + def setup_method(self, method): reset_random() @@ -132,8 +135,7 @@ class TestCompression(DistributedTest): "target_bits": 8, "quantization_period": 50 }, - "modules": ["attention.self", - "intermediate"] + "modules": ["attention.self", "intermediate"] }, "wq2": { "params": { @@ -205,9 +207,7 @@ class TestCompression(DistributedTest): "dense_ratio": 0.5 }, "modules": ["attention.output.dense"], - "related_modules": [["self.query", - "self.key", - "self.value"]] + "related_modules": [["self.query", "self.key", "self.value"]] } } } @@ -220,17 +220,14 @@ class TestCompression(DistributedTest): model = create_bert_model() compressed_model = init_compression(model, self.get_ds_config()) - assert isinstance(compressed_model.layer[0].attention.self.query, - LinearLayer_Compress) - assert isinstance(compressed_model.layer[0].attention.self.key, - LinearLayer_Compress) - assert isinstance(compressed_model.layer[0].attention.self.value, - LinearLayer_Compress) + assert isinstance(compressed_model.layer[0].attention.self.query, LinearLayer_Compress) + assert isinstance(compressed_model.layer[0].attention.self.key, LinearLayer_Compress) + assert isinstance(compressed_model.layer[0].attention.self.value, LinearLayer_Compress) + @pytest.mark.skip(reason="megatron-lm is currently broken so this test cannot be run.") def test_mpu_compress(self, tmpdir): - #from megatron import mpu - import sys - sys.path.append(r"/home/aishsh/megatron-lm") + if not required_maximum_torch_version(major_version=1, minor_version=13): + pytest.skip("megatron not compatible with torch >1.13") from megatron import mpu args_defaults = { 'num_layers': 2, @@ -242,21 +239,14 @@ class 
TestCompression(DistributedTest): model = get_gpt2_model(args_defaults) compressed_model = init_compression(model, self.get_ds_config(), mpu=mpu) - assert isinstance( - compressed_model.module.language_model.transformer.layers[0].attention. - query_key_value, - ColumnParallelLinear_Compress) - assert isinstance( - compressed_model.module.language_model.transformer.layers[0].attention.dense, - RowParallelLinear_Compress) - assert isinstance( - compressed_model.module.language_model.transformer.layers[0].mlp. - dense_h_to_4h, - ColumnParallelLinear_Compress) - assert isinstance( - compressed_model.module.language_model.transformer.layers[0].mlp. - dense_4h_to_h, - RowParallelLinear_Compress) + assert isinstance(compressed_model.module.language_model.transformer.layers[0].attention.query_key_value, + ColumnParallelLinear_Compress) + assert isinstance(compressed_model.module.language_model.transformer.layers[0].attention.dense, + RowParallelLinear_Compress) + assert isinstance(compressed_model.module.language_model.transformer.layers[0].mlp.dense_h_to_4h, + ColumnParallelLinear_Compress) + assert isinstance(compressed_model.module.language_model.transformer.layers[0].mlp.dense_4h_to_h, + RowParallelLinear_Compress) def test_conv1d_convertion(self, tmpdir): model = create_conv1d_model() diff --git a/tests/unit/elasticity/test_elastic.py b/tests/unit/elasticity/test_elastic.py index e29b2a22e825b900ccb86dcd818da781ac185b41..2cd76c8c4fce5fc2ce87929c6e07988a800d9201 100644 --- a/tests/unit/elasticity/test_elastic.py +++ b/tests/unit/elasticity/test_elastic.py @@ -1,4 +1,7 @@ -'''Copyright The Microsoft DeepSpeed Team''' +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team import pytest import deepspeed @@ -14,10 +17,7 @@ def ds_config(): "elasticity": { "enabled": True, "max_train_batch_size": 10000, - "micro_batch_sizes": [8, - 12, - 16, - 17], + "micro_batch_sizes": [8, 12, 16, 17], "min_gpus": 32, "max_gpus": 1500, "min_time": 20, @@ -28,9 +28,8 @@ def ds_config(): def test_basic_10k(ds_config): - final_batch_size, valid_gpus = deepspeed.elasticity.compute_elastic_config( - ds_config=ds_config, - target_deepspeed_version=ds_version) + final_batch_size, valid_gpus = deepspeed.elasticity.compute_elastic_config(ds_config=ds_config, + target_deepspeed_version=ds_version) for gpu_num in valid_gpus: assert final_batch_size % gpu_num == 0, f"Batch {final_batch_size} is not divisible by GPU count {gpu_num}" @@ -49,61 +48,51 @@ def test_basic_10k(ds_config): def test_old_version(ds_config): with pytest.raises(deepspeed.elasticity.config.ElasticityError): - final_batch_size, valid_gpus = deepspeed.elasticity.compute_elastic_config( - ds_config=ds_config, - target_deepspeed_version="0.2") + final_batch_size, valid_gpus = deepspeed.elasticity.compute_elastic_config(ds_config=ds_config, + target_deepspeed_version="0.2") def test_disabled(ds_config): ds_config['elasticity']['enabled'] = False with pytest.raises(deepspeed.elasticity.config.ElasticityError): - final_batch_size, valid_gpus = deepspeed.elasticity.compute_elastic_config( - ds_config=ds_config, - target_deepspeed_version=ds_version) + final_batch_size, valid_gpus = deepspeed.elasticity.compute_elastic_config(ds_config=ds_config, + target_deepspeed_version=ds_version) def test_valid_world_size(ds_config): final_batch_size, valid_gpus, mbsize = deepspeed.elasticity.compute_elastic_config( - ds_config=ds_config, - target_deepspeed_version=ds_version, - world_size=64) + ds_config=ds_config, 
target_deepspeed_version=ds_version, world_size=64) assert mbsize == 17 def test_invalid_world_size(ds_config): with pytest.raises(deepspeed.elasticity.config.ElasticityIncompatibleWorldSize): final_batch_size, valid_gpus, mbsize = deepspeed.elasticity.compute_elastic_config( - ds_config=ds_config, - target_deepspeed_version=ds_version, - world_size=128) + ds_config=ds_config, target_deepspeed_version=ds_version, world_size=128) def test_future_elastic_version(ds_config): ds_config['elasticity']['version'] = '0.3' with pytest.raises(deepspeed.elasticity.config.ElasticityError): - deepspeed.elasticity.compute_elastic_config(ds_config=ds_config, - target_deepspeed_version=ds_version) + deepspeed.elasticity.compute_elastic_config(ds_config=ds_config, target_deepspeed_version=ds_version) def test_missing_max_batch(ds_config): del ds_config['elasticity']['max_train_batch_size'] with pytest.raises(deepspeed.elasticity.config.ElasticityError): - deepspeed.elasticity.compute_elastic_config(ds_config=ds_config, - target_deepspeed_version=ds_version) + deepspeed.elasticity.compute_elastic_config(ds_config=ds_config, target_deepspeed_version=ds_version) def test_missing_micro_batch(ds_config): del ds_config['elasticity']['micro_batch_sizes'] with pytest.raises(deepspeed.elasticity.config.ElasticityError): - deepspeed.elasticity.compute_elastic_config(ds_config=ds_config, - target_deepspeed_version=ds_version) + deepspeed.elasticity.compute_elastic_config(ds_config=ds_config, target_deepspeed_version=ds_version) def test_empty_config(): ds_config = {"elasticity": {"enabled": True}} with pytest.raises(deepspeed.elasticity.config.ElasticityError): - deepspeed.elasticity.compute_elastic_config(ds_config=ds_config, - target_deepspeed_version=ds_version) + deepspeed.elasticity.compute_elastic_config(ds_config=ds_config, target_deepspeed_version=ds_version) def test_model_parallel_v1_invalid(ds_config): @@ -112,8 +101,7 @@ def test_model_parallel_v1_invalid(ds_config): ds_config["elasticity"]["version"] = 0.1 with pytest.raises(deepspeed.elasticity.config.ElasticityError): - deepspeed.elasticity.compute_elastic_config(ds_config=ds_config, - target_deepspeed_version=ds_version) + deepspeed.elasticity.compute_elastic_config(ds_config=ds_config, target_deepspeed_version=ds_version) def test_model_parallel_v2_invalid(ds_config): @@ -133,37 +121,17 @@ def test_model_parallel_v2_valid(ds_config): ds_config["elasticity"]["version"] = 0.2 os.environ["WORLD_SIZE"] = str(16) - deepspeed.elasticity.compute_elastic_config(ds_config=ds_config, - target_deepspeed_version=ds_version) + deepspeed.elasticity.compute_elastic_config(ds_config=ds_config, target_deepspeed_version=ds_version) os.environ.pop("WORLD_SIZE") -@pytest.mark.parametrize('key, value', - [('micro_batch_sizes', - [1, - 4, - -1, - 2, - -10]), - ('min_gpus', - -1), - ('max_gpus', - -1), - ('micro_batch_sizes', - 5), - ('micro_batch_sizes', - ['a', - None, - 0.5]), - ('micro_batch_sizes', - [2, - 0.5, - 4])]) +@pytest.mark.parametrize('key, value', [('micro_batch_sizes', [1, 4, -1, 2, -10]), ('min_gpus', -1), ('max_gpus', -1), + ('micro_batch_sizes', 5), ('micro_batch_sizes', ['a', None, 0.5]), + ('micro_batch_sizes', [2, 0.5, 4])]) def test_invalid_config_values(key, value, ds_config): ds_config['elasticity'][key] = value with pytest.raises(deepspeed.elasticity.config.ElasticityError): - deepspeed.elasticity.compute_elastic_config(ds_config=ds_config, - target_deepspeed_version=ds_version) + deepspeed.elasticity.compute_elastic_config(ds_config=ds_config, 
target_deepspeed_version=ds_version) def test_proper_mbsz(ds_config): @@ -171,9 +139,7 @@ def test_proper_mbsz(ds_config): ds_config["elasticity"]["micro_batch_sizes"] = [1, 2, 3, 7] ds_config["elasticity"]["min_gpus"] = 1 final_batch_size, valid_gpus, mbsize = deepspeed.elasticity.compute_elastic_config( - ds_config=ds_config, - target_deepspeed_version=ds_version, - world_size=7) + ds_config=ds_config, target_deepspeed_version=ds_version, world_size=7) assert mbsize == 3 @@ -194,10 +160,7 @@ class TestNonElasticBatchParams(DistributedTest): "elasticity": { "enabled": True, "max_train_batch_size": 4, - "micro_batch_sizes": [1, - 2, - 3, - 4], + "micro_batch_sizes": [1, 2, 3, 4], "min_gpus": 1, "max_gpus": 4, "min_time": 20, @@ -209,9 +172,7 @@ class TestNonElasticBatchParams(DistributedTest): model = SimpleModel(hidden_dim, empty_grad=False) with pytest.raises(deepspeed.elasticity.config.ElasticityError): - model, _, _,_ = deepspeed.initialize(config=config_dict, - model=model, - model_parameters=model.parameters()) + model, _, _, _ = deepspeed.initialize(config=config_dict, model=model, model_parameters=model.parameters()) class TestNonElasticBatchParamsWithOverride(DistributedTest): @@ -231,10 +192,7 @@ class TestNonElasticBatchParamsWithOverride(DistributedTest): "elasticity": { "enabled": True, "max_train_batch_size": 4, - "micro_batch_sizes": [1, - 2, - 3, - 4], + "micro_batch_sizes": [1, 2, 3, 4], "min_gpus": 1, "max_gpus": 4, "min_time": 20, @@ -245,9 +203,7 @@ class TestNonElasticBatchParamsWithOverride(DistributedTest): hidden_dim = 10 model = SimpleModel(hidden_dim, empty_grad=False) - model, _, _,_ = deepspeed.initialize(config=config_dict, - model=model, - model_parameters=model.parameters()) + model, _, _, _ = deepspeed.initialize(config=config_dict, model=model, model_parameters=model.parameters()) class TestElasticConfigChanged(DistributedTest): @@ -267,10 +223,7 @@ class TestElasticConfigChanged(DistributedTest): "elasticity": { "enabled": True, "max_train_batch_size": 4, - "micro_batch_sizes": [1, - 2, - 3, - 4], + "micro_batch_sizes": [1, 2, 3, 4], "min_gpus": 1, "max_gpus": 4, "min_time": 20, @@ -287,6 +240,4 @@ class TestElasticConfigChanged(DistributedTest): model = SimpleModel(hidden_dim, empty_grad=False) with pytest.raises(deepspeed.elasticity.config.ElasticityError): - model, _, _,_ = deepspeed.initialize(config=config_dict, - model=model, - model_parameters=model.parameters()) + model, _, _, _ = deepspeed.initialize(config=config_dict, model=model, model_parameters=model.parameters()) diff --git a/inference/test_checkpoint_sharding.py b/tests/unit/inference/test_checkpoint_sharding.py similarity index 72% rename from inference/test_checkpoint_sharding.py rename to tests/unit/inference/test_checkpoint_sharding.py index 09fb023072f98fa671616fde876d4b2fe7f0fc8a..611e6fc69edf758695d91da2aa6f0aa2525e57e0 100644 --- a/inference/test_checkpoint_sharding.py +++ b/tests/unit/inference/test_checkpoint_sharding.py @@ -1,4 +1,7 @@ -'''Copyright The Microsoft DeepSpeed Team''' +# Copyright (c) Microsoft Corporation. 
+# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team import os import pytest @@ -6,12 +9,11 @@ import torch import deepspeed from deepspeed.model_implementations import DeepSpeedTransformerInference from unit.common import DistributedTest, DistributedFixture -#from deepspeed.ops.transformers import AutoConfig, AutoModelForCausalLM -#from deepspeed.ops.transformers import AutoConfig, AutoModelForCausalLM -from ../ops.transformers import AutoConfig, AutoModelForCausalLM +from transformers import AutoConfig, AutoModelForCausalLM def check_dtype(model, expected_dtype): + def find_dtype(module): for child in module.children(): if isinstance(child, DeepSpeedTransformerInference): @@ -23,17 +25,11 @@ def check_dtype(model, expected_dtype): found_dtype = find_dtype(model) assert found_dtype, "Did not find DeepSpeedTransformerInference in model" - assert ( - found_dtype == expected_dtype - ), f"Expected transformer dtype {expected_dtype}, but found {found_dtype}" + assert (found_dtype == expected_dtype), f"Expected transformer dtype {expected_dtype}, but found {found_dtype}" -@pytest.fixture(params=[ - "bigscience/bloom-560m", - "EleutherAI/gpt-j-6B", - "EleutherAI/gpt-neo-125M", - "facebook/opt-125m" -]) +@pytest.fixture( + params=["bigscience/bloom-560m", "EleutherAI/gpt-j-6B", "EleutherAI/gpt-neo-125M", "facebook/opt-125m"]) def model_name(request): return request.param @@ -57,13 +53,11 @@ class save_shard(DistributedFixture): "tensor_parallel": { "tp_size": world_size }, - "save_mp_checkpoint_path": os.path.join(str(class_tmpdir), - model_name), + "save_mp_checkpoint_path": os.path.join(str(class_tmpdir), model_name), } # Load model and save sharded checkpoint - model = AutoModelForCausalLM.from_pretrained(model_name, - torch_dtype=torch.float16) + model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16) model = deepspeed.init_inference(model, config=inf_config) @@ -80,17 +74,14 @@ class TestCheckpointShard(DistributedTest): "tensor_parallel": { "tp_size": world_size }, - "checkpoint": os.path.join(class_tmpdir, - model_name, - "ds_inference_config.json"), + "checkpoint": os.path.join(class_tmpdir, model_name, "ds_inference_config.json"), } # Load model on meta tensors model_config = AutoConfig.from_pretrained(model_name) # Note that we use half precision to load initially, even for int8 with deepspeed.OnDevice(dtype=torch.float16, device="meta"): - model = AutoModelForCausalLM.from_config(model_config, - torch_dtype=torch.bfloat16) + model = AutoModelForCausalLM.from_config(model_config, torch_dtype=torch.bfloat16) model = model.eval() model = deepspeed.init_inference(model, config=inf_config) check_dtype(model, dtype) diff --git a/inference/test_inference.py b/tests/unit/inference/test_inference.py similarity index 84% rename from inference/test_inference.py rename to tests/unit/inference/test_inference.py index 371ecda710b29e04e42578745e4484e32f01ad15..adff8e074974e0f3263a5a8790345dec74e63238 100644 --- a/inference/test_inference.py +++ b/tests/unit/inference/test_inference.py @@ -1,4 +1,7 @@ -'''Copyright The Microsoft DeepSpeed Team''' +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team import os import time @@ -50,26 +53,17 @@ _gpt_models = [ "bigscience/bloom-560m", ] _opt_models = [ - "facebook/opt-125m", # 125m, 1.7B, ..., 175B variants have the same model architecture. - "facebook/opt-350m", # 350m applies layer norm after attnention layer which is different than other variants. 
+ "facebook/opt-125m", # 125m, 1.7B, ..., 175B variants have the same model architecture. + "facebook/opt-350m", # 350m applies layer norm after attnention layer which is different than other variants. ] _all_models = HfApi().list_models() test_models = set(_bert_models + _roberta_models + _gpt_models + _opt_models) test_tasks = [ - "fill-mask", - "question-answering", - "text-classification", - "token-classification", - "text-generation", - "text2text-generation", - "summarization", - "translation" + "fill-mask", "question-answering", "text-classification", "token-classification", "text-generation", + "text2text-generation", "summarization", "translation" ] -pytest.all_models = { - task: [m.modelId for m in _all_models if m.pipeline_tag == task] - for task in test_tasks -} +pytest.all_models = {task: [m.modelId for m in _all_models if m.pipeline_tag == task] for task in test_tasks} _model_w_tasks = itertools.product(*[test_models, test_tasks]) @@ -116,8 +110,7 @@ def invalid_model_task_config(model_w_task, dtype, enable_cuda_graph): msg = f"Not a valid model / task combination: {model} / {task}" elif enable_cuda_graph and (torch_info["cuda_version"] == "0.0"): msg = "CUDA not detected, cannot use CUDA Graph" - elif enable_cuda_graph and pkg_version.parse( - torch.__version__) < pkg_version.parse("1.10"): + elif enable_cuda_graph and pkg_version.parse(torch.__version__) < pkg_version.parse("1.10"): msg = "CUDA Graph is only available in torch versions >= 1.10" elif "gpt-j-6B" in model: if dtype != torch.half: @@ -144,16 +137,7 @@ statement for each combination of model /task @pytest.fixture def query(model_w_task): model, task = model_w_task - angle_bracket_mask_models = [ - "roberta", - "camembert", - "esm", - "ibert", - "luke", - "mpnet", - "yoso", - "mpnet" - ] + angle_bracket_mask_models = ["roberta", "camembert", "esm", "ibert", "luke", "mpnet", "yoso", "mpnet"] if task == "fill-mask": if any(map(lambda x: x in model, angle_bracket_mask_models)): @@ -208,18 +192,15 @@ def token_classification_assert(x, y): def text_generation_assert(x, y): - return set(res["generated_text"] for res in x) == set(res["generated_text"] - for res in y) + return set(res["generated_text"] for res in x) == set(res["generated_text"] for res in y) def text2text_generation_assert(x, y): - return set(res["generated_text"] for res in x) == set(res["generated_text"] - for res in y) + return set(res["generated_text"] for res in x) == set(res["generated_text"] for res in y) def translation_assert(x, y): - return set(res["translation_text"] for res in x) == set(res["translation_text"] - for res in y) + return set(res["translation_text"] for res in x) == set(res["translation_text"] for res in y) def summarization_assert(x, y): @@ -246,6 +227,7 @@ def assert_fn(model_w_task): def check_injection(model): + def verify_injection(module): for child in module.children(): if isinstance(child, nn.ModuleList): @@ -331,19 +313,11 @@ class TestModelTask(DistributedTest): @pytest.mark.seq_inference -@pytest.mark.parametrize("model_w_task", - [("EleutherAI/gpt-neo-1.3B", - "text-generation"), - ("EleutherAI/gpt-neox-20b", - "text-generation"), - ("bigscience/bloom-3b", - "text-generation"), - ("EleutherAI/gpt-j-6B", - "text-generation")], - ids=["gpt-neo", - "gpt-neox", - "bloom", - "gpt-j"]) +@pytest.mark.parametrize("model_w_task", [("EleutherAI/gpt-neo-1.3B", "text-generation"), + ("EleutherAI/gpt-neox-20b", "text-generation"), + ("bigscience/bloom-3b", "text-generation"), + ("EleutherAI/gpt-j-6B", "text-generation")], + 
ids=["gpt-neo", "gpt-neox", "bloom", "gpt-j"]) class TestMPSize(DistributedTest): world_size = 4 @@ -385,21 +359,14 @@ class TestMPSize(DistributedTest): @pytest.mark.parametrize( "model_w_task, injection_policy", [ - (("google/t5-v1_1-small", - "text2text-generation"), - { - T5Block: ('SelfAttention.o', - 'EncDecAttention.o', - 'DenseReluDense.wo') - }), - (("roberta-large", - "fill-mask"), - { - RobertaLayer: ('output.dense') - }), + (("google/t5-v1_1-small", "text2text-generation"), { + T5Block: ('SelfAttention.o', 'EncDecAttention.o', 'DenseReluDense.wo') + }), + (("roberta-large", "fill-mask"), { + RobertaLayer: ('output.dense') + }), ], - ids=["t5", - "roberta"], + ids=["t5", "roberta"], ) @pytest.mark.parametrize("dtype", [torch.float], ids=["fp32"]) @pytest.mark.parametrize("enable_cuda_graph", [False], ids=["noCG"]) @@ -446,8 +413,7 @@ class TestInjectionPolicy(DistributedTest): @pytest.mark.parametrize( "model_w_task", [ - ("Helsinki-NLP/opus-mt-en-de", - "translation"), + ("Helsinki-NLP/opus-mt-en-de", "translation"), ], ids=[ "marian", @@ -480,9 +446,7 @@ class TestAutoTensorParallelism(DistributedTest): pipe = pipeline(task, model=model, device=torch.device("cpu"), framework="pt") bs_output = pipe(query, **inf_kwargs) - pipe.model = deepspeed.init_inference(pipe.model, - mp_size=world_size, - dtype=dtype) + pipe.model = deepspeed.init_inference(pipe.model, mp_size=world_size, dtype=dtype) # Switch device to GPU so that input tensors are not on CPU pipe.device = torch.device(get_accelerator().device_name(local_rank)) ds_output = pipe(query, **inf_kwargs) @@ -496,12 +460,9 @@ class TestAutoTensorParallelism(DistributedTest): @pytest.mark.parametrize( "model_family, model_name", ( - ["gpt2", - "EleutherAI/gpt-neo-2.7B"], - ["gpt2", - "EleutherAI/gpt-j-6B"], - ["gpt2", - "gpt2-xl"], + ["gpt2", "EleutherAI/gpt-neo-2.7B"], + ["gpt2", "EleutherAI/gpt-j-6B"], + ["gpt2", "gpt2-xl"], ), ) @pytest.mark.parametrize("task", ["lambada_standard"]) @@ -522,15 +483,13 @@ class TestLMCorrectness(DistributedTest): if 'gpt-j-6B' in model_name: dtype = torch.half - lm = lm_eval.models.get_model(model_family).create_from_arg_string( - f"pretrained={model_name}", - {"device": "cpu"}) + lm = lm_eval.models.get_model(model_family).create_from_arg_string(f"pretrained={model_name}", + {"device": "cpu"}) setattr(lm, model_family, getattr(lm, model_family).half().to(device)) lm._device = device else: lm = lm_eval.models.get_model(model_family).create_from_arg_string( - f"pretrained={model_name}", - {"device": get_accelerator().device_name()}) + f"pretrained={model_name}", {"device": get_accelerator().device_name()}) get_accelerator().synchronize() start = time.time() @@ -539,8 +498,7 @@ class TestLMCorrectness(DistributedTest): bs_time = time.time() - start ds_model = deepspeed.init_inference( - getattr(lm, - model_family), + getattr(lm, model_family), mp_size=1, dtype=dtype, replace_with_kernel_inject=True, @@ -554,7 +512,6 @@ class TestLMCorrectness(DistributedTest): get_accelerator().synchronize() ds_time = time.time() - start - ppl_diff = abs(bs_output["results"][task]["ppl"] - - ds_output["results"][task]["ppl"]) + ppl_diff = abs(bs_output["results"][task]["ppl"] - ds_output["results"][task]["ppl"]) #assert ds_time <= bs_time assert ppl_diff < 0.01 diff --git a/inference/test_inference_config.py b/tests/unit/inference/test_inference_config.py similarity index 93% rename from inference/test_inference_config.py rename to tests/unit/inference/test_inference_config.py index 
e19f73ea35d6e4dea973255c90e76241a957f2f5..375563abf65baa055b2b9456acb1f098abb533c6 100644 --- a/inference/test_inference_config.py +++ b/tests/unit/inference/test_inference_config.py @@ -1,4 +1,7 @@ -'''Copyright The Microsoft DeepSpeed Team''' +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team import pytest import torch diff --git a/inference/test_model_profiling.py b/tests/unit/inference/test_model_profiling.py similarity index 72% rename from inference/test_model_profiling.py rename to tests/unit/inference/test_model_profiling.py index 07ce839306a6449be4c5b2953b99ecf99da42bb5..626bfd11f2cc8d73d573462772acafab270a9e27 100644 --- a/inference/test_model_profiling.py +++ b/tests/unit/inference/test_model_profiling.py @@ -1,4 +1,7 @@ -'''Copyright The Microsoft DeepSpeed Team''' +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team import os import time @@ -32,32 +35,19 @@ def inf_kwargs(task): @pytest.mark.inference -@pytest.mark.parametrize("model,task", - [ - ("bert-base-cased", - "fill-mask"), - ("roberta-base", - "fill-mask"), - ("gpt2", - "text-generation"), - ("facebook/opt-125m", - "text-generation"), - ("bigscience/bloom-560m", - "text-generation"), - ]) +@pytest.mark.parametrize("model,task", [ + ("bert-base-cased", "fill-mask"), + ("roberta-base", "fill-mask"), + ("gpt2", "text-generation"), + ("facebook/opt-125m", "text-generation"), + ("bigscience/bloom-560m", "text-generation"), +]) @pytest.mark.parametrize("cuda_graphs", [True, False]) @pytest.mark.parametrize("use_cuda_events", [True, False]) class TestModelProfiling(DistributedTest): world_size = 1 - def test(self, - model, - task, - query, - inf_kwargs, - cuda_graphs, - use_cuda_events, - dtype=torch.float16): + def test(self, model, task, query, inf_kwargs, cuda_graphs, use_cuda_events, dtype=torch.float16): if cuda_graphs and "bert" not in model: pytest.skip(f"CUDA Graph not supported for {model}") diff --git a/tests/unit/launcher/test_ds_arguments.py b/tests/unit/launcher/test_ds_arguments.py index 9d7af74f2c57706d547c478f7e1e4ea2e10dc6ee..7155beebc9023e5ae41459bd43207243b327cbed 100644 --- a/tests/unit/launcher/test_ds_arguments.py +++ b/tests/unit/launcher/test_ds_arguments.py @@ -1,8 +1,12 @@ -'''Copyright The Microsoft DeepSpeed Team''' +# Copyright (c) Microsoft Corporation. 
+# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team import argparse import pytest import deepspeed +from deepspeed.launcher.launch import parse_range_list def basic_parser(): @@ -82,12 +86,7 @@ def test_no_ds_parser(): def test_core_deepscale_arguments(): parser = basic_parser() parser = deepspeed.add_config_arguments(parser) - args = parser.parse_args( - ['--num_epochs', - '2', - '--deepspeed', - '--deepspeed_config', - 'foo.json']) + args = parser.parse_args(['--num_epochs', '2', '--deepspeed', '--deepspeed_config', 'foo.json']) assert args assert hasattr(args, 'num_epochs') @@ -100,3 +99,35 @@ def test_core_deepscale_arguments(): assert hasattr(args, 'deepspeed_config') assert type(args.deepspeed_config) == str assert args.deepspeed_config == 'foo.json' + + +def test_core_binding_arguments(): + core_list = parse_range_list("0,2-4,6,8-9") + assert core_list == [0, 2, 3, 4, 6, 8, 9] + + try: + # negative case for range overlapping + core_list = parse_range_list("0,2-6,5-9") + except ValueError as e: + pass + else: + # invalid core list must fail + assert False + + try: + # negative case for reverse order -- case 1 + core_list = parse_range_list("8,2-6") + except ValueError as e: + pass + else: + # invalid core list must fail + assert False + + try: + # negative case for reverse order -- case 2 + core_list = parse_range_list("1,6-2") + except ValueError as e: + pass + else: + # invalid core list must fail + assert False diff --git a/tests/unit/launcher/test_multinode_runner.py b/tests/unit/launcher/test_multinode_runner.py index 27233d7150dcadf6720f8b93aa9afe140dfcedce..743fffd8426fdcddc23520d7dc6bbbf5d750eca0 100644 --- a/tests/unit/launcher/test_multinode_runner.py +++ b/tests/unit/launcher/test_multinode_runner.py @@ -1,4 +1,7 @@ -'''Copyright The Microsoft DeepSpeed Team''' +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team from copy import deepcopy from deepspeed.launcher import multinode_runner as mnrunner diff --git a/tests/unit/launcher/test_run.py b/tests/unit/launcher/test_run.py index 1d7f4efc6815e83d513034ed86cd2e5ad349b15e..6540ebcf598ca836e0bda37dc9649535de0cfd39 100644 --- a/tests/unit/launcher/test_run.py +++ b/tests/unit/launcher/test_run.py @@ -1,4 +1,7 @@ -'''Copyright The Microsoft DeepSpeed Team''' +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team import pytest diff --git a/tests/unit/megatron_model.py b/tests/unit/megatron_model.py index 32faf224494027dd1f58fbe2ec6273d072e26657..011ebaf4d3b976f937234784ddad82e9a91a9957 100644 --- a/tests/unit/megatron_model.py +++ b/tests/unit/megatron_model.py @@ -1,4 +1,7 @@ -'''Copyright The Microsoft DeepSpeed Team''' +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team import torch import os @@ -31,12 +34,7 @@ def get_gpt2_model(args_others, mp_size=1): args_defaults.update(args_others) # setting "make-vocab-size-divisible-by" to avoid word-embedding size change in resizing testing. 
- sys.argv.extend([ - '--model-parallel-size', - str(mp_size), - '--make-vocab-size-divisible-by', - str(1) - ]) + sys.argv.extend(['--model-parallel-size', str(mp_size), '--make-vocab-size-divisible-by', str(1)]) initialize_megatron(args_defaults=args_defaults, ignore_unknown_args=True) model = GPT2Model(num_tokentypes=0, parallel_output=False) @@ -44,15 +42,13 @@ def get_gpt2_model(args_others, mp_size=1): from torch.nn.parallel.distributed import DistributedDataParallel as torchDDP from megatron import mpu i = get_accelerator().current_device_name() - model = torchDDP(model, - device_ids=[i], - output_device=i, - process_group=mpu.get_data_parallel_group()) + model = torchDDP(model, device_ids=[i], output_device=i, process_group=mpu.get_data_parallel_group()) return model class MockGPT2ModelPipe(PipelineModule): + def __init__(self, num_layers, mp_size, args_others, topo, **kwargs): from megatron.initialize import initialize_megatron @@ -65,38 +61,25 @@ class MockGPT2ModelPipe(PipelineModule): args_defaults.update(args_others) # setting "make-vocab-size-divisible-by" to avoid word-embedding size change in resizing testing. - sys.argv.extend([ - '--model-parallel-size', - str(mp_size), - '--make-vocab-size-divisible-by', - str(1) - ]) + sys.argv.extend(['--model-parallel-size', str(mp_size), '--make-vocab-size-divisible-by', str(1)]) initialize_megatron(args_defaults=args_defaults, ignore_unknown_args=True) from megatron.model.transformer import ParallelTransformerLayer class ParallelTransformerLayerPipe(ParallelTransformerLayer): + def forward(self, args): # hardcode attn mask for testing, PP requires the attn_mask to be stashed - attention_mask = torch.tensor( - [[True]], - device=get_accelerator().current_device_name()) + attention_mask = torch.tensor([[True]], device=get_accelerator().current_device_name()) return super().forward(args, attention_mask) layers = [] for x in range(num_layers): layers.append( - LayerSpec(ParallelTransformerLayerPipe, - self.gpt2_attention_mask_func, - self.init_method_normal(0.02), - self.scaled_init_method_normal(0.02, - num_layers), - x)) - super().__init__(layers=layers, - loss_fn=torch.nn.CrossEntropyLoss(), - topology=topo, - **kwargs) + LayerSpec(ParallelTransformerLayerPipe, self.gpt2_attention_mask_func, self.init_method_normal(0.02), + self.scaled_init_method_normal(0.02, num_layers), x)) + super().__init__(layers=layers, loss_fn=torch.nn.CrossEntropyLoss(), topology=topo, **kwargs) def gpt2_attention_mask_func(self, attention_scores, ltor_mask): attention_scores.masked_fill_(ltor_mask, -10000.0) @@ -104,6 +87,7 @@ class MockGPT2ModelPipe(PipelineModule): def init_method_normal(self, sigma): """Init method based on N(0, sigma).""" + def init_(tensor): return torch.nn.init.normal_(tensor, mean=0.0, std=sigma) diff --git a/tests/unit/model_parallelism/test_configurable_parallel_mp.py b/tests/unit/model_parallelism/test_configurable_parallel_mp.py index d17f45c0b526afc11042936beee5ce90021a550f..7fa9d849a746b4fe7fdba792c85d8e199b6f0170 100644 --- a/tests/unit/model_parallelism/test_configurable_parallel_mp.py +++ b/tests/unit/model_parallelism/test_configurable_parallel_mp.py @@ -1,4 +1,7 @@ -'''Copyright The Microsoft DeepSpeed Team''' +# Copyright (c) Microsoft Corporation. 
+# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team import os import torch @@ -10,12 +13,12 @@ import deepspeed.comm as dist from deepspeed.accelerator import get_accelerator from unit.common import DistributedTest, DistributedFixture from unit.megatron_model import get_gpt2_model, get_megatron_version +from unit.util import required_minimum_torch_version, required_maximum_torch_version -TORCH_MAJOR = int(torch.__version__.split('.')[0]) -TORCH_MINOR = int(torch.__version__.split('.')[1]) -pytestmark = pytest.mark.skipif( - TORCH_MAJOR < 1 or (TORCH_MAJOR == 1 and TORCH_MINOR < 5), - reason='Megatron-LM package requires Pytorch version 1.5 or above') +pytestmark = pytest.mark.skipif(not required_minimum_torch_version(major_version=1, minor_version=5), + reason='Megatron-LM package requires Pytorch version 1.5 or above') +pytestmark = pytest.mark.skipif(not required_maximum_torch_version(major_version=1, minor_version=13), + reason='Megatron-LM package requires Pytorch version 1.13 or below') def get_deepspeed_model(model): @@ -30,14 +33,15 @@ def get_deepspeed_model(model): } from megatron import mpu - model, _, _,_ = deepspeed.initialize(model=model, - mpu=mpu, - model_parameters=model.parameters(), - config=ds_config_dict) + model, _, _, _ = deepspeed.initialize(model=model, + mpu=mpu, + model_parameters=model.parameters(), + config=ds_config_dict) return model class ConfigurableMP(DistributedTest): + @pytest.fixture(autouse=True) def reset_random(self, seed=1234): random.seed(seed) @@ -49,16 +53,14 @@ class ConfigurableMP(DistributedTest): def inputs(self, bs=1, seq_len=20): input_ids = torch.randint(low=0, high=1000, size=(bs, seq_len)) position_ids = torch.randint(low=0, high=2, size=(bs, seq_len)) - attention_mask = torch.randint(low=0, - high=2, - size=(bs, - seq_len), - dtype=torch.bool) + attention_mask = torch.randint(low=0, high=2, size=(bs, seq_len), dtype=torch.bool) return [input_ids, position_ids, attention_mask] class TestConfigurableMP(ConfigurableMP): + @pytest.mark.world_size(1) + @pytest.mark.skip(reason="megatron-lm is currently broken so this test cannot be run.") def test_gpt2_basic(self, tmpdir, inputs): args_defaults = { 'num_layers': 2, @@ -72,24 +74,21 @@ class TestConfigurableMP(ConfigurableMP): model.eval() device_name = get_accelerator().device_name() - baseline = model(inputs[0].to(device_name), - inputs[1].to(device_name), - inputs[2].to(device_name)) + baseline = model(inputs[0].to(device_name), inputs[1].to(device_name), inputs[2].to(device_name)) tag = 'mp_1' state_dict = {} state_dict['checkpoint_version'] = get_megatron_version() model.save_checkpoint(tmpdir, tag=tag, client_state=state_dict) dist.barrier() - model.load_checkpoint(tmpdir, - tag=tag, - load_optimizer_states=False, - load_lr_scheduler_states=False) + model.load_checkpoint(tmpdir, tag=tag, load_optimizer_states=False, load_lr_scheduler_states=False) test = model(inputs[0], inputs[1], inputs[2]) - assert torch.allclose(baseline, test, atol=1e-07), f"Baseline output {baseline} is not equal to save-then-load output {test}" + assert torch.allclose(baseline, test, + atol=1e-07), f"Baseline output {baseline} is not equal to save-then-load output {test}" @pytest.mark.world_size(2) + @pytest.mark.skip(reason="megatron-lm is currently broken so this test cannot be run.") def test_gpt2_mp2_no_resize(self, tmpdir, inputs): args_defaults = { 'num_layers': 2, @@ -104,25 +103,19 @@ class TestConfigurableMP(ConfigurableMP): model.eval() device_name = get_accelerator().device_name() - baseline = 
model(inputs[0].to(device_name), - inputs[1].to(device_name), - inputs[2].to(device_name)) + baseline = model(inputs[0].to(device_name), inputs[1].to(device_name), inputs[2].to(device_name)) tag = 'mp_2' state_dict = {} state_dict['checkpoint_version'] = get_megatron_version() model.save_checkpoint(tmpdir, tag=tag, client_state=state_dict) dist.barrier() - model.load_checkpoint(tmpdir, - tag=tag, - load_optimizer_states=False, - load_lr_scheduler_states=False) + model.load_checkpoint(tmpdir, tag=tag, load_optimizer_states=False, load_lr_scheduler_states=False) device_name = get_accelerator().device_name() - test = model(inputs[0].to(device_name), - inputs[1].to(device_name), - inputs[2].to(device_name)) - assert torch.allclose(baseline, test, rtol=1.0, atol=1e-07), f"Baseline output {baseline} is not equal to save-then-load output {test}" + test = model(inputs[0].to(device_name), inputs[1].to(device_name), inputs[2].to(device_name)) + assert torch.allclose(baseline, test, rtol=1.0, + atol=1e-07), f"Baseline output {baseline} is not equal to save-then-load output {test}" # This fixture provides the baseline model with mp=2 to TestConfigurableMPResize @@ -144,9 +137,7 @@ class baseline_mp2(DistributedFixture): with torch.no_grad(): device_name = get_accelerator().device_name() - baseline = model(inputs[0].to(device_name), - inputs[1].to(device_name), - inputs[2].to(device_name)) + baseline = model(inputs[0].to(device_name), inputs[1].to(device_name), inputs[2].to(device_name)) if dist.get_rank() == 0: save_path = os.path.join(class_tmpdir, "output.pt") torch.save(baseline.cpu(), save_path) @@ -159,6 +150,7 @@ class baseline_mp2(DistributedFixture): class TestConfigurableResizeMP(ConfigurableMP): world_size = [1, 4] + @pytest.mark.skip(reason="megatron-lm is currently broken so this test cannot be run.") def test(self, baseline_mp2, inputs, class_tmpdir): args_defaults = { 'num_layers': 2, @@ -174,15 +166,13 @@ class TestConfigurableResizeMP(ConfigurableMP): model.eval() with torch.no_grad(): - model.load_checkpoint(class_tmpdir, - load_optimizer_states=False, - load_lr_scheduler_states=False) + model.load_checkpoint(class_tmpdir, load_optimizer_states=False, load_lr_scheduler_states=False) device_name = get_accelerator().device_name() - test = model(inputs[0].to(device_name), - inputs[1].to(device_name), - inputs[2].to(device_name)) + test = model(inputs[0].to(device_name), inputs[1].to(device_name), inputs[2].to(device_name)) if dist.get_rank() == 0: load_path = os.path.join(class_tmpdir, "output.pt") baseline = torch.load(load_path) test = test.cpu() - assert torch.allclose(baseline, test, atol=1e-03), f"Baseline output {baseline} is not equal to save-then-load output {test}" + assert torch.allclose( + baseline, test, + atol=1e-03), f"Baseline output {baseline} is not equal to save-then-load output {test}" diff --git a/tests/unit/model_parallelism/test_configurable_parallel_pp.py b/tests/unit/model_parallelism/test_configurable_parallel_pp.py index af091d68c411a47cd915577b06f4dda5dd37b461..c059a0563d9b846d0432c3ec2f8c8223dced8a3c 100644 --- a/tests/unit/model_parallelism/test_configurable_parallel_pp.py +++ b/tests/unit/model_parallelism/test_configurable_parallel_pp.py @@ -1,4 +1,7 @@ -'''Copyright The Microsoft DeepSpeed Team''' +# Copyright (c) Microsoft Corporation. 
+# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team import os import torch @@ -12,12 +15,12 @@ from unit.megatron_model import get_megatron_version from unit.megatron_model import MockGPT2ModelPipe as GPT2ModelPipe from deepspeed.utils import RepeatingLoader from deepspeed.accelerator import get_accelerator +from unit.util import required_minimum_torch_version, required_maximum_torch_version -TORCH_MAJOR = int(torch.__version__.split('.')[0]) -TORCH_MINOR = int(torch.__version__.split('.')[1]) -pytestmark = pytest.mark.skipif( - TORCH_MAJOR < 1 or (TORCH_MAJOR == 1 and TORCH_MINOR < 5), - reason='Megatron-LM package requires Pytorch version 1.5 or above') +pytestmark = pytest.mark.skipif(not required_minimum_torch_version(major_version=1, minor_version=5), + reason='Megatron-LM package requires Pytorch version 1.5 or above') +pytestmark = pytest.mark.skipif(not required_maximum_torch_version(major_version=1, minor_version=13), + reason='Megatron-LM package requires Pytorch version 1.13 or below') def get_deepspeed_model(model): @@ -31,9 +34,7 @@ def get_deepspeed_model(model): }, } - model, _, _,_ = deepspeed.initialize(model=model, - model_parameters=model.parameters(), - config=ds_config_dict) + model, _, _, _ = deepspeed.initialize(model=model, model_parameters=model.parameters(), config=ds_config_dict) return model.to(get_accelerator().device_name()) @@ -48,6 +49,7 @@ def get_topology(mp, pp, world_size): class ConfigurablePP(DistributedTest): + @pytest.fixture(autouse=True) def reset_random(self, seed=1234): random.seed(seed) @@ -58,11 +60,7 @@ class ConfigurablePP(DistributedTest): @pytest.fixture def inputs(self, bs=1, seq_len=1, hidden_size=128): hidden_states = torch.randn(bs, seq_len, hidden_size) - attention_mask = torch.randint(low=0, - high=2, - size=(bs, - seq_len), - dtype=torch.bool) + attention_mask = torch.randint(low=0, high=2, size=(bs, seq_len), dtype=torch.bool) return (hidden_states, attention_mask) @@ -71,6 +69,7 @@ class TestConfigurablePP(ConfigurablePP): pp_size = 2 world_size = 4 # mp_size * pp_size + @pytest.mark.skip(reason="megatron-lm is currently broken so this test cannot be run.") def test_pp_basic(self, inputs, tmpdir): # basic test case, mp_size=2, pp_size=2, verify ckpt saving/loading. 
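Note on the torch-version guards introduced in both model-parallelism test modules above: the hand-rolled TORCH_MAJOR/TORCH_MINOR check is replaced by required_minimum_torch_version / required_maximum_torch_version from unit.util. Assigning pytestmark twice, as in these hunks, leaves only the second skipif in effect; pytest applies both conditions when the markers are collected into a list. A sketch of the combined form, with assumed helper bodies (the real unit.util implementation may differ):

from packaging import version

import pytest
import torch


def required_minimum_torch_version(major_version, minor_version):
    # assumed behavior: installed torch must be at least major.minor
    return version.parse(torch.__version__) >= version.parse(f"{major_version}.{minor_version}")


def required_maximum_torch_version(major_version, minor_version):
    # assumed behavior: installed torch must be below the next minor release,
    # so e.g. 1.13.1 still satisfies a 1.13 cap
    return version.parse(torch.__version__) < version.parse(f"{major_version}.{minor_version + 1}")


pytestmark = [
    pytest.mark.skipif(not required_minimum_torch_version(major_version=1, minor_version=5),
                       reason='Megatron-LM package requires Pytorch version 1.5 or above'),
    pytest.mark.skipif(not required_maximum_torch_version(major_version=1, minor_version=13),
                       reason='Megatron-LM package requires Pytorch version 1.13 or below'),
]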
args_defaults = { @@ -102,20 +101,13 @@ class TestConfigurablePP(ConfigurablePP): else: data_iter = None - baseline = model.eval_batch(data_iter=data_iter, - compute_loss=False, - reduce_output=None) + baseline = model.eval_batch(data_iter=data_iter, compute_loss=False, reduce_output=None) dist.barrier() - model.load_checkpoint(tmpdir, - tag=tag, - load_optimizer_states=False, - load_lr_scheduler_states=False) + model.load_checkpoint(tmpdir, tag=tag, load_optimizer_states=False, load_lr_scheduler_states=False) dist.barrier() - test = model.eval_batch(data_iter=data_iter, - compute_loss=False, - reduce_output=None) + test = model.eval_batch(data_iter=data_iter, compute_loss=False, reduce_output=None) if test is not None: assert len(baseline) == len(test) @@ -123,7 +115,9 @@ class TestConfigurablePP(ConfigurablePP): for mb in range(len(baseline)): for b, t in zip(baseline[mb], test[mb]): if b.is_floating_point(): # don't compare masks - assert torch.allclose(b, t, atol=1e-07), f"Baseline output {baseline} is not equal to save-then-load output {test}" + assert torch.allclose( + b, t, + atol=1e-07), f"Baseline output {baseline} is not equal to save-then-load output {test}" # Fixture for defining the checkpoint path since all tests in @@ -139,7 +133,8 @@ class _baseline(DistributedFixture): world_size = None def run(self, inputs, class_tmpdir, checkpoint_tag, mp_size, pp_size): - assert int(os.environ["WORLD_SIZE"]) == (pp_size * mp_size), "world size does not match provided pp_size and mp_size" + assert int(os.environ["WORLD_SIZE"]) == (pp_size * + mp_size), "world size does not match provided pp_size and mp_size" args_defaults = { 'num_layers': 8, 'hidden_size': 128, @@ -163,9 +158,7 @@ class _baseline(DistributedFixture): else: data_iter = None - baseline = model.eval_batch(data_iter=data_iter, - compute_loss=False, - reduce_output=None) + baseline = model.eval_batch(data_iter=data_iter, compute_loss=False, reduce_output=None) if baseline is not None: # baseline should be [[hidden, True]]] @@ -177,9 +170,7 @@ class _baseline(DistributedFixture): state_dict = {} state_dict['checkpoint_version'] = get_megatron_version() - model.save_checkpoint(class_tmpdir, - tag=checkpoint_tag, - client_state=state_dict) + model.save_checkpoint(class_tmpdir, tag=checkpoint_tag, client_state=state_dict) # This may look odd, but there is a limitation with DistributedFixture that @@ -198,14 +189,8 @@ class baseline_ws4(_baseline): class TestConfigurableResizePP(ConfigurablePP): - def _test(self, - inputs, - class_tmpdir, - checkpoint_tag, - mp_size, - pp_size, - mp_resize, - pp_resize): + + def _test(self, inputs, class_tmpdir, checkpoint_tag, mp_size, pp_size, mp_resize, pp_resize): args_defaults = { 'num_layers': 8, 'hidden_size': 128, @@ -233,9 +218,7 @@ class TestConfigurableResizePP(ConfigurablePP): else: data_iter = None - test = model.eval_batch(data_iter=data_iter, - compute_loss=False, - reduce_output=None) + test = model.eval_batch(data_iter=data_iter, compute_loss=False, reduce_output=None) if test is not None: # test should be [[hidden, True]]] @@ -245,108 +228,42 @@ class TestConfigurableResizePP(ConfigurablePP): test = test[0][0].cpu() load_path = os.path.join(class_tmpdir, f"output-{checkpoint_tag}.pt") baseline = torch.load(load_path) - assert torch.allclose(baseline, test, atol=1e-03), f"Baseline output {baseline} is not equal to save-then-load output {test}" + assert torch.allclose( + baseline, test, + atol=1e-03), f"Baseline output {baseline} is not equal to save-then-load output {test}" # 
These tests are divided by baseline model worldsize and test model worldsize @pytest.mark.world_size(1) @pytest.mark.parametrize("mp_size, pp_size, mp_resize, pp_resize", [(1, 2, 1, 1)]) - def test_world_size_2to1(self, - inputs, - class_tmpdir, - checkpoint_tag, - baseline_ws2, - mp_size, - pp_size, - mp_resize, + @pytest.mark.skip(reason="megatron-lm is currently broken so this test cannot be run.") + def test_world_size_2to1(self, inputs, class_tmpdir, checkpoint_tag, baseline_ws2, mp_size, pp_size, mp_resize, pp_resize): - self._test(inputs, - class_tmpdir, - checkpoint_tag, - mp_size, - pp_size, - mp_resize, - pp_resize) + self._test(inputs, class_tmpdir, checkpoint_tag, mp_size, pp_size, mp_resize, pp_resize) @pytest.mark.world_size(1) @pytest.mark.parametrize("mp_size, pp_size, mp_resize, pp_resize", [(2, 2, 1, 1)]) - def test_world_size_4to1(self, - inputs, - class_tmpdir, - checkpoint_tag, - baseline_ws4, - mp_size, - pp_size, - mp_resize, + @pytest.mark.skip(reason="megatron-lm is currently broken so this test cannot be run.") + def test_world_size_4to1(self, inputs, class_tmpdir, checkpoint_tag, baseline_ws4, mp_size, pp_size, mp_resize, pp_resize): - self._test(inputs, - class_tmpdir, - checkpoint_tag, - mp_size, - pp_size, - mp_resize, - pp_resize) + self._test(inputs, class_tmpdir, checkpoint_tag, mp_size, pp_size, mp_resize, pp_resize) @pytest.mark.world_size(2) @pytest.mark.parametrize("mp_size, pp_size, mp_resize, pp_resize", [(2, 2, 2, 1)]) - def test_world_size_4to2(self, - inputs, - class_tmpdir, - checkpoint_tag, - baseline_ws4, - mp_size, - pp_size, - mp_resize, + @pytest.mark.skip(reason="megatron-lm is currently broken so this test cannot be run.") + def test_world_size_4to2(self, inputs, class_tmpdir, checkpoint_tag, baseline_ws4, mp_size, pp_size, mp_resize, pp_resize): - self._test(inputs, - class_tmpdir, - checkpoint_tag, - mp_size, - pp_size, - mp_resize, - pp_resize) + self._test(inputs, class_tmpdir, checkpoint_tag, mp_size, pp_size, mp_resize, pp_resize) @pytest.mark.world_size(4) @pytest.mark.parametrize("mp_size, pp_size, mp_resize, pp_resize", [(1, 1, 2, 2)]) - def test_world_size_1to4(self, - inputs, - class_tmpdir, - checkpoint_tag, - baseline_ws1, - mp_size, - pp_size, - mp_resize, + @pytest.mark.skip(reason="megatron-lm is currently broken so this test cannot be run.") + def test_world_size_1to4(self, inputs, class_tmpdir, checkpoint_tag, baseline_ws1, mp_size, pp_size, mp_resize, pp_resize): - self._test(inputs, - class_tmpdir, - checkpoint_tag, - mp_size, - pp_size, - mp_resize, - pp_resize) + self._test(inputs, class_tmpdir, checkpoint_tag, mp_size, pp_size, mp_resize, pp_resize) @pytest.mark.world_size(4) - @pytest.mark.parametrize("mp_size, pp_size, mp_resize, pp_resize", - [(1, - 2, - 1, - 4), - (2, - 1, - 2, - 2)]) - def test_world_size_2to4(self, - inputs, - class_tmpdir, - checkpoint_tag, - baseline_ws2, - mp_size, - pp_size, - mp_resize, + @pytest.mark.parametrize("mp_size, pp_size, mp_resize, pp_resize", [(1, 2, 1, 4), (2, 1, 2, 2)]) + @pytest.mark.skip(reason="megatron-lm is currently broken so this test cannot be run.") + def test_world_size_2to4(self, inputs, class_tmpdir, checkpoint_tag, baseline_ws2, mp_size, pp_size, mp_resize, pp_resize): - self._test(inputs, - class_tmpdir, - checkpoint_tag, - mp_size, - pp_size, - mp_resize, - pp_resize) + self._test(inputs, class_tmpdir, checkpoint_tag, mp_size, pp_size, mp_resize, pp_resize) diff --git a/tests/unit/modeling.py b/tests/unit/modeling.py index 
50846b53265638eadacb22dbed2564c422a3b89e..7930fdafe541a47a09324bcde96e093bf4acfc2c 100644 --- a/tests/unit/modeling.py +++ b/tests/unit/modeling.py @@ -1,3 +1,8 @@ +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + from __future__ import absolute_import, division, print_function, unicode_literals # Copyright The Microsoft DeepSpeed Team # DeepSpeed note, code taken from commit 3d59216cec89a363649b4fe3d15295ba936ced0f @@ -48,20 +53,15 @@ from deepspeed.accelerator import get_accelerator logger = logging.getLogger(__name__) PRETRAINED_MODEL_ARCHIVE_MAP = { - 'bert-base-uncased': - "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased.tar.gz", - 'bert-large-uncased': - "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased.tar.gz", - 'bert-base-cased': - "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased.tar.gz", - 'bert-large-cased': - "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased.tar.gz", + 'bert-base-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased.tar.gz", + 'bert-large-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased.tar.gz", + 'bert-base-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased.tar.gz", + 'bert-large-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased.tar.gz", 'bert-base-multilingual-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-uncased.tar.gz", 'bert-base-multilingual-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-cased.tar.gz", - 'bert-base-chinese': - "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-chinese.tar.gz", + 'bert-base-chinese': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-chinese.tar.gz", } CONFIG_NAME = 'bert_config.json' WEIGHTS_NAME = 'pytorch_model.bin' @@ -76,9 +76,8 @@ def load_tf_weights_in_bert(model, tf_checkpoint_path): import numpy as np import tensorflow as tf except ImportError: - print( - "Loading a TensorFlow models in PyTorch, requires TensorFlow to be installed. Please see " - "https://www.tensorflow.org/install/ for installation instructions.") + print("Loading a TensorFlow models in PyTorch, requires TensorFlow to be installed. 
Please see " + "https://www.tensorflow.org/install/ for installation instructions.") raise tf_path = os.path.abspath(tf_checkpoint_path) print("Converting TensorFlow checkpoint from {}".format(tf_path)) @@ -183,6 +182,7 @@ ACT2FN = {"gelu": gelu, "relu": torch.nn.functional.relu, "swish": swish} class GPUTimer: + def __init__(self): super().__init__() self.start = get_accelerator().Event() # noqa: F821 @@ -202,13 +202,7 @@ class LinearActivation(Module): """ __constants__ = ['bias'] - def __init__(self, - in_features, - out_features, - weights, - biases, - act='gelu', - bias=True): + def __init__(self, in_features, out_features, weights, biases, act='gelu', bias=True): super(LinearActivation, self).__init__() self.in_features = in_features self.out_features = out_features @@ -256,15 +250,14 @@ class LinearActivation(Module): return self.act_fn(F.linear(input, self.weight, self.bias)) def extra_repr(self): - return 'in_features={}, out_features={}, bias={}'.format( - self.in_features, - self.out_features, - self.bias is not None) + return 'in_features={}, out_features={}, bias={}'.format(self.in_features, self.out_features, self.bias + is not None) class BertConfig(object): """Configuration class to store the configuration of a `BertModel`. """ + def __init__(self, vocab_size_or_config_json_file, hidden_size=768, @@ -360,15 +353,11 @@ try: import apex.normalization #apex.amp.register_float_function(apex.normalization.FusedLayerNorm, 'forward') BertLayerNorm = apex.normalization.FusedLayerNorm -#aiss debug - #from torch.nn.modules import LayerNorm as BertLayerNorm - except ImportError: - print( - "Better speed can be achieved with apex installed from https://www.github.com/nvidia/apex." - ) + print("Better speed can be achieved with apex installed from https://www.github.com/nvidia/apex.") class BertLayerNorm(nn.Module): + def __init__(self, hidden_size, eps=1e-12): """Construct a layernorm module in the TF style (epsilon inside the square root). """ @@ -387,13 +376,12 @@ except ImportError: class BertEmbeddings(nn.Module): """Construct the embeddings from word, position and token_type embeddings. 
""" + def __init__(self, config): super(BertEmbeddings, self).__init__() self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size) - self.position_embeddings = nn.Embedding(config.max_position_embeddings, - config.hidden_size) - self.token_type_embeddings = nn.Embedding(config.type_vocab_size, - config.hidden_size) + self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.hidden_size) + self.token_type_embeddings = nn.Embedding(config.type_vocab_size, config.hidden_size) # self.LayerNorm is not snake-cased to stick with TensorFlow model variable name and be able to load # any TensorFlow checkpoint file @@ -402,9 +390,7 @@ class BertEmbeddings(nn.Module): def forward(self, input_ids, token_type_ids=None): seq_length = input_ids.size(1) - position_ids = torch.arange(seq_length, - dtype=torch.long, - device=input_ids.device) + position_ids = torch.arange(seq_length, dtype=torch.long, device=input_ids.device) position_ids = position_ids.unsqueeze(0).expand_as(input_ids) if token_type_ids is None: token_type_ids = torch.zeros_like(input_ids) @@ -420,13 +406,12 @@ class BertEmbeddings(nn.Module): class BertSelfAttention(nn.Module): + def __init__(self, i, config, weights, biases): super(BertSelfAttention, self).__init__() if config.hidden_size % config.num_attention_heads != 0: - raise ValueError( - "The hidden size (%d) is not a multiple of the number of attention " - "heads (%d)" % (config.hidden_size, - config.num_attention_heads)) + raise ValueError("The hidden size (%d) is not a multiple of the number of attention " + "heads (%d)" % (config.hidden_size, config.num_attention_heads)) self.num_attention_heads = config.num_attention_heads self.attention_head_size = int(config.hidden_size / config.num_attention_heads) self.all_head_size = self.num_attention_heads * self.attention_head_size @@ -454,14 +439,12 @@ class BertSelfAttention(nn.Module): #self.softmax = DeepSpeedSoftmax(i, self.softmax_config) def transpose_for_scores(self, x): - new_x_shape = x.size()[:-1] + (self.num_attention_heads, - self.attention_head_size) + new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size) x = x.view(*new_x_shape) return x.permute(0, 2, 1, 3) def transpose_key_for_scores(self, x): - new_x_shape = x.size()[:-1] + (self.num_attention_heads, - self.attention_head_size) + new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size) x = x.view(*new_x_shape) return x.permute(0, 2, 3, 1) @@ -497,6 +480,7 @@ class BertSelfAttention(nn.Module): class BertSelfOutput(nn.Module): + def __init__(self, config, weights, biases): super(BertSelfOutput, self).__init__() self.dense = nn.Linear(config.hidden_size, config.hidden_size) @@ -516,6 +500,7 @@ class BertSelfOutput(nn.Module): class BertAttention(nn.Module): + def __init__(self, i, config, weights, biases): super(BertAttention, self).__init__() self.self = BertSelfAttention(i, config, weights, biases) @@ -531,6 +516,7 @@ class BertAttention(nn.Module): class BertIntermediate(nn.Module): + def __init__(self, config, weights, biases): super(BertIntermediate, self).__init__() self.dense_act = LinearActivation(config.hidden_size, @@ -545,6 +531,7 @@ class BertIntermediate(nn.Module): class BertOutput(nn.Module): + def __init__(self, config, weights, biases): super(BertOutput, self).__init__() self.dense = nn.Linear(config.intermediate_size, config.hidden_size) @@ -561,6 +548,7 @@ class BertOutput(nn.Module): class BertLayer(nn.Module): + def __init__(self, i, config, weights, 
biases): super(BertLayer, self).__init__() self.attention = BertAttention(i, config, weights, biases) @@ -583,26 +571,14 @@ class BertLayer(nn.Module): self.biases[2].register_hook(lambda x, self=self: grads.append([x, "V_B"])) self.weight[3].register_hook(lambda x, self=self: grads.append([x, "O_W"])) self.biases[3].register_hook(lambda x, self=self: grads.append([x, "O_B"])) - self.attention.output.LayerNorm.weight.register_hook( - lambda x, - self=self: grads.append([x, - "N2_W"])) - self.attention.output.LayerNorm.bias.register_hook( - lambda x, - self=self: grads.append([x, - "N2_B"])) + self.attention.output.LayerNorm.weight.register_hook(lambda x, self=self: grads.append([x, "N2_W"])) + self.attention.output.LayerNorm.bias.register_hook(lambda x, self=self: grads.append([x, "N2_B"])) self.weight[5].register_hook(lambda x, self=self: grads.append([x, "int_W"])) self.biases[5].register_hook(lambda x, self=self: grads.append([x, "int_B"])) self.weight[6].register_hook(lambda x, self=self: grads.append([x, "out_W"])) self.biases[6].register_hook(lambda x, self=self: grads.append([x, "out_B"])) - self.output.LayerNorm.weight.register_hook( - lambda x, - self=self: grads.append([x, - "norm_W"])) - self.output.LayerNorm.bias.register_hook( - lambda x, - self=self: grads.append([x, - "norm_B"])) + self.output.LayerNorm.weight.register_hook(lambda x, self=self: grads.append([x, "norm_W"])) + self.output.LayerNorm.bias.register_hook(lambda x, self=self: grads.append([x, "norm_B"])) return layer_output @@ -611,17 +587,14 @@ class BertLayer(nn.Module): class BertEncoder(nn.Module): + def __init__(self, config, weights, biases): super(BertEncoder, self).__init__() #layer = BertLayer(config, weights, biases) self.FinalLayerNorm = BertLayerNorm(config.hidden_size, eps=1e-12) - self.layer = nn.ModuleList([ - copy.deepcopy(BertLayer(i, - config, - weights, - biases)) for i in range(config.num_hidden_layers) - ]) + self.layer = nn.ModuleList( + [copy.deepcopy(BertLayer(i, config, weights, biases)) for i in range(config.num_hidden_layers)]) self.grads = [] self.graph = [] @@ -643,14 +616,11 @@ class BertEncoder(nn.Module): self.graph.append(mdl) self.get_modules(self, mdl, input) - def forward(self, - hidden_states, - attention_mask, - output_all_encoded_layers=True, - checkpoint_activations=False): + def forward(self, hidden_states, attention_mask, output_all_encoded_layers=True, checkpoint_activations=False): all_encoder_layers = [] def custom(start, end): + def custom_forward(*inputs): layers = self.layer[start:end] x_ = inputs[0] @@ -665,23 +635,13 @@ class BertEncoder(nn.Module): num_layers = len(self.layer) chunk_length = math.ceil(math.sqrt(num_layers)) while l < num_layers: - hidden_states = checkpoint.checkpoint(custom(l, - l + chunk_length), - hidden_states, - attention_mask * 1) + hidden_states = checkpoint.checkpoint(custom(l, l + chunk_length), hidden_states, attention_mask * 1) l += chunk_length # decoder layers else: for i, layer_module in enumerate(self.layer): - hidden_states = layer_module(hidden_states, - attention_mask, - self.grads, - collect_all_grads=True) - hidden_states.register_hook( - lambda x, - i=i, - self=self: self.grads.append([x, - "hidden_state"])) + hidden_states = layer_module(hidden_states, attention_mask, self.grads, collect_all_grads=True) + hidden_states.register_hook(lambda x, i=i, self=self: self.grads.append([x, "hidden_state"])) #print("pytorch weight is: ", layer_module.get_w()) if output_all_encoded_layers: @@ -710,11 +670,10 @@ class 
BertEncoder(nn.Module): class BertPooler(nn.Module): + def __init__(self, config): super(BertPooler, self).__init__() - self.dense_act = LinearActivation(config.hidden_size, - config.hidden_size, - act="tanh") + self.dense_act = LinearActivation(config.hidden_size, config.hidden_size, act="tanh") def forward(self, hidden_states): # We "pool" the model by simply taking the hidden state corresponding @@ -725,11 +684,10 @@ class BertPooler(nn.Module): class BertPredictionHeadTransform(nn.Module): + def __init__(self, config): super(BertPredictionHeadTransform, self).__init__() - self.dense_act = LinearActivation(config.hidden_size, - config.hidden_size, - act=config.hidden_act) + self.dense_act = LinearActivation(config.hidden_size, config.hidden_size, act=config.hidden_act) self.LayerNorm = BertLayerNorm(config.hidden_size, eps=1e-12) def forward(self, hidden_states): @@ -739,6 +697,7 @@ class BertPredictionHeadTransform(nn.Module): class BertLMPredictionHead(nn.Module): + def __init__(self, config, bert_model_embedding_weights): super(BertLMPredictionHead, self).__init__() self.transform = BertPredictionHeadTransform(config) @@ -753,16 +712,15 @@ class BertLMPredictionHead(nn.Module): def forward(self, hidden_states): hidden_states = self.transform(hidden_states) - get_accelerator().range_push( - "decoder input.size() = {}, weight.size() = {}".format( - hidden_states.size(), - self.decoder.weight.size())) + get_accelerator().range_push("decoder input.size() = {}, weight.size() = {}".format( + hidden_states.size(), self.decoder.weight.size())) hidden_states = self.decoder(hidden_states) + self.bias get_accelerator().range_pop() return hidden_states class BertOnlyMLMHead(nn.Module): + def __init__(self, config, bert_model_embedding_weights): super(BertOnlyMLMHead, self).__init__() self.predictions = BertLMPredictionHead(config, bert_model_embedding_weights) @@ -773,6 +731,7 @@ class BertOnlyMLMHead(nn.Module): class BertOnlyNSPHead(nn.Module): + def __init__(self, config): super(BertOnlyNSPHead, self).__init__() self.seq_relationship = nn.Linear(config.hidden_size, 2) @@ -783,6 +742,7 @@ class BertOnlyNSPHead(nn.Module): class BertPreTrainingHeads(nn.Module): + def __init__(self, config, bert_model_embedding_weights): super(BertPreTrainingHeads, self).__init__() self.predictions = BertLMPredictionHead(config, bert_model_embedding_weights) @@ -798,15 +758,14 @@ class BertPreTrainedModel(nn.Module): """ An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained models. """ + def __init__(self, config, *inputs, **kwargs): super(BertPreTrainedModel, self).__init__() if not isinstance(config, BertConfig): - raise ValueError( - "Parameter config in `{}(config)` should be an instance of class `BertConfig`. " - "To create a model from a Google pretrained model use " - "`model = {}.from_pretrained(PRETRAINED_MODEL_NAME)`".format( - self.__class__.__name__, - self.__class__.__name__)) + raise ValueError("Parameter config in `{}(config)` should be an instance of class `BertConfig`. 
" + "To create a model from a Google pretrained model use " + "`model = {}.from_pretrained(PRETRAINED_MODEL_NAME)`".format( + self.__class__.__name__, self.__class__.__name__)) self.config = config def init_bert_weights(self, module): @@ -863,9 +822,8 @@ class BertPreTrainedModel(nn.Module): if resolved_archive_file == archive_file: # noqa: F821 logger.info("loading archive file {}".format(archive_file)) else: - logger.info("loading archive file {} from cache at {}".format( - archive_file, - resolved_archive_file)) # noqa: F821 + logger.info("loading archive file {} from cache at {}".format(archive_file, + resolved_archive_file)) # noqa: F821 tempdir = None if os.path.isdir(resolved_archive_file) or from_tf: # noqa: F821 serialization_dir = resolved_archive_file # noqa: F821 @@ -886,9 +844,7 @@ class BertPreTrainedModel(nn.Module): model = cls(config, *inputs, **kwargs) if state_dict is None and not from_tf: weights_path = os.path.join(serialization_dir, WEIGHTS_NAME) - state_dict = torch.load( - weights_path, - map_location='cpu' if not get_accelerator().is_available() else None) + state_dict = torch.load(weights_path, map_location='cpu' if not get_accelerator().is_available() else None) if tempdir: # Clean up temp dir shutil.rmtree(tempdir) @@ -922,34 +878,25 @@ class BertPreTrainedModel(nn.Module): def load(module, prefix=''): local_metadata = {} if metadata is None else metadata.get(prefix[:-1], {}) - module._load_from_state_dict(state_dict, - prefix, - local_metadata, - True, - missing_keys, - unexpected_keys, + module._load_from_state_dict(state_dict, prefix, local_metadata, True, missing_keys, unexpected_keys, error_msgs) for name, child in module._modules.items(): if child is not None: load(child, prefix + name + '.') start_prefix = '' - if not hasattr(model, - 'bert') and any(s.startswith('bert.') for s in state_dict.keys()): + if not hasattr(model, 'bert') and any(s.startswith('bert.') for s in state_dict.keys()): start_prefix = 'bert.' load(model, prefix=start_prefix) if len(missing_keys) > 0: logger.info("Weights of {} not initialized from pretrained model: {}".format( - model.__class__.__name__, - missing_keys)) + model.__class__.__name__, missing_keys)) if len(unexpected_keys) > 0: - logger.info("Weights from pretrained model not used in {}: {}".format( - model.__class__.__name__, - unexpected_keys)) + logger.info("Weights from pretrained model not used in {}: {}".format(model.__class__.__name__, + unexpected_keys)) if len(error_msgs) > 0: raise RuntimeError('Error(s) in loading state_dict for {}:\n\t{}'.format( - model.__class__.__name__, - "\n\t".join(error_msgs))) + model.__class__.__name__, "\n\t".join(error_msgs))) return model @@ -997,6 +944,7 @@ class BertModel(BertPreTrainedModel): all_encoder_layers, pooled_output = model(input_ids, token_type_ids, input_mask) ``` """ + def __init__(self, config): super(BertModel, self).__init__(config) self.embeddings = BertEmbeddings(config) @@ -1027,16 +975,14 @@ class BertModel(BertPreTrainedModel): # positions we want to attend and -10000.0 for masked positions. # Since we are adding it to the raw scores before the softmax, this is # effectively the same as removing these entirely. 
- extended_attention_mask = extended_attention_mask.to(dtype=next( - self.parameters()).dtype) # fp16 compatibility + extended_attention_mask = extended_attention_mask.to(dtype=next(self.parameters()).dtype) # fp16 compatibility extended_attention_mask = (1.0 - extended_attention_mask) * -10000.0 embedding_output = self.embeddings(input_ids, token_type_ids) - encoded_layers = self.encoder( - embedding_output, - extended_attention_mask, - output_all_encoded_layers=output_all_encoded_layers, - checkpoint_activations=checkpoint_activations) + encoded_layers = self.encoder(embedding_output, + extended_attention_mask, + output_all_encoded_layers=output_all_encoded_layers, + checkpoint_activations=checkpoint_activations) sequence_output = encoded_layers[-1] pooled_output = self.pooler(sequence_output) if not output_all_encoded_layers: @@ -1094,6 +1040,7 @@ class BertForPreTraining(BertPreTrainedModel): masked_lm_logits_scores, seq_relationship_logits = model(input_ids, token_type_ids, input_mask) ``` """ + def __init__(self, config, args): super(BertForPreTraining, self).__init__(config) self.summary_writer = None @@ -1102,17 +1049,14 @@ class BertForPreTraining(BertPreTrainedModel): self.samples_per_step = dist.get_world_size() * args.train_batch_size self.sample_count = self.samples_per_step self.bert = BertModel(config) - self.cls = BertPreTrainingHeads(config, - self.bert.embeddings.word_embeddings.weight) + self.cls = BertPreTrainingHeads(config, self.bert.embeddings.word_embeddings.weight) self.apply(self.init_bert_weights) def log_summary_writer(self, logs: dict, base='Train'): if dist.get_rank() == 0: module_name = "Samples" #self._batch_module_name.get(batch_type, self._get_batch_type_error(batch_type)) for key, log in logs.items(): - self.summary_writer.add_scalar(f'{base}/{module_name}/{key}', - log, - self.sample_count) + self.summary_writer.add_scalar(f'{base}/{module_name}/{key}', log, self.sample_count) self.sample_count += self.samples_per_step def forward(self, batch, log=True): @@ -1124,18 +1068,17 @@ class BertForPreTraining(BertPreTrainedModel): next_sentence_label = batch[4] checkpoint_activations = False - sequence_output, pooled_output = self.bert(input_ids, token_type_ids, attention_mask, - output_all_encoded_layers=False, checkpoint_activations=checkpoint_activations) + sequence_output, pooled_output = self.bert(input_ids, + token_type_ids, + attention_mask, + output_all_encoded_layers=False, + checkpoint_activations=checkpoint_activations) prediction_scores, seq_relationship_score = self.cls(sequence_output, pooled_output) if masked_lm_labels is not None and next_sentence_label is not None: loss_fct = CrossEntropyLoss(ignore_index=-1) - masked_lm_loss = loss_fct(prediction_scores.view(-1, - self.config.vocab_size), - masked_lm_labels.view(-1)) - next_sentence_loss = loss_fct(seq_relationship_score.view(-1, - 2), - next_sentence_label.view(-1)) + masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), masked_lm_labels.view(-1)) + next_sentence_loss = loss_fct(seq_relationship_score.view(-1, 2), next_sentence_label.view(-1)) #print("loss is {} {}".format(masked_lm_loss, next_sentence_loss)) total_loss = masked_lm_loss + next_sentence_loss # if log: @@ -1187,6 +1130,7 @@ class BertForMaskedLM(BertPreTrainedModel): masked_lm_logits_scores = model(input_ids, token_type_ids, input_mask) ``` """ + def __init__(self, config): super(BertForMaskedLM, self).__init__(config) self.bert = BertModel(config) @@ -1199,15 +1143,12 @@ class 
BertForMaskedLM(BertPreTrainedModel): attention_mask=None, masked_lm_labels=None, checkpoint_activations=False): - sequence_output, _ = self.bert(input_ids, token_type_ids, attention_mask, - output_all_encoded_layers=False) + sequence_output, _ = self.bert(input_ids, token_type_ids, attention_mask, output_all_encoded_layers=False) prediction_scores = self.cls(sequence_output) if masked_lm_labels is not None: loss_fct = CrossEntropyLoss(ignore_index=-1) - masked_lm_loss = loss_fct(prediction_scores.view(-1, - self.config.vocab_size), - masked_lm_labels.view(-1)) + masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), masked_lm_labels.view(-1)) return masked_lm_loss else: return prediction_scores @@ -1256,6 +1197,7 @@ class BertForNextSentencePrediction(BertPreTrainedModel): seq_relationship_logits = model(input_ids, token_type_ids, input_mask) ``` """ + def __init__(self, config): super(BertForNextSentencePrediction, self).__init__(config) self.bert = BertModel(config) @@ -1268,15 +1210,12 @@ class BertForNextSentencePrediction(BertPreTrainedModel): attention_mask=None, next_sentence_label=None, checkpoint_activations=False): - _, pooled_output = self.bert(input_ids, token_type_ids, attention_mask, - output_all_encoded_layers=False) + _, pooled_output = self.bert(input_ids, token_type_ids, attention_mask, output_all_encoded_layers=False) seq_relationship_score = self.cls(pooled_output) if next_sentence_label is not None: loss_fct = CrossEntropyLoss(ignore_index=-1) - next_sentence_loss = loss_fct(seq_relationship_score.view(-1, - 2), - next_sentence_label.view(-1)) + next_sentence_loss = loss_fct(seq_relationship_score.view(-1, 2), next_sentence_label.view(-1)) return next_sentence_loss else: return seq_relationship_score @@ -1327,6 +1266,7 @@ class BertForSequenceClassification(BertPreTrainedModel): logits = model(input_ids, token_type_ids, input_mask) ``` """ + def __init__(self, config, num_labels): super(BertForSequenceClassification, self).__init__(config) self.num_labels = num_labels @@ -1335,12 +1275,7 @@ class BertForSequenceClassification(BertPreTrainedModel): self.classifier = nn.Linear(config.hidden_size, num_labels) self.apply(self.init_bert_weights) - def forward(self, - input_ids, - token_type_ids=None, - attention_mask=None, - labels=None, - checkpoint_activations=False): + def forward(self, input_ids, token_type_ids=None, attention_mask=None, labels=None, checkpoint_activations=False): _, pooled_output = self.bert(input_ids, token_type_ids, attention_mask, output_all_encoded_layers=False) pooled_output = self.dropout(pooled_output) logits = self.classifier(pooled_output) @@ -1397,6 +1332,7 @@ class BertForMultipleChoice(BertPreTrainedModel): logits = model(input_ids, token_type_ids, input_mask) ``` """ + def __init__(self, config, num_choices): super(BertForMultipleChoice, self).__init__(config) self.num_choices = num_choices @@ -1405,16 +1341,14 @@ class BertForMultipleChoice(BertPreTrainedModel): self.classifier = nn.Linear(config.hidden_size, 1) self.apply(self.init_bert_weights) - def forward(self, - input_ids, - token_type_ids=None, - attention_mask=None, - labels=None, - checkpoint_activations=False): + def forward(self, input_ids, token_type_ids=None, attention_mask=None, labels=None, checkpoint_activations=False): flat_input_ids = input_ids.view(-1, input_ids.size(-1)) flat_token_type_ids = token_type_ids.view(-1, token_type_ids.size(-1)) flat_attention_mask = attention_mask.view(-1, attention_mask.size(-1)) - _, pooled_output = 
self.bert(flat_input_ids, flat_token_type_ids, flat_attention_mask, output_all_encoded_layers=False) + _, pooled_output = self.bert(flat_input_ids, + flat_token_type_ids, + flat_attention_mask, + output_all_encoded_layers=False) pooled_output = self.dropout(pooled_output) logits = self.classifier(pooled_output) reshaped_logits = logits.view(-1, self.num_choices) @@ -1472,6 +1406,7 @@ class BertForTokenClassification(BertPreTrainedModel): logits = model(input_ids, token_type_ids, input_mask) ``` """ + def __init__(self, config, num_labels): super(BertForTokenClassification, self).__init__(config) self.num_labels = num_labels @@ -1480,12 +1415,7 @@ class BertForTokenClassification(BertPreTrainedModel): self.classifier = nn.Linear(config.hidden_size, num_labels) self.apply(self.init_bert_weights) - def forward(self, - input_ids, - token_type_ids=None, - attention_mask=None, - labels=None, - checkpoint_activations=False): + def forward(self, input_ids, token_type_ids=None, attention_mask=None, labels=None, checkpoint_activations=False): sequence_output, _ = self.bert(input_ids, token_type_ids, attention_mask, output_all_encoded_layers=False) sequence_output = self.dropout(sequence_output) logits = self.classifier(sequence_output) @@ -1552,6 +1482,7 @@ class BertForQuestionAnswering(BertPreTrainedModel): start_logits, end_logits = model(input_ids, token_type_ids, input_mask) ``` """ + def __init__(self, config): super(BertForQuestionAnswering, self).__init__(config) self.bert = BertModel(config) diff --git a/tests/unit/modelingpreln.py b/tests/unit/modelingpreln.py index 0069add9aa4d8aa115ae244cd28896ee7b9cee5e..7058c1a744fd3ec33eb4a22cc7d138bb43440a2b 100644 --- a/tests/unit/modelingpreln.py +++ b/tests/unit/modelingpreln.py @@ -1,3 +1,8 @@ +# Copyright (c) Microsoft Corporation. 
+# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + from __future__ import absolute_import, division, print_function, unicode_literals # Copyright The Microsoft DeepSpeed Team # DeepSpeed note, code taken from commit 3d59216cec89a363649b4fe3d15295ba936ced0f @@ -48,20 +53,15 @@ from deepspeed.accelerator import get_accelerator logger = logging.getLogger(__name__) PRETRAINED_MODEL_ARCHIVE_MAP = { - 'bert-base-uncased': - "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased.tar.gz", - 'bert-large-uncased': - "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased.tar.gz", - 'bert-base-cased': - "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased.tar.gz", - 'bert-large-cased': - "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased.tar.gz", + 'bert-base-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased.tar.gz", + 'bert-large-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased.tar.gz", + 'bert-base-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased.tar.gz", + 'bert-large-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased.tar.gz", 'bert-base-multilingual-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-uncased.tar.gz", 'bert-base-multilingual-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-cased.tar.gz", - 'bert-base-chinese': - "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-chinese.tar.gz", + 'bert-base-chinese': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-chinese.tar.gz", } CONFIG_NAME = 'bert_config.json' WEIGHTS_NAME = 'pytorch_model.bin' @@ -76,9 +76,8 @@ def load_tf_weights_in_bert(model, tf_checkpoint_path): import numpy as np import tensorflow as tf except ImportError: - print( - "Loading a TensorFlow models in PyTorch, requires TensorFlow to be installed. Please see " - "https://www.tensorflow.org/install/ for installation instructions.") + print("Loading a TensorFlow models in PyTorch, requires TensorFlow to be installed. Please see " + "https://www.tensorflow.org/install/ for installation instructions.") raise tf_path = os.path.abspath(tf_checkpoint_path) print("Converting TensorFlow checkpoint from {}".format(tf_path)) @@ -183,6 +182,7 @@ ACT2FN = {"gelu": gelu, "relu": torch.nn.functional.relu, "swish": swish} class GPUTimer: + def __init__(self): super().__init__() self.start = get_accelerator().Event() # noqa: F821 @@ -202,13 +202,7 @@ class LinearActivation(Module): """ __constants__ = ['bias'] - def __init__(self, - in_features, - out_features, - weights, - biases, - act='gelu', - bias=True): + def __init__(self, in_features, out_features, weights, biases, act='gelu', bias=True): super(LinearActivation, self).__init__() self.in_features = in_features self.out_features = out_features @@ -256,15 +250,14 @@ class LinearActivation(Module): return self.act_fn(F.linear(input, self.weight, self.bias)) def extra_repr(self): - return 'in_features={}, out_features={}, bias={}'.format( - self.in_features, - self.out_features, - self.bias is not None) + return 'in_features={}, out_features={}, bias={}'.format(self.in_features, self.out_features, self.bias + is not None) class BertConfig(object): """Configuration class to store the configuration of a `BertModel`. 
""" + def __init__(self, vocab_size_or_config_json_file, hidden_size=768, @@ -361,11 +354,10 @@ try: #apex.amp.register_float_function(apex.normalization.FusedLayerNorm, 'forward') BertLayerNorm = apex.normalization.FusedLayerNorm except ImportError: - print( - "Better speed can be achieved with apex installed from https://www.github.com/nvidia/apex." - ) + print("Better speed can be achieved with apex installed from https://www.github.com/nvidia/apex.") class BertLayerNorm(nn.Module): + def __init__(self, hidden_size, eps=1e-12): """Construct a layernorm module in the TF style (epsilon inside the square root). """ @@ -392,13 +384,12 @@ except ImportError: class BertEmbeddings(nn.Module): """Construct the embeddings from word, position and token_type embeddings. """ + def __init__(self, config): super(BertEmbeddings, self).__init__() self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size) - self.position_embeddings = nn.Embedding(config.max_position_embeddings, - config.hidden_size) - self.token_type_embeddings = nn.Embedding(config.type_vocab_size, - config.hidden_size) + self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.hidden_size) + self.token_type_embeddings = nn.Embedding(config.type_vocab_size, config.hidden_size) # self.LayerNorm is not snake-cased to stick with TensorFlow model variable name and be able to load # any TensorFlow checkpoint file @@ -407,9 +398,7 @@ class BertEmbeddings(nn.Module): def forward(self, input_ids, token_type_ids=None): seq_length = input_ids.size(1) - position_ids = torch.arange(seq_length, - dtype=torch.long, - device=input_ids.device) + position_ids = torch.arange(seq_length, dtype=torch.long, device=input_ids.device) position_ids = position_ids.unsqueeze(0).expand_as(input_ids) if token_type_ids is None: token_type_ids = torch.zeros_like(input_ids) @@ -425,13 +414,12 @@ class BertEmbeddings(nn.Module): class BertSelfAttention(nn.Module): + def __init__(self, i, config, weights, biases): super(BertSelfAttention, self).__init__() if config.hidden_size % config.num_attention_heads != 0: - raise ValueError( - "The hidden size (%d) is not a multiple of the number of attention " - "heads (%d)" % (config.hidden_size, - config.num_attention_heads)) + raise ValueError("The hidden size (%d) is not a multiple of the number of attention " + "heads (%d)" % (config.hidden_size, config.num_attention_heads)) self.num_attention_heads = config.num_attention_heads self.attention_head_size = int(config.hidden_size / config.num_attention_heads) self.all_head_size = self.num_attention_heads * self.attention_head_size @@ -459,14 +447,12 @@ class BertSelfAttention(nn.Module): #self.softmax = DeepSpeedSoftmax(i, self.softmax_config) def transpose_for_scores(self, x): - new_x_shape = x.size()[:-1] + (self.num_attention_heads, - self.attention_head_size) + new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size) x = x.view(*new_x_shape) return x.permute(0, 2, 1, 3) def transpose_key_for_scores(self, x): - new_x_shape = x.size()[:-1] + (self.num_attention_heads, - self.attention_head_size) + new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size) x = x.view(*new_x_shape) return x.permute(0, 2, 3, 1) @@ -559,6 +545,7 @@ class BertSelfAttention(nn.Module): class BertSelfOutput(nn.Module): + def __init__(self, config, weights, biases): super(BertSelfOutput, self).__init__() self.dense = nn.Linear(config.hidden_size, config.hidden_size) @@ -586,6 +573,7 @@ class 
BertSelfOutput(nn.Module): class BertAttention(nn.Module): + def __init__(self, i, config, weights, biases): super(BertAttention, self).__init__() self.self = BertSelfAttention(i, config, weights, biases) @@ -601,6 +589,7 @@ class BertAttention(nn.Module): class BertIntermediate(nn.Module): + def __init__(self, config, weights, biases): super(BertIntermediate, self).__init__() self.dense_act = LinearActivation(config.hidden_size, @@ -615,6 +604,7 @@ class BertIntermediate(nn.Module): class BertOutput(nn.Module): + def __init__(self, config, weights, biases): super(BertOutput, self).__init__() self.dense = nn.Linear(config.intermediate_size, config.hidden_size) @@ -641,6 +631,7 @@ class BertOutput(nn.Module): class BertLayer(nn.Module): + def __init__(self, i, config, weights, biases): super(BertLayer, self).__init__() self.attention = BertAttention(i, config, weights, biases) @@ -674,26 +665,14 @@ class BertLayer(nn.Module): self.biases[2].register_hook(lambda x, self=self: grads.append([x, "V_B"])) self.weight[3].register_hook(lambda x, self=self: grads.append([x, "O_W"])) self.biases[3].register_hook(lambda x, self=self: grads.append([x, "O_B"])) - self.PostAttentionLayerNorm.weight.register_hook( - lambda x, - self=self: grads.append([x, - "N2_W"])) - self.PostAttentionLayerNorm.bias.register_hook( - lambda x, - self=self: grads.append([x, - "N2_B"])) + self.PostAttentionLayerNorm.weight.register_hook(lambda x, self=self: grads.append([x, "N2_W"])) + self.PostAttentionLayerNorm.bias.register_hook(lambda x, self=self: grads.append([x, "N2_B"])) self.weight[5].register_hook(lambda x, self=self: grads.append([x, "int_W"])) self.biases[5].register_hook(lambda x, self=self: grads.append([x, "int_B"])) self.weight[6].register_hook(lambda x, self=self: grads.append([x, "out_W"])) self.biases[6].register_hook(lambda x, self=self: grads.append([x, "out_B"])) - self.PreAttentionLayerNorm.weight.register_hook( - lambda x, - self=self: grads.append([x, - "norm_W"])) - self.PreAttentionLayerNorm.bias.register_hook( - lambda x, - self=self: grads.append([x, - "norm_B"])) + self.PreAttentionLayerNorm.weight.register_hook(lambda x, self=self: grads.append([x, "norm_W"])) + self.PreAttentionLayerNorm.bias.register_hook(lambda x, self=self: grads.append([x, "norm_B"])) return layer_output + intermediate_input @@ -702,17 +681,14 @@ class BertLayer(nn.Module): class BertEncoder(nn.Module): + def __init__(self, config, weights, biases): super(BertEncoder, self).__init__() #layer = BertLayer(config, weights, biases) self.FinalLayerNorm = BertLayerNorm(config.hidden_size, eps=1e-12) - self.layer = nn.ModuleList([ - copy.deepcopy(BertLayer(i, - config, - weights, - biases)) for i in range(config.num_hidden_layers) - ]) + self.layer = nn.ModuleList( + [copy.deepcopy(BertLayer(i, config, weights, biases)) for i in range(config.num_hidden_layers)]) self.grads = [] self.graph = [] @@ -734,14 +710,11 @@ class BertEncoder(nn.Module): self.graph.append(mdl) self.get_modules(self, mdl, input) - def forward(self, - hidden_states, - attention_mask, - output_all_encoded_layers=True, - checkpoint_activations=False): + def forward(self, hidden_states, attention_mask, output_all_encoded_layers=True, checkpoint_activations=False): all_encoder_layers = [] def custom(start, end): + def custom_forward(*inputs): layers = self.layer[start:end] x_ = inputs[0] @@ -756,23 +729,13 @@ class BertEncoder(nn.Module): num_layers = len(self.layer) chunk_length = math.ceil(math.sqrt(num_layers)) while l < num_layers: - hidden_states = 
checkpoint.checkpoint(custom(l, - l + chunk_length), - hidden_states, - attention_mask * 1) + hidden_states = checkpoint.checkpoint(custom(l, l + chunk_length), hidden_states, attention_mask * 1) l += chunk_length # decoder layers else: for i, layer_module in enumerate(self.layer): - hidden_states = layer_module(hidden_states, - attention_mask, - self.grads, - collect_all_grads=True) - hidden_states.register_hook( - lambda x, - i=i, - self=self: self.grads.append([x, - "hidden_state"])) + hidden_states = layer_module(hidden_states, attention_mask, self.grads, collect_all_grads=True) + hidden_states.register_hook(lambda x, i=i, self=self: self.grads.append([x, "hidden_state"])) #print("pytorch weight is: ", layer_module.get_w()) if output_all_encoded_layers: @@ -802,11 +765,10 @@ class BertEncoder(nn.Module): class BertPooler(nn.Module): + def __init__(self, config): super(BertPooler, self).__init__() - self.dense_act = LinearActivation(config.hidden_size, - config.hidden_size, - act="tanh") + self.dense_act = LinearActivation(config.hidden_size, config.hidden_size, act="tanh") def forward(self, hidden_states): # We "pool" the model by simply taking the hidden state corresponding @@ -817,11 +779,10 @@ class BertPooler(nn.Module): class BertPredictionHeadTransform(nn.Module): + def __init__(self, config): super(BertPredictionHeadTransform, self).__init__() - self.dense_act = LinearActivation(config.hidden_size, - config.hidden_size, - act=config.hidden_act) + self.dense_act = LinearActivation(config.hidden_size, config.hidden_size, act=config.hidden_act) self.LayerNorm = BertLayerNorm(config.hidden_size, eps=1e-12) def forward(self, hidden_states): @@ -831,6 +792,7 @@ class BertPredictionHeadTransform(nn.Module): class BertLMPredictionHead(nn.Module): + def __init__(self, config, bert_model_embedding_weights): super(BertLMPredictionHead, self).__init__() self.transform = BertPredictionHeadTransform(config) @@ -845,16 +807,15 @@ class BertLMPredictionHead(nn.Module): def forward(self, hidden_states): hidden_states = self.transform(hidden_states) - get_accelerator().range_push( - "decoder input.size() = {}, weight.size() = {}".format( - hidden_states.size(), - self.decoder.weight.size())) + get_accelerator().range_push("decoder input.size() = {}, weight.size() = {}".format( + hidden_states.size(), self.decoder.weight.size())) hidden_states = self.decoder(hidden_states) + self.bias get_accelerator().range_pop() return hidden_states class BertOnlyMLMHead(nn.Module): + def __init__(self, config, bert_model_embedding_weights): super(BertOnlyMLMHead, self).__init__() self.predictions = BertLMPredictionHead(config, bert_model_embedding_weights) @@ -865,6 +826,7 @@ class BertOnlyMLMHead(nn.Module): class BertOnlyNSPHead(nn.Module): + def __init__(self, config): super(BertOnlyNSPHead, self).__init__() self.seq_relationship = nn.Linear(config.hidden_size, 2) @@ -875,6 +837,7 @@ class BertOnlyNSPHead(nn.Module): class BertPreTrainingHeads(nn.Module): + def __init__(self, config, bert_model_embedding_weights): super(BertPreTrainingHeads, self).__init__() self.predictions = BertLMPredictionHead(config, bert_model_embedding_weights) @@ -890,15 +853,14 @@ class BertPreTrainedModel(nn.Module): """ An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained models. 
""" + def __init__(self, config, *inputs, **kwargs): super(BertPreTrainedModel, self).__init__() if not isinstance(config, BertConfig): - raise ValueError( - "Parameter config in `{}(config)` should be an instance of class `BertConfig`. " - "To create a model from a Google pretrained model use " - "`model = {}.from_pretrained(PRETRAINED_MODEL_NAME)`".format( - self.__class__.__name__, - self.__class__.__name__)) + raise ValueError("Parameter config in `{}(config)` should be an instance of class `BertConfig`. " + "To create a model from a Google pretrained model use " + "`model = {}.from_pretrained(PRETRAINED_MODEL_NAME)`".format( + self.__class__.__name__, self.__class__.__name__)) self.config = config def init_bert_weights(self, module): @@ -955,9 +917,8 @@ class BertPreTrainedModel(nn.Module): if resolved_archive_file == archive_file: # noqa: F821 logger.info("loading archive file {}".format(archive_file)) else: - logger.info("loading archive file {} from cache at {}".format( - archive_file, - resolved_archive_file)) # noqa: F821 + logger.info("loading archive file {} from cache at {}".format(archive_file, + resolved_archive_file)) # noqa: F821 tempdir = None if os.path.isdir(resolved_archive_file) or from_tf: # noqa: F821 serialization_dir = resolved_archive_file # noqa: F821 @@ -978,9 +939,7 @@ class BertPreTrainedModel(nn.Module): model = cls(config, *inputs, **kwargs) if state_dict is None and not from_tf: weights_path = os.path.join(serialization_dir, WEIGHTS_NAME) - state_dict = torch.load( - weights_path, - map_location='cpu' if not get_accelerator().is_available() else None) + state_dict = torch.load(weights_path, map_location='cpu' if not get_accelerator().is_available() else None) if tempdir: # Clean up temp dir shutil.rmtree(tempdir) @@ -1014,34 +973,25 @@ class BertPreTrainedModel(nn.Module): def load(module, prefix=''): local_metadata = {} if metadata is None else metadata.get(prefix[:-1], {}) - module._load_from_state_dict(state_dict, - prefix, - local_metadata, - True, - missing_keys, - unexpected_keys, + module._load_from_state_dict(state_dict, prefix, local_metadata, True, missing_keys, unexpected_keys, error_msgs) for name, child in module._modules.items(): if child is not None: load(child, prefix + name + '.') start_prefix = '' - if not hasattr(model, - 'bert') and any(s.startswith('bert.') for s in state_dict.keys()): + if not hasattr(model, 'bert') and any(s.startswith('bert.') for s in state_dict.keys()): start_prefix = 'bert.' 
load(model, prefix=start_prefix) if len(missing_keys) > 0: logger.info("Weights of {} not initialized from pretrained model: {}".format( - model.__class__.__name__, - missing_keys)) + model.__class__.__name__, missing_keys)) if len(unexpected_keys) > 0: - logger.info("Weights from pretrained model not used in {}: {}".format( - model.__class__.__name__, - unexpected_keys)) + logger.info("Weights from pretrained model not used in {}: {}".format(model.__class__.__name__, + unexpected_keys)) if len(error_msgs) > 0: raise RuntimeError('Error(s) in loading state_dict for {}:\n\t{}'.format( - model.__class__.__name__, - "\n\t".join(error_msgs))) + model.__class__.__name__, "\n\t".join(error_msgs))) return model @@ -1089,6 +1039,7 @@ class BertModel(BertPreTrainedModel): all_encoder_layers, pooled_output = model(input_ids, token_type_ids, input_mask) ``` """ + def __init__(self, config): super(BertModel, self).__init__(config) self.embeddings = BertEmbeddings(config) @@ -1119,16 +1070,14 @@ class BertModel(BertPreTrainedModel): # positions we want to attend and -10000.0 for masked positions. # Since we are adding it to the raw scores before the softmax, this is # effectively the same as removing these entirely. - extended_attention_mask = extended_attention_mask.to(dtype=next( - self.parameters()).dtype) # fp16 compatibility + extended_attention_mask = extended_attention_mask.to(dtype=next(self.parameters()).dtype) # fp16 compatibility extended_attention_mask = (1.0 - extended_attention_mask) * -10000.0 embedding_output = self.embeddings(input_ids, token_type_ids) - encoded_layers = self.encoder( - embedding_output, - extended_attention_mask, - output_all_encoded_layers=output_all_encoded_layers, - checkpoint_activations=checkpoint_activations) + encoded_layers = self.encoder(embedding_output, + extended_attention_mask, + output_all_encoded_layers=output_all_encoded_layers, + checkpoint_activations=checkpoint_activations) sequence_output = encoded_layers[-1] pooled_output = self.pooler(sequence_output) if not output_all_encoded_layers: @@ -1186,6 +1135,7 @@ class BertForPreTraining(BertPreTrainedModel): masked_lm_logits_scores, seq_relationship_logits = model(input_ids, token_type_ids, input_mask) ``` """ + def __init__(self, config, args): super(BertForPreTraining, self).__init__(config) self.summary_writer = None @@ -1194,17 +1144,14 @@ class BertForPreTraining(BertPreTrainedModel): self.samples_per_step = dist.get_world_size() * args.train_batch_size self.sample_count = self.samples_per_step self.bert = BertModel(config) - self.cls = BertPreTrainingHeads(config, - self.bert.embeddings.word_embeddings.weight) + self.cls = BertPreTrainingHeads(config, self.bert.embeddings.word_embeddings.weight) self.apply(self.init_bert_weights) def log_summary_writer(self, logs: dict, base='Train'): if dist.get_rank() == 0: module_name = "Samples" #self._batch_module_name.get(batch_type, self._get_batch_type_error(batch_type)) for key, log in logs.items(): - self.summary_writer.add_scalar(f'{base}/{module_name}/{key}', - log, - self.sample_count) + self.summary_writer.add_scalar(f'{base}/{module_name}/{key}', log, self.sample_count) self.sample_count += self.samples_per_step def forward(self, batch, log=True): @@ -1216,18 +1163,17 @@ class BertForPreTraining(BertPreTrainedModel): next_sentence_label = batch[4] checkpoint_activations = False - sequence_output, pooled_output = self.bert(input_ids, token_type_ids, attention_mask, - output_all_encoded_layers=False, checkpoint_activations=checkpoint_activations) + 
sequence_output, pooled_output = self.bert(input_ids, + token_type_ids, + attention_mask, + output_all_encoded_layers=False, + checkpoint_activations=checkpoint_activations) prediction_scores, seq_relationship_score = self.cls(sequence_output, pooled_output) if masked_lm_labels is not None and next_sentence_label is not None: loss_fct = CrossEntropyLoss(ignore_index=-1) - masked_lm_loss = loss_fct(prediction_scores.view(-1, - self.config.vocab_size), - masked_lm_labels.view(-1)) - next_sentence_loss = loss_fct(seq_relationship_score.view(-1, - 2), - next_sentence_label.view(-1)) + masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), masked_lm_labels.view(-1)) + next_sentence_loss = loss_fct(seq_relationship_score.view(-1, 2), next_sentence_label.view(-1)) #print("loss is {} {}".format(masked_lm_loss, next_sentence_loss)) total_loss = masked_lm_loss + next_sentence_loss # if log: @@ -1279,6 +1225,7 @@ class BertForMaskedLM(BertPreTrainedModel): masked_lm_logits_scores = model(input_ids, token_type_ids, input_mask) ``` """ + def __init__(self, config): super(BertForMaskedLM, self).__init__(config) self.bert = BertModel(config) @@ -1291,15 +1238,12 @@ class BertForMaskedLM(BertPreTrainedModel): attention_mask=None, masked_lm_labels=None, checkpoint_activations=False): - sequence_output, _ = self.bert(input_ids, token_type_ids, attention_mask, - output_all_encoded_layers=False) + sequence_output, _ = self.bert(input_ids, token_type_ids, attention_mask, output_all_encoded_layers=False) prediction_scores = self.cls(sequence_output) if masked_lm_labels is not None: loss_fct = CrossEntropyLoss(ignore_index=-1) - masked_lm_loss = loss_fct(prediction_scores.view(-1, - self.config.vocab_size), - masked_lm_labels.view(-1)) + masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), masked_lm_labels.view(-1)) return masked_lm_loss else: return prediction_scores @@ -1348,6 +1292,7 @@ class BertForNextSentencePrediction(BertPreTrainedModel): seq_relationship_logits = model(input_ids, token_type_ids, input_mask) ``` """ + def __init__(self, config): super(BertForNextSentencePrediction, self).__init__(config) self.bert = BertModel(config) @@ -1360,15 +1305,12 @@ class BertForNextSentencePrediction(BertPreTrainedModel): attention_mask=None, next_sentence_label=None, checkpoint_activations=False): - _, pooled_output = self.bert(input_ids, token_type_ids, attention_mask, - output_all_encoded_layers=False) + _, pooled_output = self.bert(input_ids, token_type_ids, attention_mask, output_all_encoded_layers=False) seq_relationship_score = self.cls(pooled_output) if next_sentence_label is not None: loss_fct = CrossEntropyLoss(ignore_index=-1) - next_sentence_loss = loss_fct(seq_relationship_score.view(-1, - 2), - next_sentence_label.view(-1)) + next_sentence_loss = loss_fct(seq_relationship_score.view(-1, 2), next_sentence_label.view(-1)) return next_sentence_loss else: return seq_relationship_score @@ -1419,6 +1361,7 @@ class BertForSequenceClassification(BertPreTrainedModel): logits = model(input_ids, token_type_ids, input_mask) ``` """ + def __init__(self, config, num_labels): super(BertForSequenceClassification, self).__init__(config) self.num_labels = num_labels @@ -1427,12 +1370,7 @@ class BertForSequenceClassification(BertPreTrainedModel): self.classifier = nn.Linear(config.hidden_size, num_labels) self.apply(self.init_bert_weights) - def forward(self, - input_ids, - token_type_ids=None, - attention_mask=None, - labels=None, - checkpoint_activations=False): + def 
forward(self, input_ids, token_type_ids=None, attention_mask=None, labels=None, checkpoint_activations=False): _, pooled_output = self.bert(input_ids, token_type_ids, attention_mask, output_all_encoded_layers=False) pooled_output = self.dropout(pooled_output) logits = self.classifier(pooled_output) @@ -1489,6 +1427,7 @@ class BertForMultipleChoice(BertPreTrainedModel): logits = model(input_ids, token_type_ids, input_mask) ``` """ + def __init__(self, config, num_choices): super(BertForMultipleChoice, self).__init__(config) self.num_choices = num_choices @@ -1497,16 +1436,14 @@ class BertForMultipleChoice(BertPreTrainedModel): self.classifier = nn.Linear(config.hidden_size, 1) self.apply(self.init_bert_weights) - def forward(self, - input_ids, - token_type_ids=None, - attention_mask=None, - labels=None, - checkpoint_activations=False): + def forward(self, input_ids, token_type_ids=None, attention_mask=None, labels=None, checkpoint_activations=False): flat_input_ids = input_ids.view(-1, input_ids.size(-1)) flat_token_type_ids = token_type_ids.view(-1, token_type_ids.size(-1)) flat_attention_mask = attention_mask.view(-1, attention_mask.size(-1)) - _, pooled_output = self.bert(flat_input_ids, flat_token_type_ids, flat_attention_mask, output_all_encoded_layers=False) + _, pooled_output = self.bert(flat_input_ids, + flat_token_type_ids, + flat_attention_mask, + output_all_encoded_layers=False) pooled_output = self.dropout(pooled_output) logits = self.classifier(pooled_output) reshaped_logits = logits.view(-1, self.num_choices) @@ -1564,6 +1501,7 @@ class BertForTokenClassification(BertPreTrainedModel): logits = model(input_ids, token_type_ids, input_mask) ``` """ + def __init__(self, config, num_labels): super(BertForTokenClassification, self).__init__(config) self.num_labels = num_labels @@ -1572,12 +1510,7 @@ class BertForTokenClassification(BertPreTrainedModel): self.classifier = nn.Linear(config.hidden_size, num_labels) self.apply(self.init_bert_weights) - def forward(self, - input_ids, - token_type_ids=None, - attention_mask=None, - labels=None, - checkpoint_activations=False): + def forward(self, input_ids, token_type_ids=None, attention_mask=None, labels=None, checkpoint_activations=False): sequence_output, _ = self.bert(input_ids, token_type_ids, attention_mask, output_all_encoded_layers=False) sequence_output = self.dropout(sequence_output) logits = self.classifier(sequence_output) @@ -1644,6 +1577,7 @@ class BertForQuestionAnswering(BertPreTrainedModel): start_logits, end_logits = model(input_ids, token_type_ids, input_mask) ``` """ + def __init__(self, config): super(BertForQuestionAnswering, self).__init__(config) self.bert = BertModel(config) diff --git a/tests/unit/moe/test_moe.py b/tests/unit/moe/test_moe.py index fe5359249dc802fe014ccc10fa5d24fee8007fd1..83894b2968921f58adde24282023357efebf16db 100644 --- a/tests/unit/moe/test_moe.py +++ b/tests/unit/moe/test_moe.py @@ -1,4 +1,7 @@ -'''Copyright The Microsoft DeepSpeed Team''' +# Copyright (c) Microsoft Corporation. 
+# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team import torch import deepspeed @@ -17,13 +20,7 @@ class TestMoE(DistributedTest): if not required_torch_version(): pytest.skip("DeepSpeed MoE tests need torch 1.8 or higher to run correctly") - config_dict = { - "train_batch_size": 8, - "steps_per_print": 1, - "fp16": { - "enabled": True - } - } + config_dict = {"train_batch_size": 8, "steps_per_print": 1, "fp16": {"enabled": True}} hidden_dim = 16 # E+D -- ep_size = 2 @@ -36,10 +33,7 @@ class TestMoE(DistributedTest): dist_init_required=False) #dist_init_required=False -- parameterize to True/False? - data_loader = sequence_dataloader(model=model, - total_samples=50, - hidden_dim=hidden_dim, - device=model.device) + data_loader = sequence_dataloader(model=model, total_samples=50, hidden_dim=hidden_dim, device=model.device) for n, batch in enumerate(data_loader): loss = model(batch[0], batch[1]) @@ -55,13 +49,7 @@ class TestPRMoE(DistributedTest): if not required_torch_version(): pytest.skip("DeepSpeed MoE tests need torch 1.8 or higher to run correctly") - config_dict = { - "train_batch_size": 8, - "steps_per_print": 1, - "fp16": { - "enabled": True - } - } + config_dict = {"train_batch_size": 8, "steps_per_print": 1, "fp16": {"enabled": True}} hidden_dim = 16 # E+D -- ep_size = 2 @@ -73,10 +61,7 @@ class TestPRMoE(DistributedTest): optimizer=optimizer, dist_init_required=False) - data_loader = sequence_dataloader(model=model, - total_samples=50, - hidden_dim=hidden_dim, - device=model.device) + data_loader = sequence_dataloader(model=model, total_samples=50, hidden_dim=hidden_dim, device=model.device) for n, batch in enumerate(data_loader): loss = model(batch[0], batch[1]) diff --git a/tests/unit/moe/test_moe_tp.py b/tests/unit/moe/test_moe_tp.py index ba63a102a0edf6a67e2c085bf0035c13e2f00da5..e53e0dc1effcfa18118d574cf14061f8721c963d 100644 --- a/tests/unit/moe/test_moe_tp.py +++ b/tests/unit/moe/test_moe_tp.py @@ -1,4 +1,7 @@ -'''Copyright The Microsoft DeepSpeed Team''' +# Copyright (c) Microsoft Corporation. 
+# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team import torch import deepspeed @@ -9,6 +12,7 @@ from deepspeed.moe.layer import MoE class MPU(): + def __init__(self, tp_world_size): self.rank = deepspeed.comm.get_rank() self.world_size = deepspeed.comm.get_world_size() @@ -57,21 +61,12 @@ class TestMOETensorParallel(DistributedTest): if not required_torch_version(): pytest.skip("DeepSpeed MoE tests need torch 1.8 or higher to run correctly") - config_dict = { - "train_batch_size": 8, - "steps_per_print": 1, - "fp16": { - "enabled": True - } - } + config_dict = {"train_batch_size": 8, "steps_per_print": 1, "fp16": {"enabled": True}} hidden_dim = 16 - tensor_parallel_expert = torch.nn.Sequential( - torch.nn.Linear(hidden_dim, - 4 * hidden_dim // tp_size), - torch.nn.ReLU(), - torch.nn.Linear(4 * hidden_dim // tp_size, - hidden_dim)) + tensor_parallel_expert = torch.nn.Sequential(torch.nn.Linear(hidden_dim, 4 * hidden_dim // tp_size), + torch.nn.ReLU(), + torch.nn.Linear(4 * hidden_dim // tp_size, hidden_dim)) # set num experts to world size world_size = deepspeed.comm.get_world_size() @@ -92,7 +87,6 @@ class TestMOETensorParallel(DistributedTest): assert model.num_local_experts == world_size // ep_size if enable_expert_tp: - assert deepspeed.utils.groups._get_expert_model_parallel_world_size( - ) == tp_size + assert deepspeed.utils.groups._get_expert_model_parallel_world_size() == tp_size else: assert deepspeed.utils.groups._get_expert_model_parallel_world_size() == 1 diff --git a/tests/unit/monitor/test_monitor.py b/tests/unit/monitor/test_monitor.py index 7cf10619661c972e359f067bdc34ecc9cec56e58..3e04bebfb6c1459499a792d9ca53559e5812b693 100644 --- a/tests/unit/monitor/test_monitor.py +++ b/tests/unit/monitor/test_monitor.py @@ -1,4 +1,7 @@ -'''Copyright The Microsoft DeepSpeed Team''' +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team from deepspeed.monitor.tensorboard import TensorBoardMonitor from deepspeed.monitor.wandb import WandbMonitor diff --git a/tests/unit/multi_output_model.py b/tests/unit/multi_output_model.py index 8993813aa5453533cc4cbd05e165d8c47f536008..e84215fb4e9575b45085affa4d897a5583656f1b 100644 --- a/tests/unit/multi_output_model.py +++ b/tests/unit/multi_output_model.py @@ -1,9 +1,13 @@ -'''Copyright The Microsoft DeepSpeed Team''' +# Copyright (c) Microsoft Corporation. 
+# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team import torch class MultiOutputModel(torch.nn.Module): + def __init__(self, hidden_dim, weight_value): super(MultiOutputModel, self).__init__() self.linear = torch.nn.Linear(hidden_dim, hidden_dim, bias=False) @@ -24,19 +28,11 @@ def multi_output_dataloader(model, total_samples, hidden_dim, device, inputs, ta batch_size = model.train_micro_batch_size_per_gpu() train_data = [ - torch.full(size=(total_samples, - hidden_dim), - fill_value=x, - device=device, - dtype=torch.half, - requires_grad=True) for x in inputs + torch.full(size=(total_samples, hidden_dim), fill_value=x, device=device, dtype=torch.half, requires_grad=True) + for x in inputs ] - train_label = [ - torch.empty(total_samples, - device=device, - dtype=torch.long).fill_(y) for y in targets - ] + train_label = [torch.empty(total_samples, device=device, dtype=torch.long).fill_(y) for y in targets] train_dataset = torch.utils.data.TensorDataset(*train_data, *train_label) train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size) diff --git a/tests/unit/ops/accelerators/test_accelerator_backward.py b/tests/unit/ops/accelerators/test_accelerator_backward.py index ad26daeb698c40bb22a7688d9ff2bb3f52c14ca8..4c5719bb9c1e055f04a64c582025b944184b9eae 100644 --- a/tests/unit/ops/accelerators/test_accelerator_backward.py +++ b/tests/unit/ops/accelerators/test_accelerator_backward.py @@ -1,6 +1,8 @@ -'''Copyright The Microsoft DeepSpeed Team''' +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team -import math import numpy as np import torch import pytest @@ -91,26 +93,21 @@ kwargs_fp16 = {'dtype': torch.half, 'device': device, 'requires_grad': True} class DSEncoder(nn.Module): + def __init__(self, config, weights, biases): super(DSEncoder, self).__init__() self.FinalLayerNorm = BertLayerNorm(config.hidden_size, eps=1e-12) self.layer = nn.ModuleList([ - copy.deepcopy(DeepSpeedTransformerLayer(config, - weights, - biases)) - for _ in range(config.num_hidden_layers) + copy.deepcopy(DeepSpeedTransformerLayer(config, weights, biases)) for _ in range(config.num_hidden_layers) ]) self.grads = [] self.pre_or_post = config.pre_layer_norm - def forward(self, - hidden_states, - attention_mask, - output_all_encoded_layers=True, - checkpoint_activations=False): + def forward(self, hidden_states, attention_mask, output_all_encoded_layers=True, checkpoint_activations=False): all_encoder_layers = [] def custom(start, end): + def custom_forward(*inputs): layers = self.layer[start:end] x_ = inputs[0] @@ -121,25 +118,23 @@ class DSEncoder(nn.Module): return custom_forward if checkpoint_activations: - l = 0 - num_layers = len(self.layer) - chunk_length = math.ceil(math.sqrt(num_layers)) - while l < num_layers: - hidden_states = checkpoint.checkpoint(custom(l, # noqa: F821 - l + chunk_length), - hidden_states, - attention_mask * 1) - l += chunk_length + raise NotImplementedError("`checkpoint` is not defined below") + #l = 0 + #num_layers = len(self.layer) + #chunk_length = math.ceil(math.sqrt(num_layers)) + #while l < num_layers: + # hidden_states = checkpoint.checkpoint( + # custom( + # l, # noqa: F821 + # l + chunk_length), + # hidden_states, + # attention_mask * 1) + # l += chunk_length # decoder layers else: for i, layer_module in enumerate(self.layer): - hidden_states = layer_module(hidden_states, - attention_mask, - grads=self.grads) - hidden_states.register_hook( - lambda x, - self=self: self.grads.append([x, - "hidden_state"])) + 
hidden_states = layer_module(hidden_states, attention_mask, grads=self.grads) + hidden_states.register_hook(lambda x, self=self: self.grads.append([x, "hidden_state"])) if output_all_encoded_layers: all_encoder_layers.append(hidden_states) @@ -171,20 +166,14 @@ def create_models(ds_config): biases = [] for i in range(4): - weights.append( - nn.Parameter(torch.Tensor(ds_config.hidden_size, - ds_config.hidden_size))) + weights.append(nn.Parameter(torch.Tensor(ds_config.hidden_size, ds_config.hidden_size))) weights[i].data.normal_(mean=0.0, std=ds_config.initializer_range) weights.append(nn.Parameter(torch.Tensor(ds_config.hidden_size))) weights[4].data.fill_(1.0) - weights.append( - nn.Parameter(torch.Tensor(ds_config.intermediate_size, - ds_config.hidden_size))) + weights.append(nn.Parameter(torch.Tensor(ds_config.intermediate_size, ds_config.hidden_size))) weights[5].data.normal_(mean=0.0, std=ds_config.initializer_range) - weights.append( - nn.Parameter(torch.Tensor(ds_config.hidden_size, - ds_config.intermediate_size))) + weights.append(nn.Parameter(torch.Tensor(ds_config.hidden_size, ds_config.intermediate_size))) weights[6].data.normal_(mean=0.0, std=ds_config.initializer_range) weights.append(nn.Parameter(torch.Tensor(ds_config.hidden_size))) weights[7].data.fill_(1.0) @@ -229,10 +218,7 @@ def run_backward(ds_config, seq_len, atol=1e-2, verbose=False): # prepare test data kwargs = kwargs_fp16 if ds_config.fp16 else kwargs_fp32 - hidden_states = torch.randn(ds_config.batch_size, - seq_len, - ds_config.hidden_size, - **kwargs) + hidden_states = torch.randn(ds_config.batch_size, seq_len, ds_config.hidden_size, **kwargs) input_mask = torch.randn(ds_config.batch_size, 1, 1, seq_len, **kwargs) Y = torch.randn(ds_config.batch_size, seq_len, ds_config.hidden_size, **kwargs) @@ -247,10 +233,7 @@ def run_backward(ds_config, seq_len, atol=1e-2, verbose=False): base_grads = bert_encoder.get_grads() # run ds - ds_results = ds_encoder(hidden_states, - input_mask, - output_all_encoded_layers=False, - checkpoint_activations=False) + ds_results = ds_encoder(hidden_states, input_mask, output_all_encoded_layers=False, checkpoint_activations=False) loss = (Y - ds_results[0]).pow(2).sum() / 64 loss.backward() @@ -280,18 +263,9 @@ def run_backward(ds_config, seq_len, atol=1e-2, verbose=False): class TestCUDABackward(DistributedTest): world_size = 1 - def test_backward(self, - batch_size, - hidden_size, - seq_len, - heads, - num_layers, - is_preln, - use_fp16, - atol): + def test_backward(self, batch_size, hidden_size, seq_len, heads, num_layers, is_preln, use_fp16, atol): # Only run fp16 test cases on devices with FP16 capability. - if not get_accelerator().is_fp16_supported() and (use_fp16 is True - or is_preln is False): + if not get_accelerator().is_fp16_supported() and (use_fp16 is True or is_preln is False): return ds_config = DeepSpeedTransformerConfig() diff --git a/tests/unit/ops/accelerators/test_accelerator_forward.py b/tests/unit/ops/accelerators/test_accelerator_forward.py index 83ff70b9bcf1b885a487cc3e98f2f25a0ea3fc92..7c5580e4676ac1c05b026bf2ba822289aaa9b324 100644 --- a/tests/unit/ops/accelerators/test_accelerator_forward.py +++ b/tests/unit/ops/accelerators/test_accelerator_forward.py @@ -1,6 +1,8 @@ -'''Copyright The Microsoft DeepSpeed Team''' +# Copyright (c) Microsoft Corporation. 
+# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team -import math import numpy as np import torch import pytest @@ -38,26 +40,21 @@ kwargs_fp16 = {'dtype': torch.half, 'device': device, 'requires_grad': True} class DSEncoder(nn.Module): + def __init__(self, config, weights, biases): super(DSEncoder, self).__init__() self.FinalLayerNorm = BertLayerNorm(config.hidden_size, eps=1e-12) self.layer = nn.ModuleList([ - copy.deepcopy(DeepSpeedTransformerLayer(config, - weights, - biases)) - for _ in range(config.num_hidden_layers) + copy.deepcopy(DeepSpeedTransformerLayer(config, weights, biases)) for _ in range(config.num_hidden_layers) ]) self.grads = [] self.pre_or_post = config.pre_layer_norm - def forward(self, - hidden_states, - attention_mask, - output_all_encoded_layers=True, - checkpoint_activations=False): + def forward(self, hidden_states, attention_mask, output_all_encoded_layers=True, checkpoint_activations=False): all_encoder_layers = [] def custom(start, end): + def custom_forward(*inputs): layers = self.layer[start:end] x_ = inputs[0] @@ -68,15 +65,18 @@ class DSEncoder(nn.Module): return custom_forward if checkpoint_activations: - l = 0 - num_layers = len(self.layer) - chunk_length = math.ceil(math.sqrt(num_layers)) - while l < num_layers: - hidden_states = checkpoint.checkpoint(custom(l, # noqa: F821 - l + chunk_length), - hidden_states, - attention_mask * 1) - l += chunk_length + raise NotImplementedError("`checkpoint` below is not defined") + #l = 0 + #num_layers = len(self.layer) + #chunk_length = math.ceil(math.sqrt(num_layers)) + #while l < num_layers: + # hidden_states = checkpoint.checkpoint( + # custom( + # l, # noqa: F821 + # l + chunk_length), + # hidden_states, + # attention_mask * 1) + # l += chunk_length # decoder layers else: for i, layer_module in enumerate(self.layer): @@ -111,20 +111,14 @@ def create_models(ds_config): biases = [] for i in range(4): - weights.append( - nn.Parameter(torch.Tensor(ds_config.hidden_size, - ds_config.hidden_size))) + weights.append(nn.Parameter(torch.Tensor(ds_config.hidden_size, ds_config.hidden_size))) weights[i].data.normal_(mean=0.0, std=ds_config.initializer_range) weights.append(nn.Parameter(torch.Tensor(ds_config.hidden_size))) weights[4].data.fill_(1.0) - weights.append( - nn.Parameter(torch.Tensor(ds_config.intermediate_size, - ds_config.hidden_size))) + weights.append(nn.Parameter(torch.Tensor(ds_config.intermediate_size, ds_config.hidden_size))) weights[5].data.normal_(mean=0.0, std=ds_config.initializer_range) - weights.append( - nn.Parameter(torch.Tensor(ds_config.hidden_size, - ds_config.intermediate_size))) + weights.append(nn.Parameter(torch.Tensor(ds_config.hidden_size, ds_config.intermediate_size))) weights[6].data.normal_(mean=0.0, std=ds_config.initializer_range) weights.append(nn.Parameter(torch.Tensor(ds_config.hidden_size))) weights[7].data.fill_(1.0) @@ -166,12 +160,10 @@ def set_seed(seed): def run_forward(ds_config, seq_len, atol=1e-2, verbose=False, test_bsz=None): set_seed(123) bert_encoder, ds_encoder = create_models(ds_config) - print("bert_model:11111111111") - print(bert_encoder) - print("ds_model:2222222222222") - print(ds_encoder) + bsz = ds_config.batch_size if test_bsz is None else test_bsz - #prepare test data + + # prepare test data kwargs = kwargs_fp16 if ds_config.fp16 else kwargs_fp32 hidden_states = torch.randn(bsz, seq_len, ds_config.hidden_size, **kwargs) input_mask = torch.randn(bsz, 1, 1, seq_len, **kwargs) @@ -183,10 +175,7 @@ def run_forward(ds_config, seq_len, atol=1e-2, 
verbose=False, test_bsz=None): checkpoint_activations=False) # run ds - ds_results = ds_encoder(hidden_states, - input_mask, - output_all_encoded_layers=False, - checkpoint_activations=False) + ds_results = ds_encoder(hidden_states, input_mask, output_all_encoded_layers=False, checkpoint_activations=False) # check forward evaluation check_equal(base_results, ds_results, atol=atol, verbose=verbose) @@ -236,14 +225,7 @@ def run_forward(ds_config, seq_len, atol=1e-2, verbose=False, test_bsz=None): class TestCUDAForward(DistributedTest): world_size = 1 - def test_forward(self, - batch_size, - hidden_size, - seq_len, - heads, - num_layers, - is_preln, - use_fp16): + def test_forward(self, batch_size, hidden_size, seq_len, heads, num_layers, is_preln, use_fp16): # Only run fp16 test cases on devices with FP16 capability. if not get_accelerator().is_fp16_supported() and use_fp16 is True: return @@ -274,14 +256,7 @@ class TestCUDAForward(DistributedTest): class TestCUDAForwardSmallBatchSize(DistributedTest): world_size = 1 - def test_forward_with_small_bsz(self, - batch_size, - small_bsz, - hidden_size, - seq_len, - heads, - num_layers, - is_preln, + def test_forward_with_small_bsz(self, batch_size, small_bsz, hidden_size, seq_len, heads, num_layers, is_preln, use_fp16): # Only run fp16 test cases on devices with FP16 capability. if not get_accelerator().is_fp16_supported() and use_fp16 is True: @@ -312,14 +287,7 @@ class TestCUDAForwardSmallBatchSize(DistributedTest): class TestCUDAForwardStochastic(DistributedTest): world_size = 1 - def test_forward_stochastic(self, - batch_size, - hidden_size, - seq_len, - heads, - num_layers, - is_preln, - use_fp16): + def test_forward_stochastic(self, batch_size, hidden_size, seq_len, heads, num_layers, is_preln, use_fp16): # Only run fp16 test cases on devices with FP16 capability. if not get_accelerator().is_fp16_supported() and use_fp16 is True: return diff --git a/tests/unit/ops/adagrad/test_cpu_adagrad.py b/tests/unit/ops/adagrad/test_cpu_adagrad.py index 17001e6bd02124abd5e0b3cf8bea720fa9290296..d38d4221787257a52a9d73c51a0628b311593235 100644 --- a/tests/unit/ops/adagrad/test_cpu_adagrad.py +++ b/tests/unit/ops/adagrad/test_cpu_adagrad.py @@ -1,4 +1,7 @@ -'''Copyright The Microsoft DeepSpeed Team''' +# Copyright (c) Microsoft Corporation. 
+# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team import torch import numpy as np @@ -74,24 +77,13 @@ class TestCPUAdagrad(DistributedTest): rng_state = torch.get_rng_state() def gen_sparse_grad(vocabulary_size, dim, num_indices, dtype, device): - i = torch.randint(vocabulary_size, - size=(1, - num_indices), - dtype=torch.int64, - device=device) + i = torch.randint(vocabulary_size, size=(1, num_indices), dtype=torch.int64, device=device) v = torch.randn(num_indices, dim, dtype=dtype, device=device) t = torch.sparse_coo_tensor(i, v, (vocabulary_size, dim), device=device) t = t.coalesce() - new_i = (t.indices().view(-1, - 1).repeat(1, - dim) * dim + - torch.tensor(range(dim))).flatten().unsqueeze(0) + new_i = (t.indices().view(-1, 1).repeat(1, dim) * dim + torch.tensor(range(dim))).flatten().unsqueeze(0) new_v = t.values().flatten() - new_t = torch.sparse_coo_tensor(new_i, - new_v, - (vocabulary_size * dim, - ), - device=device) + new_t = torch.sparse_coo_tensor(new_i, new_v, (vocabulary_size * dim, ), device=device) new_t = new_t.coalesce() new_t.requires_grad = False return new_t @@ -101,17 +93,9 @@ class TestCPUAdagrad(DistributedTest): num_indices = int(model_size // dim) dtype = torch.float32 - param = torch.nn.Parameter(torch.randn((voc_size * dim, - ), - dtype=dtype, - device=device), - requires_grad=True) + param = torch.nn.Parameter(torch.randn((voc_size * dim, ), dtype=dtype, device=device), requires_grad=True) torch.set_rng_state(rng_state) - param1 = torch.nn.Parameter(torch.randn((voc_size * dim, - ), - dtype=dtype, - device=device), - requires_grad=True) + param1 = torch.nn.Parameter(torch.randn((voc_size * dim, ), dtype=dtype, device=device), requires_grad=True) torch.set_rng_state(rng_state) optimizer = DeepSpeedCPUAdagrad([param]) @@ -119,17 +103,9 @@ class TestCPUAdagrad(DistributedTest): for i in range(10): torch.set_rng_state(rng_state) - param.grad = gen_sparse_grad(voc_size, - dim, - num_indices, - dtype=dtype, - device=device) + param.grad = gen_sparse_grad(voc_size, dim, num_indices, dtype=dtype, device=device) torch.set_rng_state(rng_state) - param1.grad = gen_sparse_grad(voc_size, - dim, - num_indices, - dtype=dtype, - device=device) + param1.grad = gen_sparse_grad(voc_size, dim, num_indices, dtype=dtype, device=device) optimizer.step() optimizer1.step() @@ -137,6 +113,7 @@ class TestCPUAdagrad(DistributedTest): class TestCPUAdagradGPUError(DistributedTest): + def test_cpu_adagrad_gpu_error(self): model_size = 64 device = get_accelerator().device_name(0) # 'cuda:0' or 'xpu:0' diff --git a/tests/unit/ops/adam/test_adamw.py b/tests/unit/ops/adam/test_adamw.py index 03a7c3ca32669512383238b8fdd7b427cbb71615..8b6f8101cb776653731e3e27ea00636584baf6f7 100644 --- a/tests/unit/ops/adam/test_adamw.py +++ b/tests/unit/ops/adam/test_adamw.py @@ -1,4 +1,7 @@ -'''Copyright The Microsoft DeepSpeed Team''' +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team import deepspeed import torch diff --git a/tests/unit/ops/adam/test_cpu_adam.py b/tests/unit/ops/adam/test_cpu_adam.py index d10fb98105a807c599e4f037946be587f052ebdd..a48b7c7f2839be121799c2c85e44bc859fb79a67 100644 --- a/tests/unit/ops/adam/test_cpu_adam.py +++ b/tests/unit/ops/adam/test_cpu_adam.py @@ -1,4 +1,7 @@ -'''Copyright The Microsoft DeepSpeed Team''' +# Copyright (c) Microsoft Corporation. 
+# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team import torch import numpy as np @@ -31,17 +34,13 @@ def check_equal(first, second, atol=1e-2, verbose=False): def _compare_optimizers(model_size, param1, optimizer1, param2, optimizer2): for i in range(10): param1.grad = torch.randn(model_size, device=param1.device).to(param1.dtype) - param2.grad = param1.grad.clone().detach().to(device=param2.device, - dtype=param2.dtype) + param2.grad = param1.grad.clone().detach().to(device=param2.device, dtype=param2.dtype) optimizer1.step() optimizer2.step() tolerance = param1.float().norm().detach().numpy() * 1e-2 - check_equal(param1.float().norm(), - param2.float().cpu().norm(), - atol=tolerance, - verbose=True) + check_equal(param1.float().norm(), param2.float().cpu().norm(), atol=tolerance, verbose=True) @pytest.mark.parametrize('dtype', [torch.half, torch.float], ids=["fp16", "fp32"]) @@ -61,8 +60,7 @@ class TestCPUAdam(DistributedTest): init_distributed = False set_dist_env = False - @pytest.mark.skipif(not get_accelerator().is_available(), - reason="only supported in CUDA environments.") + @pytest.mark.skipif(not get_accelerator().is_available(), reason="only supported in CUDA environments.") def test_fused_adam_equal(self, dtype, model_size): if ("amd" in pytest.cpu_vendor) and (dtype == torch.half): pytest.skip("cpu-adam with half precision not supported on AMD CPUs") @@ -95,9 +93,7 @@ class TestCPUAdam(DistributedTest): ref_param_device = get_accelerator().device_name() else: if dtype == torch.half: - pytest.skip( - "torch.optim.AdamW with half precision only supported in CUDA environments." - ) + pytest.skip("torch.optim.AdamW with half precision only supported in CUDA environments.") ref_param_device = 'cpu' from deepspeed.ops.adam import DeepSpeedCPUAdam @@ -117,6 +113,7 @@ class TestCPUAdam(DistributedTest): class TestCPUAdamGPUError(DistributedTest): + def test_cpu_adam_gpu_error(self): model_size = 64 from deepspeed.ops.adam import DeepSpeedCPUAdam diff --git a/tests/unit/ops/aio/test_aio.py b/tests/unit/ops/aio/test_aio.py index 86265ab15ef941f317b9bb3158c84a14d26d30f6..a37bcd9c869b05138f4b2f1324ee22dddac44b6d 100644 --- a/tests/unit/ops/aio/test_aio.py +++ b/tests/unit/ops/aio/test_aio.py @@ -1,4 +1,7 @@ -'''Copyright The Microsoft DeepSpeed Team''' +# Copyright (c) Microsoft Corporation. 
+# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team import pytest import os @@ -85,29 +88,15 @@ class TestRead(DistributedTest): init_distributed = False set_dist_env = False - def test_parallel_read(self, - tmpdir, - use_cuda_pinned_tensor, - single_submit, - overlap_events): - _skip_for_invalid_environment(use_cuda_device=False, - use_cuda_pinned_tensor=use_cuda_pinned_tensor) + def test_parallel_read(self, tmpdir, use_cuda_pinned_tensor, single_submit, overlap_events): + _skip_for_invalid_environment(use_cuda_device=False, use_cuda_pinned_tensor=use_cuda_pinned_tensor) - h = AsyncIOBuilder().load().aio_handle(BLOCK_SIZE, - QUEUE_DEPTH, - single_submit, - overlap_events, - IO_PARALLEL) + h = AsyncIOBuilder().load().aio_handle(BLOCK_SIZE, QUEUE_DEPTH, single_submit, overlap_events, IO_PARALLEL) if use_cuda_pinned_tensor: - aio_buffer = get_accelerator().pin_memory( - torch.empty(IO_SIZE, - dtype=torch.uint8, - device='cpu')) + aio_buffer = get_accelerator().pin_memory(torch.empty(IO_SIZE, dtype=torch.uint8, device='cpu')) else: - aio_buffer = h.new_cpu_locked_tensor(IO_SIZE, - torch.empty(0, - dtype=torch.uint8)) + aio_buffer = h.new_cpu_locked_tensor(IO_SIZE, torch.empty(0, dtype=torch.uint8)) _validate_handle_state(h, single_submit, overlap_events) @@ -123,35 +112,18 @@ class TestRead(DistributedTest): h.free_cpu_locked_tensor(aio_buffer) @pytest.mark.parametrize("cuda_device", [True, False]) - def test_async_read(self, - tmpdir, - use_cuda_pinned_tensor, - single_submit, - overlap_events, - cuda_device): - _skip_for_invalid_environment(use_cuda_device=cuda_device, - use_cuda_pinned_tensor=use_cuda_pinned_tensor) + def test_async_read(self, tmpdir, use_cuda_pinned_tensor, single_submit, overlap_events, cuda_device): + _skip_for_invalid_environment(use_cuda_device=cuda_device, use_cuda_pinned_tensor=use_cuda_pinned_tensor) use_cpu_locked_tensor = False - h = AsyncIOBuilder().load().aio_handle(BLOCK_SIZE, - QUEUE_DEPTH, - single_submit, - overlap_events, - IO_PARALLEL) + h = AsyncIOBuilder().load().aio_handle(BLOCK_SIZE, QUEUE_DEPTH, single_submit, overlap_events, IO_PARALLEL) if cuda_device: - aio_buffer = torch.empty(IO_SIZE, - dtype=torch.uint8, - device=get_accelerator().device_name()) + aio_buffer = torch.empty(IO_SIZE, dtype=torch.uint8, device=get_accelerator().device_name()) elif use_cuda_pinned_tensor: - aio_buffer = get_accelerator().pin_memory( - torch.empty(IO_SIZE, - dtype=torch.uint8, - device='cpu')) + aio_buffer = get_accelerator().pin_memory(torch.empty(IO_SIZE, dtype=torch.uint8, device='cpu')) else: - aio_buffer = h.new_cpu_locked_tensor(IO_SIZE, - torch.empty(0, - dtype=torch.uint8)) + aio_buffer = h.new_cpu_locked_tensor(IO_SIZE, torch.empty(0, dtype=torch.uint8)) use_cpu_locked_tensor = True _validate_handle_state(h, single_submit, overlap_events) @@ -181,20 +153,11 @@ class TestWrite(DistributedTest): init_distributed = False set_dist_env = False - def test_parallel_write(self, - tmpdir, - use_cuda_pinned_tensor, - single_submit, - overlap_events): - _skip_for_invalid_environment(use_cuda_device=False, - use_cuda_pinned_tensor=use_cuda_pinned_tensor) + def test_parallel_write(self, tmpdir, use_cuda_pinned_tensor, single_submit, overlap_events): + _skip_for_invalid_environment(use_cuda_device=False, use_cuda_pinned_tensor=use_cuda_pinned_tensor) ref_file, ref_buffer = _do_ref_write(tmpdir) - h = AsyncIOBuilder().load().aio_handle(BLOCK_SIZE, - QUEUE_DEPTH, - single_submit, - overlap_events, - IO_PARALLEL) + h = AsyncIOBuilder().load().aio_handle(BLOCK_SIZE, 
QUEUE_DEPTH, single_submit, overlap_events, IO_PARALLEL) if use_cuda_pinned_tensor: aio_file, aio_buffer = _get_test_write_file_and_cpu_buffer(tmpdir, ref_buffer) @@ -215,22 +178,12 @@ class TestWrite(DistributedTest): assert filecmp.cmp(ref_file, aio_file, shallow=False) @pytest.mark.parametrize("cuda_device", [True, False]) - def test_async_write(self, - tmpdir, - use_cuda_pinned_tensor, - single_submit, - overlap_events, - cuda_device): - _skip_for_invalid_environment(use_cuda_device=cuda_device, - use_cuda_pinned_tensor=use_cuda_pinned_tensor) + def test_async_write(self, tmpdir, use_cuda_pinned_tensor, single_submit, overlap_events, cuda_device): + _skip_for_invalid_environment(use_cuda_device=cuda_device, use_cuda_pinned_tensor=use_cuda_pinned_tensor) ref_file, ref_buffer = _do_ref_write(tmpdir) - h = AsyncIOBuilder().load().aio_handle(BLOCK_SIZE, - QUEUE_DEPTH, - single_submit, - overlap_events, - IO_PARALLEL) + h = AsyncIOBuilder().load().aio_handle(BLOCK_SIZE, QUEUE_DEPTH, single_submit, overlap_events, IO_PARALLEL) use_cpu_locked_tensor = False if cuda_device: aio_file, aio_buffer = _get_test_write_file_and_cuda_buffer(tmpdir, ref_buffer) @@ -269,8 +222,7 @@ class TestAsyncQueue(DistributedTest): @pytest.mark.parametrize("async_queue", [2, 3]) def test_read(self, tmpdir, async_queue, use_cuda_pinned_tensor, cuda_device): - _skip_for_invalid_environment(use_cuda_device=cuda_device, - use_cuda_pinned_tensor=use_cuda_pinned_tensor) + _skip_for_invalid_environment(use_cuda_device=cuda_device, use_cuda_pinned_tensor=use_cuda_pinned_tensor) ref_files = [] for i in range(async_queue): @@ -279,33 +231,22 @@ class TestAsyncQueue(DistributedTest): single_submit = True overlap_events = True - h = AsyncIOBuilder().load().aio_handle(BLOCK_SIZE, - QUEUE_DEPTH, - single_submit, - overlap_events, - IO_PARALLEL) + h = AsyncIOBuilder().load().aio_handle(BLOCK_SIZE, QUEUE_DEPTH, single_submit, overlap_events, IO_PARALLEL) use_cpu_locked_tensor = False if cuda_device: aio_buffers = [ - torch.empty(IO_SIZE, - dtype=torch.uint8, - device=get_accelerator().device_name()) + torch.empty(IO_SIZE, dtype=torch.uint8, device=get_accelerator().device_name()) for _ in range(async_queue) ] elif use_cuda_pinned_tensor: aio_buffers = [ - get_accelerator().pin_memory( - torch.empty(IO_SIZE, - dtype=torch.uint8, - device='cpu')) for _ in range(async_queue) + get_accelerator().pin_memory(torch.empty(IO_SIZE, dtype=torch.uint8, device='cpu')) + for _ in range(async_queue) ] else: tmp_tensor = torch.empty(0, dtype=torch.uint8) - aio_buffers = [ - h.new_cpu_locked_tensor(IO_SIZE, - tmp_tensor) for _ in range(async_queue) - ] + aio_buffers = [h.new_cpu_locked_tensor(IO_SIZE, tmp_tensor) for _ in range(async_queue)] use_cpu_locked_tensor = True _validate_handle_state(h, single_submit, overlap_events) @@ -328,8 +269,7 @@ class TestAsyncQueue(DistributedTest): @pytest.mark.parametrize("async_queue", [2, 3]) def test_write(self, tmpdir, use_cuda_pinned_tensor, async_queue, cuda_device): - _skip_for_invalid_environment(use_cuda_device=cuda_device, - use_cuda_pinned_tensor=use_cuda_pinned_tensor) + _skip_for_invalid_environment(use_cuda_device=cuda_device, use_cuda_pinned_tensor=use_cuda_pinned_tensor) ref_files = [] ref_buffers = [] @@ -340,11 +280,7 @@ class TestAsyncQueue(DistributedTest): single_submit = True overlap_events = True - h = AsyncIOBuilder().load().aio_handle(BLOCK_SIZE, - QUEUE_DEPTH, - single_submit, - overlap_events, - IO_PARALLEL) + h = AsyncIOBuilder().load().aio_handle(BLOCK_SIZE, QUEUE_DEPTH, 
single_submit, overlap_events, IO_PARALLEL) aio_files = [] aio_buffers = [] diff --git a/tests/unit/ops/quantizer/test_dequantize.py b/tests/unit/ops/quantizer/test_dequantize.py deleted file mode 100644 index 5dc2f7d68a7044ee3283de15ec7dae7eba700767..0000000000000000000000000000000000000000 --- a/tests/unit/ops/quantizer/test_dequantize.py +++ /dev/null @@ -1,96 +0,0 @@ -""" -Copyright 2022 The Microsoft DeepSpeed Team -""" - -import pytest -import torch -from deepspeed.ops import op_builder -from deepspeed.accelerator import get_accelerator - -quantize_module = None - - -def int4x2to2xint4(int4X2tensor): - high = int4X2tensor >> 4 - low = (int4X2tensor << 4) >> 4 - return torch.stack((high, low), dim=-1).flatten() - - -def run_quantize(data, num_groups, q_bits, is_symmetric_quant): - global quantize_module - if quantize_module is None: - quantize_module = op_builder.QuantizerBuilder().load() - - return quantize_module.quantize( - data, - num_groups, - q_bits, - quantize_module.Symmetric if is_symmetric_quant else quantize_module.Asymmetric) - - -def run_dequantize(quantized_data, params, num_groups, q_bits, is_symmetric_quant): - global quantize_module - if quantize_module is None: - quantize_module = op_builder.QuantizerBuilder().load() - - return quantize_module.dequantize( - quantized_data, - params, - num_groups, - q_bits, - quantize_module.Symmetric if is_symmetric_quant else quantize_module.Asymmetric) - - -def run_ref_dequantize(quantized_data, params, num_groups, q_bits, is_symmetric_quant): - - if (q_bits == 4): - quantized_data = int4x2to2xint4(quantized_data) - - quantized_data = quantized_data.reshape(num_groups, -1).to(torch.float32) - - if is_symmetric_quant: - return (quantized_data * params).to(torch.float16) - else: - scales = params[:, 0].reshape(-1, 1) - offsets = params[:, 1].reshape(-1, 1) - return (quantized_data * scales + offsets).to(torch.float16) - - -@pytest.mark.inference_ops -@pytest.mark.parametrize("num_groups", [1, 13, 512]) -@pytest.mark.parametrize("num_elems", - [8, - 16, - 32, - 64, - 128, - 256, - 4096, - 8192, - 12288, - 16384]) -@pytest.mark.parametrize("is_symmetric_quant", [True, False]) -@pytest.mark.parametrize("q_bits", [4, 8]) -def test_dequantize(num_elems, num_groups, is_symmetric_quant, q_bits): - - activations = torch.randn((num_groups, - num_elems), - dtype=torch.float16, - device=get_accelerator().device_name()) - quantized_data, params = run_quantize(activations, num_groups, q_bits, is_symmetric_quant) - - ds_dequant = run_dequantize(quantized_data, - params, - num_groups, - q_bits, - is_symmetric_quant) - ref_dequant = run_ref_dequantize(quantized_data, - params, - num_groups, - q_bits, - is_symmetric_quant) - - assert (torch.allclose(ds_dequant.flatten(), - ref_dequant.flatten(), - rtol=3e-2, - atol=2e-3)) diff --git a/tests/unit/ops/quantizer/test_fake_quantization.py b/tests/unit/ops/quantizer/test_fake_quantization.py index c5304f7694eeee5de4dcae5b0ff56e59f114836e..a9c2993ef7c21e3d11e2e0bb5ed6423318037407 100644 --- a/tests/unit/ops/quantizer/test_fake_quantization.py +++ b/tests/unit/ops/quantizer/test_fake_quantization.py @@ -1,4 +1,7 @@ -'''Copyright The Microsoft DeepSpeed Team''' +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team import torch import pytest @@ -45,8 +48,7 @@ def run_quant_dequant(inputs, groups, bits): # Note that we have an explicit boundary for groups as ((size / groups) - 1) / 4096 + 1) <= MAX_REG. 
def test_fake_quant_dequant(tensor_shape, groups): - input_tensor = torch.rand((tensor_shape), - dtype=torch.float16).to(get_accelerator().device_name()) + input_tensor = torch.rand((tensor_shape), dtype=torch.float16).to(get_accelerator().device_name()) # 8-bit quantization. ref_input_8bit = input_tensor.clone().detach() diff --git a/tests/unit/ops/quantizer/test_quantize.py b/tests/unit/ops/quantizer/test_quantize.py index 3cfd812e63f97832a18893b0548788287ff521e0..6701e9b376a8ffcc282c6ef8b36f1424adf99fe4 100644 --- a/tests/unit/ops/quantizer/test_quantize.py +++ b/tests/unit/ops/quantizer/test_quantize.py @@ -1,6 +1,7 @@ -""" -Copyright 2022 The Microsoft DeepSpeed Team -""" +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team import pytest import torch @@ -15,12 +16,21 @@ def run_quantize_ds(activations, num_groups, q_bits, is_symmetric_quant): if inference_module is None: inference_module = op_builder.QuantizerBuilder().load() - return inference_module.quantize( + return inference_module.quantize(activations, num_groups, q_bits, + inference_module.Symmetric if is_symmetric_quant else inference_module.Asymmetric) + + +def run_dequantize_ds(activations, params, num_groups, q_bits, is_symmetric_quant): + global inference_module + if inference_module is None: + inference_module = op_builder.QuantizerBuilder().load() + return inference_module.dequantize( activations, + params, num_groups, q_bits, - inference_module.Symmetric - if is_symmetric_quant else inference_module.Asymmetric) + inference_module.Symmetric if is_symmetric_quant else inference_module.Asymmetric, + ) def get_q_props(q_bits): @@ -33,13 +43,7 @@ def get_q_props(q_bits): return q_range, q_max, q_min -def get_scale_zero_point(q_bits, - is_symmetric_quant, - max, - min, - absmax, - scales=None, - zero_points=None): +def get_scale_zero_point(q_bits, is_symmetric_quant, max, min, absmax, scales=None, zero_points=None): q_range, q_max, q_min = get_q_props(q_bits) @@ -47,14 +51,11 @@ def get_scale_zero_point(q_bits, scale = torch.empty_like(absmax) for i, x in enumerate(absmax): scale[i] = torch.ones_like(x) if x == 0 else q_range / (2 * x) - zero_point = torch.zeros(scale.shape, - dtype=torch.float32, - device=get_accelerator().device_name()) + zero_point = torch.zeros(scale.shape, dtype=torch.float32, device=get_accelerator().device_name()) else: scale = torch.empty_like(max) for i, x in enumerate(max): - scale[i] = torch.ones_like(x) if max[i] == min[i] else q_range / (max[i] - - min[i]) + scale[i] = torch.ones_like(x) if max[i] == min[i] else q_range / (max[i] - min[i]) zero_point = q_min - (min * scale) return scale, zero_point @@ -73,15 +74,14 @@ def run_float_quantize(q_bits, is_symmetric_quant, activations_ref, num_groups): activations_ref = activations_ref.reshape(num_groups, -1).to(dtype=torch.float32) - max_abs_activations_ref = torch.amax(torch.abs(activations_ref), - dim=-1).view(num_groups, - -1) + max_abs_activations_ref = torch.amax(torch.abs(activations_ref), dim=-1).view(num_groups, -1) max_activations_ref = torch.amax(activations_ref, dim=-1).view(num_groups, -1) min_activations_ref = torch.amin(activations_ref, dim=-1).view(num_groups, -1) _, q_max, q_min = get_q_props(q_bits) - scale, zero_point = get_scale_zero_point(q_bits, is_symmetric_quant, max_activations_ref, min_activations_ref, max_abs_activations_ref) + scale, zero_point = get_scale_zero_point(q_bits, is_symmetric_quant, max_activations_ref, min_activations_ref, + max_abs_activations_ref) data_f = 
activations_ref * scale @@ -90,9 +90,7 @@ def run_float_quantize(q_bits, is_symmetric_quant, activations_ref, num_groups): data_i32 = torch.round(data_f).to(dtype=torch.int32) - data_i32 = torch.minimum(torch.maximum(data_i32, - q_min.expand_as(data_i32)), - q_max.expand_as(data_i32)) + data_i32 = torch.minimum(torch.maximum(data_i32, q_min.expand_as(data_i32)), q_max.expand_as(data_i32)) data_i8 = data_i32.to(dtype=torch.int8) scales = (1.0 / scale).reshape(-1, 1) @@ -102,61 +100,51 @@ def run_float_quantize(q_bits, is_symmetric_quant, activations_ref, num_groups): return data_i8, params +def run_float_dequantize(q_bits, is_symmetric_quant, data_i8, params, num_groups): + data_f = data_i8.reshape(num_groups, -1).to(dtype=torch.float32) + + scales = params[:, 0].reshape(-1, 1) + offsets = params[:, 1].reshape(-1, 1) + + if not is_symmetric_quant: + data_f = data_f - offsets + else: + assert offsets.allclose(torch.zeros_like(offsets)) + + data_f = data_f * scales + + return data_f + + @pytest.mark.inference_ops @pytest.mark.parametrize("num_groups", [1, 13, 512]) -@pytest.mark.parametrize("num_elems", - [8, - 16, - 32, - 64, - 128, - 256, - 4096, - 8192, - 12288, - 16384]) +@pytest.mark.parametrize("num_elems", [8, 16, 32, 64, 128, 256, 4096, 8192, 12288, 16384]) @pytest.mark.parametrize("is_symmetric_quant", [True, False]) @pytest.mark.parametrize("q_bits", [4, 8]) @pytest.mark.parametrize("directed_case", ["all_zeros", None]) -def test_float_quantize(num_elems, - num_groups, - is_symmetric_quant, - q_bits, - directed_case): +def test_float_quantize(num_elems, num_groups, is_symmetric_quant, q_bits, directed_case): + # fix seed + torch.manual_seed(num_elems) if directed_case == "all_zeros": - activations_ds = torch.zeros((num_groups, - num_elems), + activations_ds = torch.zeros((num_groups, num_elems), dtype=torch.float16, device=get_accelerator().device_name()) else: - activations_ds = torch.randn((num_groups, - num_elems), + activations_ds = torch.randn((num_groups, num_elems), dtype=torch.float16, device=get_accelerator().device_name()) activations_ref = activations_ds.clone().detach() ref_out_tensor, ref_params = run_float_quantize(q_bits, is_symmetric_quant, activations_ref, num_groups) + ref_dequantized_tensor = run_float_dequantize(q_bits, is_symmetric_quant, ref_out_tensor, ref_params, num_groups) + # we need to convert the tensor to float64 to avoid overflow + ref_quantization_error = torch.sum(torch.abs((activations_ref - ref_dequantized_tensor).to(torch.float64))) ds_out_tensor, ds_out_params = run_quantize_ds(activations_ds, num_groups, q_bits, is_symmetric_quant) + ds_dequantized_tensor = run_dequantize_ds(ds_out_tensor, ds_out_params, num_groups, q_bits, is_symmetric_quant) + assert torch.all(torch.isfinite(ds_dequantized_tensor)) - if (q_bits == 4): - ds_out_tensor = int4x2to2xint4(ds_out_tensor) + ds_quantization_error = torch.sum(torch.abs((activations_ds - ds_dequantized_tensor).to(torch.float64))) - # Allow a max difference of 1 to account for differences in rounding in pytorch implementation - assert (torch.all( - torch.lt(torch.abs(ds_out_tensor.flatten() - ref_out_tensor.flatten()), - 2))) - if is_symmetric_quant: - assert (torch.allclose(ds_out_params.flatten(), ref_params[:, 0].flatten())) - else: - assert (torch.allclose(ds_out_params[:, - 0].flatten(), - ref_params[:, - 0].flatten())) - assert (torch.allclose(ds_out_params[:, - 1].flatten(), - ref_params[:, - 1].flatten(), - atol=5e-5, - rtol=5e-5)) + assert (ds_quantization_error <= ref_quantization_error * 1.05) 
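# A minimal, self-contained sketch (plain PyTorch, CPU-only) of the round-trip
# error metric the rewritten test above relies on: symmetric per-group 8-bit
# quantization, dequantization, and a summed absolute error accumulated in
# float64. The group count, bit width, and tensor shape below are illustrative
# assumptions, not values taken from the test matrix, and the DeepSpeed
# QuantizerBuilder kernel itself is not invoked here.
import torch


def quantize_symmetric(x: torch.Tensor, num_groups: int, q_bits: int = 8):
    # Reshape into groups and derive a per-group scale from the absolute maximum.
    x = x.reshape(num_groups, -1).to(torch.float32)
    q_range = 2 ** q_bits  # e.g. 256 levels for 8-bit
    absmax = x.abs().amax(dim=-1, keepdim=True)
    scale = torch.where(absmax == 0, torch.ones_like(absmax), q_range / (2 * absmax))
    q_min, q_max = -(q_range // 2), q_range // 2 - 1
    data = torch.clamp(torch.round(x * scale), q_min, q_max).to(torch.int8)
    return data, 1.0 / scale  # keep the dequantization scale per group


def dequantize_symmetric(data: torch.Tensor, dequant_scale: torch.Tensor):
    return data.to(torch.float32) * dequant_scale


if __name__ == "__main__":
    torch.manual_seed(0)
    activations = torch.randn(13, 256, dtype=torch.float16)
    q_data, scales = quantize_symmetric(activations, num_groups=13)
    recovered = dequantize_symmetric(q_data, scales)
    # Accumulate the error in float64 to avoid overflow, mirroring the
    # ref_quantization_error / ds_quantization_error comparison in the test.
    err = (activations.reshape(13, -1).to(torch.float64) -
           recovered.to(torch.float64)).abs().sum()
    print(f"round-trip L1 error: {err.item():.4f}")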
diff --git a/tests/unit/ops/sparse_attention/test_sparse_attention.py b/tests/unit/ops/sparse_attention/test_sparse_attention.py index bab57ccdc181609b85275e37a39f4b04555f8ab0..217267a97951aa70fad4d339632b78a6615a9679 100644 --- a/tests/unit/ops/sparse_attention/test_sparse_attention.py +++ b/tests/unit/ops/sparse_attention/test_sparse_attention.py @@ -1,4 +1,7 @@ -'''Copyright The Microsoft DeepSpeed Team''' +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team # DeepSpeed note, some parts of code taken & adapted from commit c368a9fd1b2c9dee4cc94de9a6bb0be3d447be41 # https://github.com/ptillet/torch-blocksparse/blob/master/tests/test_softmax.py @@ -10,10 +13,10 @@ import torch import deepspeed from deepspeed.accelerator import get_accelerator from deepspeed.ops.op_builder import SparseAttnBuilder +from unit.util import skip_on_arch, skip_on_cuda if not deepspeed.ops.__compatible_ops__[SparseAttnBuilder.NAME]: - pytest.skip("sparse attention op is not compatible on this system", - allow_module_level=True) + pytest.skip("sparse attention op is not compatible on this system", allow_module_level=True) def dense_to_sparse(w, mask, block): @@ -25,7 +28,7 @@ def dense_to_sparse(w, mask, block): h, i, j = nnz[:, 0], nnz[:, 1], nnz[:, 2] for zz in range(Z): for idx, (hh, ii, jj) in enumerate(zip(h, i, j)): - ret[zz, idx, :, :] = w[zz, hh, ii*block: (ii+1)*block, jj*block: (jj+1)*block] + ret[zz, idx, :, :] = w[zz, hh, ii * block:(ii + 1) * block, jj * block:(jj + 1) * block] return ret @@ -95,34 +98,23 @@ def init_softmax_inputs(Z, H, M, N, scale, rho, block, dtype, dense_x=True, layo if layout is None: layout = make_layout(rho, (H, M // block, N // block)) if dense_x: - x = torch.rand((Z, - H, - M, - N), - dtype=dtype, - requires_grad=True, - device=get_accelerator().device_name()) + x = torch.rand((Z, H, M, N), dtype=dtype, requires_grad=True, device=get_accelerator().device_name()) else: - x = torch.rand((Z, - layout.sum(), - block, - block), + x = torch.rand((Z, layout.sum(), block, block), dtype=dtype, requires_grad=True, device=get_accelerator().device_name()) dx = torch.rand_like(x) bool_attn_mask = torch.randint(low=0, high=2, - size=(N, - N), + size=(N, N), dtype=torch.bool, requires_grad=False, device=get_accelerator().device_name()) fp_attn_mask = bool_attn_mask.type(dtype) kp_mask = torch.randint(low=0, high=2, - size=(Z, - N), + size=(Z, N), dtype=dtype, requires_grad=False, device=get_accelerator().device_name()) @@ -130,32 +122,28 @@ def init_softmax_inputs(Z, H, M, N, scale, rho, block, dtype, dense_x=True, layo return layout, x, dx, bool_attn_mask, fp_attn_mask, kp_mask -def _skip_on_cuda_compatability(): - if deepspeed.accelerator.get_accelerator().device_name() == 'cuda': - if torch.cuda.get_device_capability()[0] < 7: - pytest.skip("needs higher compute capability than 7") - cuda_major = int(torch.version.cuda.split('.')[0]) * 10 - cuda_minor = int(torch.version.cuda.split('.')[1]) - cuda_version = cuda_major + cuda_minor - if (cuda_version != 101 and cuda_version != 102) and \ - (cuda_version != 111 and cuda_version != 110): - pytest.skip("requires cuda 10.1 or 10.2 or 11.0 or 11.1") - else: - assert deepspeed.accelerator.get_accelerator().device_name() == 'xpu' - return - - @pytest.mark.parametrize("block", [16, 32]) @pytest.mark.parametrize("width", [256, 576]) @pytest.mark.parametrize("dtype", [torch.float16, torch.float32]) def test_softmax(block, width, dtype): - #_skip_on_cuda_compatability() + valid_cuda_versions = [101, 102, 
110, 111] + skip_on_arch(min_arch=7) + skip_on_cuda(valid_cuda=valid_cuda_versions) + Z = 2 H = 4 scale = 0.4 rho = 0.4 M = N = width - layout, x, dx, bool_attn_mask, fp_attn_mask, kp_mask = init_softmax_inputs(Z, H, M, N, scale, rho, block, dtype, layout=None) + layout, x, dx, bool_attn_mask, fp_attn_mask, kp_mask = init_softmax_inputs(Z, + H, + M, + N, + scale, + rho, + block, + dtype, + layout=None) ref_y, ref_dx = run_softmax_reference(x, scale, dx, kp_mask, bool_attn_mask, layout, block) st_y, st_dx = run_softmax_sparse(x, scale, dx, kp_mask, fp_attn_mask, layout, block) @@ -206,20 +194,8 @@ def init_matmul_inputs(Z, H, M, N, K, rho, mode, trans_a, trans_b, block, dtype, BS0 = N if trans_b else K BS1 = K if trans_b else N shape = {'sdd': (M, N), 'dsd': (AS0, AS1), 'dds': (BS0, BS1)}[mode] - x = torch.rand((Z, - H, - AS0, - AS1), - dtype=dtype, - requires_grad=True, - device=get_accelerator().device_name()) - w = torch.rand((Z, - H, - BS0, - BS1), - dtype=dtype, - requires_grad=True, - device=get_accelerator().device_name()) + x = torch.rand((Z, H, AS0, AS1), dtype=dtype, requires_grad=True, device=get_accelerator().device_name()) + w = torch.rand((Z, H, BS0, BS1), dtype=dtype, requires_grad=True, device=get_accelerator().device_name()) dy = torch.rand((Z, H, M, N), dtype=dtype, device=get_accelerator().device_name()) if layout is None: layout = make_layout(rho, (H, shape[0] // block, shape[1] // block)) @@ -256,7 +232,10 @@ testdata = [ @pytest.mark.parametrize("block, dtype, mode, trans_a, trans_b", testdata) def test_matmul(block, dtype, mode, trans_a, trans_b): - #_skip_on_cuda_compatability() + valid_cuda_versions = [101, 102, 110, 111] + skip_on_arch(min_arch=7) + skip_on_cuda(valid_cuda=valid_cuda_versions) + Z = 3 H = 2 M = 128 @@ -266,6 +245,7 @@ def test_matmul(block, dtype, mode, trans_a, trans_b): x, w, dy, shape, layout = init_matmul_inputs(Z, H, M, N, K, rho, mode, trans_a, trans_b, block, dtype, layout=None) ref_y, ref_dx, ref_dw = run_matmul_reference(x.clone(), w.clone(), mode, trans_a, trans_b, layout, block, dy) st_y, st_dx, st_dw = run_matmul_sparse(x.clone(), w.clone(), mode, trans_a, trans_b, layout, block, dy) + assert allclose(ref_y, st_y) assert allclose(ref_dx, st_dx) assert allclose(ref_dw, st_dw) diff --git a/tests/unit/ops/spatial/test_nhwc_bias_add.py b/tests/unit/ops/spatial/test_nhwc_bias_add.py index f3a31cf47ba481a4e4b1a1a24980ffabe586b8b1..f243e82f6d3b42548055802d7ca9c61469237a12 100644 --- a/tests/unit/ops/spatial/test_nhwc_bias_add.py +++ b/tests/unit/ops/spatial/test_nhwc_bias_add.py @@ -1,6 +1,7 @@ -''' -Copyright 2022 The Microsoft DeepSpeed Team -''' +# Copyright (c) Microsoft Corporation. 
+# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team import pytest import torch @@ -18,22 +19,7 @@ def ref_bias_add(activations, bias): return activations + bias.reshape(1, -1, 1, 1) -channels_list = [ - 192, - 384, - 320, - 576, - 640, - 768, - 960, - 1152, - 1280, - 1536, - 1600, - 1920, - 2240, - 2560 -] +channels_list = [192, 384, 320, 576, 640, 768, 960, 1152, 1280, 1536, 1600, 1920, 2240, 2560] @pytest.mark.inference_ops @@ -41,16 +27,10 @@ channels_list = [ @pytest.mark.parametrize("image_size", [16, 32, 64]) @pytest.mark.parametrize("channels", channels_list) def test_bias_add(batch, image_size, channels): - activations = torch.randn( - (batch, - channels, - image_size, - image_size), - dtype=torch.float16, - device=get_accelerator().device_name()).to(memory_format=torch.channels_last) - bias = torch.randn((channels), - dtype=torch.float16, - device=get_accelerator().device_name()) + activations = torch.randn((batch, channels, image_size, image_size), + dtype=torch.float16, + device=get_accelerator().device_name()).to(memory_format=torch.channels_last) + bias = torch.randn((channels), dtype=torch.float16, device=get_accelerator().device_name()) ref_vals = ref_bias_add(activations.clone().detach(), bias) ds_vals = nhwc_bias_add(activations, bias) @@ -67,23 +47,13 @@ def ref_bias_add_add(activations, bias, other): @pytest.mark.parametrize("image_size", [16, 32, 64]) @pytest.mark.parametrize("channels", channels_list) def test_bias_add_add(batch, image_size, channels): - activations = torch.randn( - (batch, - channels, - image_size, - image_size), - dtype=torch.float16, - device=get_accelerator().device_name()).to(memory_format=torch.channels_last) - other = torch.randn( - (batch, - channels, - image_size, - image_size), - dtype=torch.float16, - device=get_accelerator().device_name()).to(memory_format=torch.channels_last) - bias = torch.randn((channels), - dtype=torch.float16, - device=get_accelerator().device_name()) + activations = torch.randn((batch, channels, image_size, image_size), + dtype=torch.float16, + device=get_accelerator().device_name()).to(memory_format=torch.channels_last) + other = torch.randn((batch, channels, image_size, image_size), + dtype=torch.float16, + device=get_accelerator().device_name()).to(memory_format=torch.channels_last) + bias = torch.randn((channels), dtype=torch.float16, device=get_accelerator().device_name()) ref_vals = ref_bias_add_add(activations.clone().detach(), bias, other) ds_vals = nhwc_bias_add(activations, bias, other=other) @@ -92,13 +62,7 @@ def test_bias_add_add(batch, image_size, channels): def ref_bias_add_bias_add(activations, bias, other, other_bias): - return (activations + bias.reshape(1, - -1, - 1, - 1)) + (other + other_bias.reshape(1, - -1, - 1, - 1)) + return (activations + bias.reshape(1, -1, 1, 1)) + (other + other_bias.reshape(1, -1, 1, 1)) @pytest.mark.inference_ops @@ -106,31 +70,16 @@ def ref_bias_add_bias_add(activations, bias, other, other_bias): @pytest.mark.parametrize("image_size", [16, 32, 64]) @pytest.mark.parametrize("channels", channels_list) def test_bias_add_bias_add(batch, image_size, channels): - activations = torch.randn( - (batch, - channels, - image_size, - image_size), - dtype=torch.float16, - device=get_accelerator().device_name()).to(memory_format=torch.channels_last) - other = torch.randn( - (batch, - channels, - image_size, - image_size), - dtype=torch.float16, - device=get_accelerator().device_name()).to(memory_format=torch.channels_last) - bias = torch.randn((channels), - dtype=torch.float16, 
- device=get_accelerator().device_name()) - other_bias = torch.randn((channels), - dtype=torch.float16, - device=get_accelerator().device_name()) - - ref_vals = ref_bias_add_bias_add(activations.clone().detach(), - bias, - other, - other_bias) + activations = torch.randn((batch, channels, image_size, image_size), + dtype=torch.float16, + device=get_accelerator().device_name()).to(memory_format=torch.channels_last) + other = torch.randn((batch, channels, image_size, image_size), + dtype=torch.float16, + device=get_accelerator().device_name()).to(memory_format=torch.channels_last) + bias = torch.randn((channels), dtype=torch.float16, device=get_accelerator().device_name()) + other_bias = torch.randn((channels), dtype=torch.float16, device=get_accelerator().device_name()) + + ref_vals = ref_bias_add_bias_add(activations.clone().detach(), bias, other, other_bias) ds_vals = nhwc_bias_add(activations, bias, other=other, other_bias=other_bias) assert allclose(ds_vals, ref_vals) diff --git a/tests/unit/ops/transformer/inference/test_bias_add.py b/tests/unit/ops/transformer/inference/test_bias_add.py index f3475a14766d01c2a94576a817d0c65b94970e70..36a01f2be8e760043e6883e6519333c4651603f9 100644 --- a/tests/unit/ops/transformer/inference/test_bias_add.py +++ b/tests/unit/ops/transformer/inference/test_bias_add.py @@ -1,4 +1,7 @@ -'''Copyright The Microsoft DeepSpeed Team''' +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team import pytest import torch @@ -7,8 +10,7 @@ from deepspeed.accelerator import get_accelerator from deepspeed.ops.op_builder import InferenceBuilder if not deepspeed.ops.__compatible_ops__[InferenceBuilder.NAME]: - pytest.skip("Inference ops are not available on this system", - allow_module_level=True) + pytest.skip("Inference ops are not available on this system", allow_module_level=True) inference_module = None torch_minor_version = None @@ -40,14 +42,8 @@ def run_bias_add_ds(activations, bias): @pytest.mark.parametrize("channels", [512, 1232, 4096]) @pytest.mark.parametrize("dtype", [torch.float16, torch.float32], ids=["fp16", "fp32"]) def test_bias_add(batch, sequence, channels, dtype): - activations_ds = torch.randn((batch, - sequence, - channels), - dtype=dtype, - device=get_accelerator().device_name()) - bias_ds = torch.randn((channels), - dtype=dtype, - device=get_accelerator().device_name()) + activations_ds = torch.randn((batch, sequence, channels), dtype=dtype, device=get_accelerator().device_name()) + bias_ds = torch.randn((channels), dtype=dtype, device=get_accelerator().device_name()) activations_ref = activations_ds.clone().detach() bias_ref = bias_ds.clone().detach() diff --git a/tests/unit/ops/transformer/inference/test_bias_geglu.py b/tests/unit/ops/transformer/inference/test_bias_geglu.py index c70974e51d94a9cb22a705b90f2df8d0f8b9434e..477c0a3bc7c74ac3fe0e814738dbe19c534aefa2 100644 --- a/tests/unit/ops/transformer/inference/test_bias_geglu.py +++ b/tests/unit/ops/transformer/inference/test_bias_geglu.py @@ -1,6 +1,7 @@ -""" -Copyright 2022 The Microsoft DeepSpeed Team -""" +# Copyright (c) Microsoft Corporation. 
+# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team import pytest import torch @@ -9,8 +10,7 @@ from deepspeed.ops.op_builder import InferenceBuilder from deepspeed.accelerator import get_accelerator if not deepspeed.ops.__compatible_ops__[InferenceBuilder.NAME]: - pytest.skip("Inference ops are not available on this system", - allow_module_level=True) + pytest.skip("Inference ops are not available on this system", allow_module_level=True) inference_module = None torch_minor_version = None @@ -27,8 +27,7 @@ def run_bias_geglu_reference(activations, bias): # Explicitly using the default GeLU activations = activations + bias.reshape(1, 1, -1) hidden_states, gate = activations.chunk(2, dim=-1) - return hidden_states * torch.nn.functional.gelu(gate.to(torch.float32)).to( - activations.dtype) + return hidden_states * torch.nn.functional.gelu(gate.to(torch.float32)).to(activations.dtype) def run_bias_geglu_ds(activation, bias): @@ -44,14 +43,8 @@ def run_bias_geglu_ds(activation, bias): @pytest.mark.parametrize("channels", [512, 1232, 4096]) @pytest.mark.parametrize("dtype", [torch.float16, torch.float32]) def test_bias_geglu(batch, sequence, channels, dtype): - activation = torch.randn((batch, - sequence, - channels * 2), - dtype=dtype, - device=get_accelerator().device_name()) - bias = torch.randn((channels * 2), - dtype=dtype, - device=get_accelerator().device_name()) + activation = torch.randn((batch, sequence, channels * 2), dtype=dtype, device=get_accelerator().device_name()) + bias = torch.randn((channels * 2), dtype=dtype, device=get_accelerator().device_name()) ds_out = run_bias_geglu_ds(activation, bias) ref_out = run_bias_geglu_reference(activation, bias) diff --git a/tests/unit/ops/transformer/inference/test_bias_gelu.py b/tests/unit/ops/transformer/inference/test_bias_gelu.py index 3c1762179ead362e7acc05f83682525e2f85e4e5..1c5e7d58f85a852874eeb794fdd1ed570a86cb2d 100644 --- a/tests/unit/ops/transformer/inference/test_bias_gelu.py +++ b/tests/unit/ops/transformer/inference/test_bias_gelu.py @@ -1,6 +1,7 @@ -""" -Copyright 2022 The Microsoft DeepSpeed Team -""" +# Copyright (c) Microsoft Corporation. 
+# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team import pytest import torch @@ -10,8 +11,7 @@ from deepspeed.ops.op_builder import InferenceBuilder from packaging import version as pkg_version if not deepspeed.ops.__compatible_ops__[InferenceBuilder.NAME]: - pytest.skip("Inference ops are not available on this system", - allow_module_level=True) + pytest.skip("Inference ops are not available on this system", allow_module_level=True) inference_module = None torch_minor_version = None @@ -25,8 +25,7 @@ def allclose(x, y): def run_bias_gelu_reference(activations, bias): # Expected behavior is that of casting to float32 internally and using the tanh approximation - return torch.nn.functional.gelu(activations.to(torch.float32) + - bias.to(torch.float32), + return torch.nn.functional.gelu(activations.to(torch.float32) + bias.to(torch.float32), approximate='tanh').to(activations.dtype) @@ -49,14 +48,8 @@ def test_bias_gelu(batch, sequence, channels, dtype): if pkg_version.parse(torch.__version__) < pkg_version.parse("1.12"): pytest.skip("gelu implementation matches only after torch 1.12") - activations_ds = torch.randn((batch, - sequence, - channels), - dtype=dtype, - device=get_accelerator().device_name()) - bias_ds = torch.randn((channels), - dtype=dtype, - device=get_accelerator().device_name()) + activations_ds = torch.randn((batch, sequence, channels), dtype=dtype, device=get_accelerator().device_name()) + bias_ds = torch.randn((channels), dtype=dtype, device=get_accelerator().device_name()) activations_ref = activations_ds.clone().detach() bias_ref = bias_ds.clone().detach() diff --git a/tests/unit/ops/transformer/inference/test_bias_relu.py b/tests/unit/ops/transformer/inference/test_bias_relu.py index e2b66f6bd2e18da03c2512933bb2d08ee9daf9ad..50daa221f4cc03aaf3d8c6dbe25785a964bffc21 100644 --- a/tests/unit/ops/transformer/inference/test_bias_relu.py +++ b/tests/unit/ops/transformer/inference/test_bias_relu.py @@ -1,6 +1,7 @@ -""" -Copyright 2022 The Microsoft DeepSpeed Team -""" +# Copyright (c) Microsoft Corporation. 
+# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team import pytest import torch @@ -9,8 +10,7 @@ from deepspeed.accelerator import get_accelerator from deepspeed.ops.op_builder import InferenceBuilder if not deepspeed.ops.__compatible_ops__[InferenceBuilder.NAME]: - pytest.skip("Inference ops are not available on this system", - allow_module_level=True) + pytest.skip("Inference ops are not available on this system", allow_module_level=True) inference_module = None torch_minor_version = None @@ -24,8 +24,7 @@ def allclose(x, y): def run_bias_relu_reference(activations, bias): # Expected behavior is that of casting to float32 internally - return torch.nn.functional.relu( - activations.to(torch.float32) + bias.to(torch.float32)).to(activations.dtype) + return torch.nn.functional.relu(activations.to(torch.float32) + bias.to(torch.float32)).to(activations.dtype) def run_bias_relu_ds(activations, bias): @@ -44,14 +43,8 @@ def run_bias_relu_ds(activations, bias): @pytest.mark.parametrize("channels", [512, 1232, 4096]) @pytest.mark.parametrize("dtype", [torch.float16, torch.float32]) def test_bias_relu(batch, sequence, channels, dtype): - activations_ds = torch.randn((batch, - sequence, - channels), - dtype=dtype, - device=get_accelerator().device_name()) - bias_ds = torch.randn((channels), - dtype=dtype, - device=get_accelerator().device_name()) + activations_ds = torch.randn((batch, sequence, channels), dtype=dtype, device=get_accelerator().device_name()) + bias_ds = torch.randn((channels), dtype=dtype, device=get_accelerator().device_name()) activations_ref = activations_ds.clone().detach() bias_ref = bias_ds.clone().detach() diff --git a/tests/unit/ops/transformer/inference/test_layer_norm.py b/tests/unit/ops/transformer/inference/test_layer_norm.py index 61f6455629e6b598e646b3f1fe908d1c34ccb8b8..c765fd86744d1c6f7af3ce807bcfce159901aa16 100644 --- a/tests/unit/ops/transformer/inference/test_layer_norm.py +++ b/tests/unit/ops/transformer/inference/test_layer_norm.py @@ -1,6 +1,7 @@ -""" -Copyright 2022 The Microsoft DeepSpeed Team -""" +# Copyright (c) Microsoft Corporation. 
+# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team import deepspeed import torch @@ -9,8 +10,7 @@ from deepspeed.accelerator import get_accelerator from deepspeed.ops.op_builder import InferenceBuilder if not deepspeed.ops.__compatible_ops__[InferenceBuilder.NAME]: - pytest.skip("Inference ops are not available on this system", - allow_module_level=True) + pytest.skip("Inference ops are not available on this system", allow_module_level=True) inference_module = None @@ -25,11 +25,7 @@ def ref_implementation(vals, gamma, beta, espilon, channels, dtype): vals_f = vals.to(torch.float32) gamma_f = gamma.to(torch.float32) beta_f = beta.to(torch.float32) - return torch.nn.functional.layer_norm(vals_f, - (channels, - ), - weight=gamma_f, - bias=beta_f).to(dtype) + return torch.nn.functional.layer_norm(vals_f, (channels, ), weight=gamma_f, bias=beta_f).to(dtype) def ds_implementation(vals, gamma, beta, epsilon): @@ -45,17 +41,9 @@ def ds_implementation(vals, gamma, beta, epsilon): @pytest.mark.parametrize("channels", [384, 512, 768, 1024, 2048, 8192, 14432]) @pytest.mark.parametrize("dtype", [torch.float16, torch.float32]) def test_layer_norm(batch, seq_len, channels, dtype): - vals = torch.randn((batch, - seq_len, - channels), - dtype=dtype, - device=get_accelerator().current_device_name()) - gamma = torch.randn((channels), - dtype=dtype, - device=get_accelerator().current_device_name()) - beta = torch.rand((channels), - dtype=dtype, - device=get_accelerator().current_device_name()) + vals = torch.randn((batch, seq_len, channels), dtype=dtype, device=get_accelerator().current_device_name()) + gamma = torch.randn((channels), dtype=dtype, device=get_accelerator().current_device_name()) + beta = torch.rand((channels), dtype=dtype, device=get_accelerator().current_device_name()) epsilon = 1e-5 ref_output = ref_implementation(vals, gamma, beta, epsilon, channels, dtype) @@ -70,11 +58,7 @@ def residual_ref_implementation(vals, bias, res, gamma, beta, espilon, channels, res_f = res.to(torch.float32) gamma_f = gamma.to(torch.float32) beta_f = beta.to(torch.float32) - return torch.nn.functional.layer_norm(vals_f + bias_f + res_f, - (channels, - ), - weight=gamma_f, - bias=beta_f).to(dtype) + return torch.nn.functional.layer_norm(vals_f + bias_f + res_f, (channels, ), weight=gamma_f, bias=beta_f).to(dtype) def residual_ds_implementation(vals, bias, res, gamma, beta, epsilon): @@ -90,59 +74,27 @@ def residual_ds_implementation(vals, bias, res, gamma, beta, epsilon): @pytest.mark.parametrize("channels", [384, 512, 768, 1024, 2048, 8192, 14432]) @pytest.mark.parametrize("dtype", [torch.float16, torch.float32]) def test_layer_norm_residual(batch, seq_len, channels, dtype): - vals = torch.randn((batch, - seq_len, - channels), - dtype=dtype, - device=get_accelerator().current_device_name()) - residual = torch.randn((batch, - seq_len, - channels), - dtype=dtype, - device=get_accelerator().current_device_name()) - bias = torch.randn((channels), - dtype=dtype, - device=get_accelerator().current_device_name()) - gamma = torch.randn((channels), - dtype=dtype, - device=get_accelerator().current_device_name()) - beta = torch.rand((channels), - dtype=dtype, - device=get_accelerator().current_device_name()) + vals = torch.randn((batch, seq_len, channels), dtype=dtype, device=get_accelerator().current_device_name()) + residual = torch.randn((batch, seq_len, channels), dtype=dtype, device=get_accelerator().current_device_name()) + bias = torch.randn((channels), dtype=dtype, 
device=get_accelerator().current_device_name()) + gamma = torch.randn((channels), dtype=dtype, device=get_accelerator().current_device_name()) + beta = torch.rand((channels), dtype=dtype, device=get_accelerator().current_device_name()) epsilon = 1e-5 new_output = residual_ds_implementation(vals, bias, residual, gamma, beta, epsilon) - ref_output = residual_ref_implementation(vals, - bias, - residual, - gamma, - beta, - epsilon, - channels, - dtype) + ref_output = residual_ref_implementation(vals, bias, residual, gamma, beta, epsilon, channels, dtype) assert allclose(new_output, ref_output) -def residual_store_ref_implementation(vals, - bias, - res, - gamma, - beta, - espilon, - channels, - dtype): +def residual_store_ref_implementation(vals, bias, res, gamma, beta, espilon, channels, dtype): vals_f = vals.to(torch.float32) bias_f = bias.to(torch.float32).reshape(1, 1, -1) res_f = res.to(torch.float32) gamma_f = gamma.to(torch.float32) beta_f = beta.to(torch.float32) res_output = vals_f + bias_f + res_f - norm_output = torch.nn.functional.layer_norm(res_output, - (channels, - ), - weight=gamma_f, - bias=beta_f).to(dtype) + norm_output = torch.nn.functional.layer_norm(res_output, (channels, ), weight=gamma_f, bias=beta_f).to(dtype) return norm_output, res_output.to(dtype) @@ -150,13 +102,7 @@ def residual_store_ds_implementation(vals, bias, res, gamma, beta, epsilon): global inference_module if inference_module is None: inference_module = InferenceBuilder().load() - return inference_module.layer_norm_residual_store_pre_ln_res( - vals, - bias, - res, - gamma, - beta, - epsilon) + return inference_module.layer_norm_residual_store_pre_ln_res(vals, bias, res, gamma, beta, epsilon) @pytest.mark.inference_ops @@ -165,36 +111,16 @@ def residual_store_ds_implementation(vals, bias, res, gamma, beta, epsilon): @pytest.mark.parametrize("channels", [384, 512, 768, 1024, 2048, 8192, 14432]) @pytest.mark.parametrize("dtype", [torch.float16, torch.float32]) def test_layer_norm_residual_store_pre_ln_res(batch, seq_len, channels, dtype): - vals = torch.randn((batch, - seq_len, - channels), - dtype=dtype, - device=get_accelerator().current_device_name()) - residual = torch.randn((batch, - seq_len, - channels), - dtype=dtype, - device=get_accelerator().current_device_name()) - bias = torch.randn((channels), - dtype=dtype, - device=get_accelerator().current_device_name()) - gamma = torch.randn((channels), - dtype=dtype, - device=get_accelerator().current_device_name()) - beta = torch.rand((channels), - dtype=dtype, - device=get_accelerator().current_device_name()) + vals = torch.randn((batch, seq_len, channels), dtype=dtype, device=get_accelerator().current_device_name()) + residual = torch.randn((batch, seq_len, channels), dtype=dtype, device=get_accelerator().current_device_name()) + bias = torch.randn((channels), dtype=dtype, device=get_accelerator().current_device_name()) + gamma = torch.randn((channels), dtype=dtype, device=get_accelerator().current_device_name()) + beta = torch.rand((channels), dtype=dtype, device=get_accelerator().current_device_name()) epsilon = 1e-5 # Need to run the reference first since there's an in-place component to ours - ref_norm_output, norm_res_output = residual_store_ref_implementation(vals, - bias, - residual, - gamma, - beta, - epsilon, - channels, - dtype) + ref_norm_output, norm_res_output = residual_store_ref_implementation(vals, bias, residual, gamma, beta, epsilon, + channels, dtype) ds_norm_output, ds_res_output = residual_store_ds_implementation(vals, bias, residual, 
gamma, beta, epsilon) diff --git a/tests/unit/ops/transformer/inference/test_moe_res_matmult.py b/tests/unit/ops/transformer/inference/test_moe_res_matmult.py index fdd6e8607c71e320577bef8ea3e215a97d6d68c7..79313bd68bdbff15b082fb35c8eac4bacaa840f1 100644 --- a/tests/unit/ops/transformer/inference/test_moe_res_matmult.py +++ b/tests/unit/ops/transformer/inference/test_moe_res_matmult.py @@ -1,6 +1,7 @@ -""" -Copyright 2022 The Microsoft DeepSpeed Team -""" +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team import pytest import torch @@ -9,8 +10,7 @@ from deepspeed.accelerator import get_accelerator from deepspeed.ops.op_builder import InferenceBuilder if not deepspeed.ops.__compatible_ops__[InferenceBuilder.NAME]: - pytest.skip("Inference ops are not available on this system", - allow_module_level=True) + pytest.skip("Inference ops are not available on this system", allow_module_level=True) inference_module = None @@ -38,26 +38,10 @@ def run_moe_res_matmul_ds(residual, coef, output): @pytest.mark.parametrize("c", [1, 4]) @pytest.mark.parametrize("dtype", [torch.float32, torch.float16]) def test_moe_residual_matmul(hidden_dim, c, dtype): - residual_ds = torch.randn((c, - hidden_dim * c, - hidden_dim), - dtype=dtype, - device=get_accelerator().device_name()) - coeff1 = torch.randn((1, - 1, - hidden_dim), - dtype=dtype, - device=get_accelerator().device_name()) - coeff2 = torch.randn((1, - 1, - hidden_dim), - dtype=dtype, - device=get_accelerator().device_name()) - out_ds = torch.randn((c, - hidden_dim * c, - hidden_dim), - dtype=dtype, - device=get_accelerator().device_name()) + residual_ds = torch.randn((c, hidden_dim * c, hidden_dim), dtype=dtype, device=get_accelerator().device_name()) + coeff1 = torch.randn((1, 1, hidden_dim), dtype=dtype, device=get_accelerator().device_name()) + coeff2 = torch.randn((1, 1, hidden_dim), dtype=dtype, device=get_accelerator().device_name()) + out_ds = torch.randn((c, hidden_dim * c, hidden_dim), dtype=dtype, device=get_accelerator().device_name()) coeff_ds = torch.cat((coeff1, coeff2), dim=-1) residual_ref = residual_ds.clone().detach() coeff_ref = coeff_ds.clone().detach() diff --git a/tests/unit/ops/transformer/inference/test_residual_add.py b/tests/unit/ops/transformer/inference/test_residual_add.py index 0dacee355369cf66a0738a19fc28e4e644ea194e..f5571d33b7bc2c2a20199599ea5db3a905220745 100644 --- a/tests/unit/ops/transformer/inference/test_residual_add.py +++ b/tests/unit/ops/transformer/inference/test_residual_add.py @@ -1,6 +1,7 @@ -""" -Copyright 2022 The Microsoft DeepSpeed Team -""" +# Copyright (c) Microsoft Corporation. 
+# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team import pytest import torch @@ -9,8 +10,7 @@ from deepspeed.accelerator import get_accelerator from deepspeed.ops.op_builder import InferenceBuilder if not deepspeed.ops.__compatible_ops__[InferenceBuilder.NAME]: - pytest.skip("Inference ops are not available on this system", - allow_module_level=True) + pytest.skip("Inference ops are not available on this system", allow_module_level=True) def allclose(x, y): @@ -24,13 +24,7 @@ def inference_module(): return InferenceBuilder().load() -def res_add_bias_ref(hidden_state, - residual, - attn_output, - attn_bias, - final_bias, - mp_size=1, - pre_attn_norm=True): +def res_add_bias_ref(hidden_state, residual, attn_output, attn_bias, final_bias, mp_size=1, pre_attn_norm=True): if pre_attn_norm: hidden_state += (residual + final_bias + attn_output + attn_bias) / mp_size else: @@ -38,43 +32,19 @@ def res_add_bias_ref(hidden_state, return hidden_state -def res_add_bias_ref_gptj(hidden_state, - residual, - attn_output, - attn_bias, - final_bias, - add_attn_bias, - mp_size): +def res_add_bias_ref_gptj(hidden_state, residual, attn_output, attn_bias, final_bias, add_attn_bias, mp_size): hidden_state += attn_output + (residual + final_bias) / mp_size if add_attn_bias: hidden_state += attn_bias / mp_size return hidden_state -def run_residual_add_reference(hidden_state, - residual, - attn_output, - attn_bias, - final_bias, - mlp_after_attn, - add_attn_bias, - mp_size, - pre_attn_norm): +def run_residual_add_reference(hidden_state, residual, attn_output, attn_bias, final_bias, mlp_after_attn, + add_attn_bias, mp_size, pre_attn_norm): if mlp_after_attn: - return res_add_bias_ref(hidden_state, - residual, - attn_output, - attn_bias, - final_bias, - mp_size, - pre_attn_norm) + return res_add_bias_ref(hidden_state, residual, attn_output, attn_bias, final_bias, mp_size, pre_attn_norm) else: - return res_add_bias_ref_gptj(hidden_state, - residual, - attn_output, - attn_bias, - final_bias, - add_attn_bias, + return res_add_bias_ref_gptj(hidden_state, residual, attn_output, attn_bias, final_bias, add_attn_bias, mp_size) @@ -87,58 +57,20 @@ def run_residual_add_reference(hidden_state, @pytest.mark.parametrize("add_bias", [True, False]) @pytest.mark.parametrize("mp_size", [1, 2]) @pytest.mark.parametrize("pre_attn_norm", [True, False]) -def test_residual_add(inference_module, - batch, - sequence, - hidden_dim, - dtype, - mlp_after_attn, - add_bias, - mp_size, +def test_residual_add(inference_module, batch, sequence, hidden_dim, dtype, mlp_after_attn, add_bias, mp_size, pre_attn_norm): - ds_out = torch.randn((batch, - sequence, - hidden_dim), - dtype=dtype, - device=get_accelerator().device_name()) - residual = torch.randn((batch, - sequence, - hidden_dim), - dtype=dtype, - device=get_accelerator().device_name()) - attn_output = torch.randn((batch, - sequence, - hidden_dim), - dtype=dtype, - device=get_accelerator().device_name()) - final_bias = torch.randn((hidden_dim), - dtype=dtype, - device=get_accelerator().device_name()) - attn_bias = torch.randn((hidden_dim), - dtype=dtype, - device=get_accelerator().device_name()) + ds_out = torch.randn((batch, sequence, hidden_dim), dtype=dtype, device=get_accelerator().device_name()) + residual = torch.randn((batch, sequence, hidden_dim), dtype=dtype, device=get_accelerator().device_name()) + attn_output = torch.randn((batch, sequence, hidden_dim), dtype=dtype, device=get_accelerator().device_name()) + final_bias = torch.randn((hidden_dim), dtype=dtype, 
device=get_accelerator().device_name()) + attn_bias = torch.randn((hidden_dim), dtype=dtype, device=get_accelerator().device_name()) ref_out = ds_out.clone() - ref_out = run_residual_add_reference(ref_out, - residual, - attn_output, - attn_bias, - final_bias, - mlp_after_attn, - add_bias, - mp_size, - pre_attn_norm) + ref_out = run_residual_add_reference(ref_out, residual, attn_output, attn_bias, final_bias, mlp_after_attn, + add_bias, mp_size, pre_attn_norm) res_add_args = [ - ds_out, - residual, - attn_output, - attn_bias, - final_bias, - mp_size, - mlp_after_attn, - add_bias, - pre_attn_norm + ds_out, residual, attn_output, attn_bias, final_bias, mp_size, mlp_after_attn, add_bias, pre_attn_norm ] if dtype == torch.float16: diff --git a/tests/unit/pipe/test_pipe_module.py b/tests/unit/pipe/test_pipe_module.py index e8404b0d5a17d229684f3e17acc01e4cb79e81a9..05c6a82ef55a21603d348f85af11e28907c94493 100644 --- a/tests/unit/pipe/test_pipe_module.py +++ b/tests/unit/pipe/test_pipe_module.py @@ -1,4 +1,7 @@ -'''Copyright The Microsoft DeepSpeed Team''' +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team import copy @@ -22,10 +25,8 @@ LAYERS = 8 @pytest.fixture def sequential_model(): model = torch.nn.Sequential( - *[nn.Linear(HIDDEN_DIM, - HIDDEN_DIM) for _ in range(LAYERS)], - nn.Linear(HIDDEN_DIM, - 1), + *[nn.Linear(HIDDEN_DIM, HIDDEN_DIM) for _ in range(LAYERS)], + nn.Linear(HIDDEN_DIM, 1), ) return model @@ -33,15 +34,14 @@ def sequential_model(): @pytest.fixture def simple_config(): config_dict = { - "train_batch_size": 1, + "train_batch_size": 2, "train_micro_batch_size_per_gpu": 1, "steps_per_print": 1, "optimizer": { "type": "Adam", "params": { "lr": 0.001, - "betas": [0.9, - 0.999], + "betas": [0.9, 0.999], "eps": 1e-8, "weight_decay": 3e-7 } @@ -61,7 +61,8 @@ def batch_input(): class TestPipeModuleSequential(DistributedTest): world_size = 2 - def test(self, sequential_model, simple_config, batch_input): + @pytest.mark.parametrize("activation_checkpoints", [False, True]) + def test(self, sequential_model, simple_config, batch_input, activation_checkpoints): base_model = copy.deepcopy(sequential_model) base_input = batch_input.clone().detach() base_output = base_model(base_input) @@ -73,16 +74,21 @@ class TestPipeModuleSequential(DistributedTest): # Ensure all parameters are accounted for. 
my_params = sum(p.numel() for p in pipe_model.parameters()) - total_pipe_params = torch.LongTensor([my_params - ]).to(get_accelerator().device_name()) + total_pipe_params = torch.LongTensor([my_params]).to(get_accelerator().device_name()) dist.all_reduce(total_pipe_params) total_pipe_params = total_pipe_params.item() assert total_pipe_params == base_params - pipe_model, _, _, _ = deepspeed.initialize( - config=simple_config, - model=pipe_model, - model_parameters=[p for p in pipe_model.parameters()]) + pipe_model, _, _, _ = deepspeed.initialize(config=simple_config, + model=pipe_model, + model_parameters=[p for p in pipe_model.parameters()]) + + if activation_checkpoints: + deepspeed.checkpointing.configure(None, + deepspeed_config=pipe_model.config, + partition_activations=True, + contiguous_checkpointing=True, + num_checkpoints=9) if pipe_model.is_first_stage or pipe_model.is_last_stage: pipe_input = base_input.clone().detach().to(get_accelerator().device_name()) diff --git a/tests/unit/profiling/flops_profiler/test_flops_profiler.py b/tests/unit/profiling/flops_profiler/test_flops_profiler.py index 1f93533587c0574b0ee9df690796c57660f05f44..04a63195f5a4ed4c33031f8a0261efc6cffd62d8 100644 --- a/tests/unit/profiling/flops_profiler/test_flops_profiler.py +++ b/tests/unit/profiling/flops_profiler/test_flops_profiler.py @@ -1,4 +1,7 @@ -'''Copyright The Microsoft DeepSpeed Team''' +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team import torch import pytest @@ -6,11 +9,9 @@ import deepspeed from deepspeed.profiling.flops_profiler import get_model_profile from unit.simple_model import SimpleModel, random_dataloader from unit.common import DistributedTest +from unit.util import required_minimum_torch_version -TORCH_MAJOR = int(torch.__version__.split('.')[0]) -TORCH_MINOR = int(torch.__version__.split('.')[1]) -pytestmark = pytest.mark.skipif(TORCH_MAJOR < 1 - or (TORCH_MAJOR == 1 and TORCH_MINOR < 3), +pytestmark = pytest.mark.skipif(not required_minimum_torch_version(major_version=1, minor_version=3), reason='requires Pytorch version 1.3 or above') @@ -22,35 +23,25 @@ TOLERANCE = 0.05 class LeNet5(torch.nn.Module): + def __init__(self, n_classes): super(LeNet5, self).__init__() self.feature_extractor = torch.nn.Sequential( - torch.nn.Conv2d(in_channels=1, - out_channels=6, - kernel_size=5, - stride=1), + torch.nn.Conv2d(in_channels=1, out_channels=6, kernel_size=5, stride=1), torch.nn.Tanh(), torch.nn.AvgPool2d(kernel_size=2), - torch.nn.Conv2d(in_channels=6, - out_channels=16, - kernel_size=5, - stride=1), + torch.nn.Conv2d(in_channels=6, out_channels=16, kernel_size=5, stride=1), torch.nn.Tanh(), torch.nn.AvgPool2d(kernel_size=2), - torch.nn.Conv2d(in_channels=16, - out_channels=120, - kernel_size=5, - stride=1), + torch.nn.Conv2d(in_channels=16, out_channels=120, kernel_size=5, stride=1), torch.nn.Tanh(), ) self.classifier = torch.nn.Sequential( - torch.nn.Linear(in_features=120, - out_features=84), + torch.nn.Linear(in_features=120, out_features=84), torch.nn.Tanh(), - torch.nn.Linear(in_features=84, - out_features=n_classes), + torch.nn.Linear(in_features=84, out_features=n_classes), ) def forward(self, x): @@ -90,9 +81,7 @@ class TestFlopsProfiler(DistributedTest): hidden_dim = 10 model = SimpleModel(hidden_dim, empty_grad=False) - model, _, _, _ = deepspeed.initialize(config=config_dict, - model=model, - model_parameters=model.parameters()) + model, _, _, _ = deepspeed.initialize(config=config_dict, model=model, 
model_parameters=model.parameters()) data_loader = random_dataloader(model=model, total_samples=50, diff --git a/tests/unit/run_test.sh b/tests/unit/run_test.sh deleted file mode 100644 index a2ae7604f393dfa71f2d5db1f9318fa3e46f707f..0000000000000000000000000000000000000000 --- a/tests/unit/run_test.sh +++ /dev/null @@ -1,18 +0,0 @@ -#!/bin/bash -#pytest runtime/half_precision/test_fp16.py::TestZeroStaticScale::test[9-True-1] -#pytest runtime/half_precision/test_fp16.py::TestZeroStaticScale::test[9-True-2] -#pytest runtime/half_precision/test_fp16.py::TestZeroStaticScale::test[9-True-3] -#pytest runtime/half_precision/test_fp16.py::TestZeroStaticScale::test[10-True-1] -#pytest runtime/half_precision/test_fp16.py::TestZeroStaticScale::test[10-True-2] -#pytest runtime/half_precision/test_fp16.py::TestZeroStaticScale::test[10-True-3] -#pytest runtime/half_precision/test_fp16.py::TestZeroEmptyPartition::test[True-1] -#pytest runtime/half_precision/test_fp16.py::TestZeroEmptyPartition::test[True-2] - -pytest checkpoint/test_lr_scheduler.py::TestLRSchedulerCheckpoint::test_checkpoint_lr_scheduler[2-True] -pytest checkpoint/test_lr_scheduler.py::TestLRSchedulerCheckpoint::test_checkpoint_lr_scheduler[3-True] -pytest checkpoint/test_lr_scheduler.py::TestLRSchedulerCheckpoint::test_checkpoint_no_lr_scheduler[2-True] -pytest checkpoint/test_lr_scheduler.py::TestLRSchedulerCheckpoint::test_checkpoint_no_lr_scheduler[3-True] -pytest checkpoint/test_zero_optimizer.py::TestZeROCheckpoint::test_load_optimizer_state[2-True-deepspeed_adam] -pytest checkpoint/test_zero_optimizer.py::TestZeROCheckpoint::test_load_optimizer_state[3-True-deepspeed_adam] -pytest checkpoint/test_zero_optimizer.py::TestZeROCheckpoint::test_not_load_optimizer_state[2-True-deepspeed_adam] -pytest checkpoint/test_zero_optimizer.py::TestZeROCheckpoint::test_not_load_optimizer_state[3-True-deepspeed_adam] diff --git a/tests/unit/runtime/activation_checkpointing/test_activation_checkpointing.py b/tests/unit/runtime/activation_checkpointing/test_activation_checkpointing.py index af354fe1caa6a77c95ad4f97fb6822f752a4b741..0232457a4f9c1201de3a74fb5d83fae63203a580 100644 --- a/tests/unit/runtime/activation_checkpointing/test_activation_checkpointing.py +++ b/tests/unit/runtime/activation_checkpointing/test_activation_checkpointing.py @@ -1,4 +1,7 @@ -'''Copyright The Microsoft DeepSpeed Team''' +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team # TODO: add tests with model parallelism for activation partitioning and other features. 
@@ -106,6 +109,7 @@ def _test_activation_checkpoint_ordering(module, expected_ordering, *inputs): class MaskedLinear(torch.nn.Linear): + def forward(self, x, mask): out = super().forward(x) if mask.is_floating_point(): @@ -118,12 +122,14 @@ class MaskedLinear(torch.nn.Linear): class MaskedLinearSeq(MaskedLinear): """Tests pipeline modules by also returning the mask.""" + def forward(self, x, mask): return super().forward(x, mask), mask class MaskedLinearSeqDup(MaskedLinearSeq): """MaskedLinearSeq, but with more outputs than inputs and in a different order.""" + def forward(self, x, mask): dup = x.clone().detach() * 1.38 # just an arbitrary scaling x, mask = super().forward(x, mask) @@ -131,16 +137,19 @@ class MaskedLinearSeqDup(MaskedLinearSeq): class DropMaskLinear(torch.nn.Linear): + def forward(self, x, mask): return super().forward(x) class LinearNonTensorInput(torch.nn.Linear): + def forward(self, x, non_tensor_input): return super().forward(x) class LinearNonTensorOutput(torch.nn.Linear): + def __init__(self, non_tensor_output): super().__init__(HIDDEN_DIM, HIDDEN_DIM) self.non_tensor_output = non_tensor_output @@ -173,11 +182,10 @@ def _bool_to_float(btensor, dtype=torch.float32): # both bool and float are important, as bool is not differentiable -@pytest.mark.parametrize('mask', - [ - _mixed_mask(), - _bool_to_float(_mixed_mask()), - ]) +@pytest.mark.parametrize('mask', [ + _mixed_mask(), + _bool_to_float(_mixed_mask()), +]) class TestActivationCheckpoint(DistributedTest): world_size = 1 @@ -212,16 +220,7 @@ class TestActivationCheckpoint(DistributedTest): _test_activation_checkpoint(module, *inputs) -@pytest.mark.parametrize( - 'non_tensor', - [None, - 2, - True, - (None, - 2.5), - (None, - True, - torch.randn(HIDDEN_DIM))]) +@pytest.mark.parametrize('non_tensor', [None, 2, True, (None, 2.5), (None, True, torch.randn(HIDDEN_DIM))]) class TestCheckpointNonTensor(DistributedTest): world_size = 1 @@ -238,18 +237,9 @@ class TestCheckpointNonTensor(DistributedTest): _test_activation_checkpoint(module, inputs) -@pytest.mark.parametrize('non_tensor_output', - [ - None, - (torch.randn(HIDDEN_DIM), - 2.5), - (None, - torch.randn(HIDDEN_DIM), - True), - (None, - True, - torch.randn(HIDDEN_DIM)) - ]) +@pytest.mark.parametrize('non_tensor_output', [ + None, (torch.randn(HIDDEN_DIM), 2.5), (None, torch.randn(HIDDEN_DIM), True), (None, True, torch.randn(HIDDEN_DIM)) +]) class TestCheckpointNonTensorOutputOrdering(DistributedTest): world_size = 1 diff --git a/tests/unit/runtime/comm/test_coalesced_collectives.py b/tests/unit/runtime/comm/test_coalesced_collectives.py index fa1041379a6b8f1d70c0b5b3be52f0c0a0fa88b2..8e736c1eaaa6a8382cce770b1e413033da9a3c87 100644 --- a/tests/unit/runtime/comm/test_coalesced_collectives.py +++ b/tests/unit/runtime/comm/test_coalesced_collectives.py @@ -1,5 +1,10 @@ -'''Copyright The Microsoft DeepSpeed Team''' -"""unit tests for coalesced collectives""" +# Copyright (c) Microsoft Corporation. 
+# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team +""" +unit tests for coalesced collectives +""" import torch import deepspeed.comm as dist @@ -13,11 +18,7 @@ class TestReduceScatterCoalesced(DistributedTest): world_size = 2 def test_single_input(self): - input = torch.full((6, - ), - dist.get_rank(), - dtype=torch.half, - device=get_accelerator().current_device_name()) + input = torch.full((6, ), dist.get_rank(), dtype=torch.half, device=get_accelerator().current_device_name()) (output, ) = reduce_scatter_coalesced([input], dist.get_world_group()) @@ -25,17 +26,10 @@ class TestReduceScatterCoalesced(DistributedTest): assert torch.allclose(output, torch.full_like(output, 0.5)) def test_two_inputs(self): - tensor_kwargs = { - "device": get_accelerator().current_device_name(), - "dtype": torch.half - } + tensor_kwargs = {"device": get_accelerator().current_device_name(), "dtype": torch.half} inputs = [ - dist.get_rank() * torch.arange(0, - 6, - **tensor_kwargs), - dist.get_rank() * torch.arange(6, - 9, - **tensor_kwargs), + dist.get_rank() * torch.arange(0, 6, **tensor_kwargs), + dist.get_rank() * torch.arange(6, 9, **tensor_kwargs), ] output1, output2 = reduce_scatter_coalesced(inputs, dist.get_world_group()) @@ -56,10 +50,7 @@ class TestReduceScatterCoalescedTensorSmallerThanWorldSize(DistributedTest): world_size = 2 def test(self): - input = torch.zeros((1, - ), - dtype=torch.half, - device=get_accelerator().current_device_name()) + input = torch.zeros((1, ), dtype=torch.half, device=get_accelerator().current_device_name()) (output, ) = reduce_scatter_coalesced([input], dist.get_world_group()) diff --git a/tests/unit/runtime/half_precision/onebit/test_onebit.py b/tests/unit/runtime/half_precision/onebit/test_onebit.py index 84a36768174ae9dc7825098e91014da01528cd4b..d3b0a90e2fa5a35d8aaec11acbed08566efdd0ab 100644 --- a/tests/unit/runtime/half_precision/onebit/test_onebit.py +++ b/tests/unit/runtime/half_precision/onebit/test_onebit.py @@ -1,4 +1,7 @@ -'''Copyright The Microsoft DeepSpeed Team''' +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team import torch import torch.nn as nn @@ -15,13 +18,12 @@ from deepspeed.runtime.pipe.module import PipelineModule from unit.common import DistributedTest from unit.simple_model import SimpleModel, random_dataloader from unit.alexnet_model import AlexNetPipe, train_cifar +from unit.util import required_minimum_torch_version from deepspeed.accelerator import get_accelerator PipeTopo = PipeDataParallelTopology -TORCH_MAJOR = int(torch.__version__.split(".")[0]) -TORCH_MINOR = int(torch.__version__.split(".")[1]) -if TORCH_MAJOR < 1 or TORCH_MINOR < 8: +if not required_minimum_torch_version(major_version=1, minor_version=8): pytest.skip( "NCCL-based 1-bit compression requires torch 1.8 or higher", allow_module_level=True, @@ -29,9 +31,8 @@ if TORCH_MAJOR < 1 or TORCH_MINOR < 8: rocm_version = OpBuilder.installed_rocm_version() if rocm_version[0] > 4: - pytest.skip( - "NCCL-based 1-bit compression is not yet supported w. ROCm 5 until cupy supports ROCm 5", - allow_module_level=True) + pytest.skip("NCCL-based 1-bit compression is not yet supported w. 
ROCm 5 until cupy supports ROCm 5", + allow_module_level=True) @pytest.mark.parametrize("dtype", [torch.float32, torch.float16], ids=["fp32", "fp16"]) @@ -62,9 +63,7 @@ class TestOneBitAdamBasic(DistributedTest): hidden_dim = 10 model = SimpleModel(hidden_dim) - model, _, _, _ = deepspeed.initialize( - config=config_dict, model=model, model_parameters=model.parameters() - ) + model, _, _, _ = deepspeed.initialize(config=config_dict, model=model, model_parameters=model.parameters()) data_loader = random_dataloader( model=model, total_samples=50, @@ -127,10 +126,7 @@ class TestOneBitAdamExpAvgMask(DistributedTest): model=model, model_parameters=optimizer_grouped_parameters, ) - data_loader = random_dataloader(model=model, - total_samples=50, - hidden_dim=hidden_dim, - device=model.device) + data_loader = random_dataloader(model=model, total_samples=50, hidden_dim=hidden_dim, device=model.device) for n, batch in enumerate(data_loader): loss = model(batch[0], batch[1]) model.backward(loss) @@ -234,14 +230,12 @@ class TestOneBitAdamCheckpointing(DistributedTest): # Test whether momentum mask still exist after saving checkpoint assert optimizer_1.optimizer.adam_freeze_key is True mask1 = mask1.to(device=optimizer_1.param_groups[0]["exp_avg_mask"].device) - assert torch.allclose( - optimizer_1.param_groups[0]["exp_avg_mask"], mask1, atol=1e-07 - ), f"Incorrect momentum mask" + assert torch.allclose(optimizer_1.param_groups[0]["exp_avg_mask"], mask1, + atol=1e-07), f"Incorrect momentum mask" save_folder = os.path.join(tmpdir, "saved_checkpoint") model_1.save_checkpoint(save_folder, tag=None) - assert torch.allclose( - optimizer_1.param_groups[0]["exp_avg_mask"], mask1, atol=1e-07 - ), f"Momentum mask should not change after saving checkpoint" + assert torch.allclose(optimizer_1.param_groups[0]["exp_avg_mask"], mask1, + atol=1e-07), f"Momentum mask should not change after saving checkpoint" model_2, optimizer_2, _, _ = deepspeed.initialize( config=config_dict, @@ -250,18 +244,16 @@ class TestOneBitAdamCheckpointing(DistributedTest): ) # Test whether momentum mask stays the same after loading checkpoint mask2 = mask2.to(device=optimizer_2.param_groups[0]["exp_avg_mask"].device) - assert torch.allclose( - optimizer_2.param_groups[0]["exp_avg_mask"], mask2, atol=1e-07 - ), f"Incorrect momentum mask" + assert torch.allclose(optimizer_2.param_groups[0]["exp_avg_mask"], mask2, + atol=1e-07), f"Incorrect momentum mask" model_2.load_checkpoint( save_folder, tag=None, load_optimizer_states=True, load_lr_scheduler_states=True, ) - assert torch.allclose( - optimizer_2.param_groups[0]["exp_avg_mask"], mask2, atol=1e-07 - ), f"Momentum mask should not change after loading checkpoint" + assert torch.allclose(optimizer_2.param_groups[0]["exp_avg_mask"], mask2, + atol=1e-07), f"Momentum mask should not change after loading checkpoint" # Test whether worker&server error is reset for v in optimizer_2.state.values(): assert "worker_error" not in v, f"Incorrect worker error" @@ -286,18 +278,15 @@ class TestOneBitAdamCheckpointing(DistributedTest): model_3.step() assert optimizer_3.optimizer.adam_freeze_key is True # Test whether momentum mask stays the same after loading checkpoint - assert ( - "exp_avg_mask" not in optimizer_3.param_groups[0] - ), f"Incorrect momentum mask" + assert ("exp_avg_mask" not in optimizer_3.param_groups[0]), f"Incorrect momentum mask" model_3.load_checkpoint( save_folder, tag=None, load_optimizer_states=True, load_lr_scheduler_states=True, ) - assert ( - "exp_avg_mask" not in 
optimizer_3.param_groups[0] - ), f"Momentum mask should not change after loading checkpoint" + assert ("exp_avg_mask" + not in optimizer_3.param_groups[0]), f"Momentum mask should not change after loading checkpoint" # Test whether worker&server error is reset for v in optimizer_3.state.values(): assert "worker_error" not in v, f"Incorrect worker error" @@ -328,13 +317,8 @@ class TestOneBitAdamCheckpointing(DistributedTest): hidden_dim = 10 model = SimpleModel(hidden_dim) - model, _, _, _ = deepspeed.initialize( - config=config_dict, model=model, model_parameters=model.parameters() - ) - data_loader = random_dataloader(model=model, - total_samples=100, - hidden_dim=hidden_dim, - device=model.device) + model, _, _, _ = deepspeed.initialize(config=config_dict, model=model, model_parameters=model.parameters()) + data_loader = random_dataloader(model=model, total_samples=100, hidden_dim=hidden_dim, device=model.device) save_folder = os.path.join(tmpdir, "saved_checkpoint") for n, batch in enumerate(data_loader): loss = model(batch[0], batch[1]) @@ -376,8 +360,7 @@ class TestOneBitAdamFP16Pipeline(DistributedTest): "type": "OneBitAdam", "params": { "lr": 0.00001, - "betas": [0.9, - 0.999], + "betas": [0.9, 0.999], "eps": 1e-8, "weight_decay": 3e-7, "freeze_step": 200, @@ -407,9 +390,7 @@ class TestOneBitAdamFP16Pipeline(DistributedTest): init_net = AlexNetPipe() test_net = copy.deepcopy(init_net) - test_model = PipelineModule(layers=test_net.to_layers(), - topology=topo, - loss_fn=nn.CrossEntropyLoss()) + test_model = PipelineModule(layers=test_net.to_layers(), topology=topo, loss_fn=nn.CrossEntropyLoss()) test_losses = train_cifar( test_model, @@ -450,9 +431,7 @@ class TestZeroOneAdamBasic(DistributedTest): hidden_dim = 10 model = SimpleModel(hidden_dim) - model, _, _, _ = deepspeed.initialize( - config=config_dict, model=model, model_parameters=model.parameters() - ) + model, _, _, _ = deepspeed.initialize(config=config_dict, model=model, model_parameters=model.parameters()) data_loader = random_dataloader( model=model, total_samples=50, @@ -518,10 +497,7 @@ class TestZeroOneAdamExpAvgMask(DistributedTest): model=model, model_parameters=optimizer_grouped_parameters, ) - data_loader = random_dataloader(model=model, - total_samples=50, - hidden_dim=hidden_dim, - device=model.device) + data_loader = random_dataloader(model=model, total_samples=50, hidden_dim=hidden_dim, device=model.device) for n, batch in enumerate(data_loader): loss = model(batch[0], batch[1]) model.backward(loss) @@ -627,14 +603,12 @@ class TestZeroOneAdamCheckpointing(DistributedTest): model_1.step() # Test whether momentum mask still exist after saving checkpoint mask1 = mask1.to(device=optimizer_1.param_groups[0]["exp_avg_mask"].device) - assert torch.allclose( - optimizer_1.param_groups[0]["exp_avg_mask"], mask1, atol=1e-07 - ), f"Incorrect momentum mask" + assert torch.allclose(optimizer_1.param_groups[0]["exp_avg_mask"], mask1, + atol=1e-07), f"Incorrect momentum mask" save_folder = os.path.join(tmpdir, "saved_checkpoint") model_1.save_checkpoint(save_folder, tag=None) - assert torch.allclose( - optimizer_1.param_groups[0]["exp_avg_mask"], mask1, atol=1e-07 - ), f"Momentum mask should not change after saving checkpoint" + assert torch.allclose(optimizer_1.param_groups[0]["exp_avg_mask"], mask1, + atol=1e-07), f"Momentum mask should not change after saving checkpoint" model_2, optimizer_2, _, _ = deepspeed.initialize( config=config_dict, @@ -643,18 +617,16 @@ class TestZeroOneAdamCheckpointing(DistributedTest): ) # Test 
whether momentum mask stays the same after loading checkpoint mask2 = mask2.to(device=optimizer_2.param_groups[0]["exp_avg_mask"].device) - assert torch.allclose( - optimizer_2.param_groups[0]["exp_avg_mask"], mask2, atol=1e-07 - ), f"Incorrect momentum mask" + assert torch.allclose(optimizer_2.param_groups[0]["exp_avg_mask"], mask2, + atol=1e-07), f"Incorrect momentum mask" model_2.load_checkpoint( save_folder, tag=None, load_optimizer_states=True, load_lr_scheduler_states=True, ) - assert torch.allclose( - optimizer_2.param_groups[0]["exp_avg_mask"], mask2, atol=1e-07 - ), f"Momentum mask should not change after loading checkpoint" + assert torch.allclose(optimizer_2.param_groups[0]["exp_avg_mask"], mask2, + atol=1e-07), f"Momentum mask should not change after loading checkpoint" # Test whether worker&server error is reset for v in optimizer_2.state.values(): assert "worker_error" not in v, f"Incorrect worker error" @@ -677,18 +649,15 @@ class TestZeroOneAdamCheckpointing(DistributedTest): model_3.backward(loss) model_3.step() # Test whether momentum mask stays the same after loading checkpoint - assert ( - "exp_avg_mask" not in optimizer_3.param_groups[0] - ), f"Incorrect momentum mask" + assert ("exp_avg_mask" not in optimizer_3.param_groups[0]), f"Incorrect momentum mask" model_3.load_checkpoint( save_folder, tag=None, load_optimizer_states=True, load_lr_scheduler_states=True, ) - assert ( - "exp_avg_mask" not in optimizer_3.param_groups[0] - ), f"Momentum mask should not change after loading checkpoint" + assert ("exp_avg_mask" + not in optimizer_3.param_groups[0]), f"Momentum mask should not change after loading checkpoint" # Test whether worker&server error is reset for v in optimizer_3.state.values(): assert "worker_error" not in v, f"Incorrect worker error" @@ -721,13 +690,8 @@ class TestZeroOneAdamCheckpointing(DistributedTest): hidden_dim = 10 model = SimpleModel(hidden_dim) - model, _, _, _ = deepspeed.initialize( - config=config_dict, model=model, model_parameters=model.parameters() - ) - data_loader = random_dataloader(model=model, - total_samples=100, - hidden_dim=hidden_dim, - device=model.device) + model, _, _, _ = deepspeed.initialize(config=config_dict, model=model, model_parameters=model.parameters()) + data_loader = random_dataloader(model=model, total_samples=100, hidden_dim=hidden_dim, device=model.device) save_folder = os.path.join(tmpdir, "saved_checkpoint") for n, batch in enumerate(data_loader): loss = model(batch[0], batch[1]) @@ -769,8 +733,7 @@ class TestZeroOneAdamFP16Pipeline(DistributedTest): "type": "ZeroOneAdam", "params": { "lr": 0.00001, - "betas": [0.9, - 0.999], + "betas": [0.9, 0.999], "eps": 1e-8, "weight_decay": 3e-7, "var_freeze_step": 4, @@ -803,9 +766,7 @@ class TestZeroOneAdamFP16Pipeline(DistributedTest): init_net = AlexNetPipe() test_net = copy.deepcopy(init_net) - test_model = PipelineModule(layers=test_net.to_layers(), - topology=topo, - loss_fn=nn.CrossEntropyLoss()) + test_model = PipelineModule(layers=test_net.to_layers(), topology=topo, loss_fn=nn.CrossEntropyLoss()) test_losses = train_cifar( test_model, @@ -849,9 +810,7 @@ class TestOneBitLambBasic(DistributedTest): hidden_dim = 10 model = SimpleModel(hidden_dim) - model, _, _, _ = deepspeed.initialize( - config=config_dict, model=model, model_parameters=model.parameters() - ) + model, _, _, _ = deepspeed.initialize(config=config_dict, model=model, model_parameters=model.parameters()) data_loader = random_dataloader( model=model, total_samples=50, @@ -919,10 +878,7 @@ class 
TestOneBitLampExpAvgMask(DistributedTest): model=model, model_parameters=optimizer_grouped_parameters, ) - data_loader = random_dataloader(model=model, - total_samples=50, - hidden_dim=hidden_dim, - device=model.device) + data_loader = random_dataloader(model=model, total_samples=50, hidden_dim=hidden_dim, device=model.device) for n, batch in enumerate(data_loader): loss = model(batch[0], batch[1]) model.backward(loss) @@ -1030,18 +986,16 @@ class TestOneBitLambCheckpointing(DistributedTest): # Test whether momentum mask still exist after saving checkpoint assert optimizer_1.optimizer.lamb_freeze_key is True mask1 = mask1.to(device=optimizer_1.param_groups[0]["exp_avg_mask"].device) - assert torch.allclose( - optimizer_1.param_groups[0]["exp_avg_mask"], mask1, atol=1e-07 - ), f"Incorrect momentum mask" + assert torch.allclose(optimizer_1.param_groups[0]["exp_avg_mask"], mask1, + atol=1e-07), f"Incorrect momentum mask" scaling_coeff_1 = [] for v in optimizer_1.state.values(): assert "scaling_coeff" in v, f"Incorrect scaling_coeff" scaling_coeff_1.append(v["scaling_coeff"]) save_folder = os.path.join(tmpdir, "saved_checkpoint") model_1.save_checkpoint(save_folder, tag=None) - assert torch.allclose( - optimizer_1.param_groups[0]["exp_avg_mask"], mask1, atol=1e-07 - ), f"Momentum mask should not change after saving checkpoint" + assert torch.allclose(optimizer_1.param_groups[0]["exp_avg_mask"], mask1, + atol=1e-07), f"Momentum mask should not change after saving checkpoint" model_2, optimizer_2, _, _ = deepspeed.initialize( config=config_dict, @@ -1050,18 +1004,16 @@ class TestOneBitLambCheckpointing(DistributedTest): ) # Test whether momentum mask stays the same after loading checkpoint mask2 = mask2.to(device=optimizer_2.param_groups[0]["exp_avg_mask"].device) - assert torch.allclose( - optimizer_2.param_groups[0]["exp_avg_mask"], mask2, atol=1e-07 - ), f"Incorrect momentum mask" + assert torch.allclose(optimizer_2.param_groups[0]["exp_avg_mask"], mask2, + atol=1e-07), f"Incorrect momentum mask" model_2.load_checkpoint( save_folder, tag=None, load_optimizer_states=True, load_lr_scheduler_states=True, ) - assert torch.allclose( - optimizer_2.param_groups[0]["exp_avg_mask"], mask2, atol=1e-07 - ), f"Momentum mask should not change after loading checkpoint" + assert torch.allclose(optimizer_2.param_groups[0]["exp_avg_mask"], mask2, + atol=1e-07), f"Momentum mask should not change after loading checkpoint" # Test whether worker&server error is reset assert len(optimizer_2.optimizer.worker_errors) == 0, f"Incorrect worker error" assert len(optimizer_2.optimizer.server_errors) == 0, f"Incorrect server error" @@ -1070,9 +1022,7 @@ class TestOneBitLambCheckpointing(DistributedTest): for v in optimizer_2.state.values(): assert "scaling_coeff" in v, f"Incorrect scaling_coeff" scaling_coeff_2.append(v["scaling_coeff"]) - assert list(sorted(scaling_coeff_2)) == list( - sorted(scaling_coeff_1) - ), f"Incorrect scaling_coeffs" + assert list(sorted(scaling_coeff_2)) == list(sorted(scaling_coeff_1)), f"Incorrect scaling_coeffs" assert optimizer_2.optimizer.lamb_freeze_key is True model_3, optimizer_3, _, _ = deepspeed.initialize( @@ -1093,18 +1043,15 @@ class TestOneBitLambCheckpointing(DistributedTest): model_3.step() assert optimizer_3.optimizer.lamb_freeze_key is True # Test whether momentum mask stays the same after loading checkpoint - assert ( - "exp_avg_mask" not in optimizer_3.param_groups[0] - ), f"Incorrect momentum mask" + assert ("exp_avg_mask" not in optimizer_3.param_groups[0]), f"Incorrect 
momentum mask" model_3.load_checkpoint( save_folder, tag=None, load_optimizer_states=True, load_lr_scheduler_states=True, ) - assert ( - "exp_avg_mask" not in optimizer_3.param_groups[0] - ), f"Momentum mask should not change after loading checkpoint" + assert ("exp_avg_mask" + not in optimizer_3.param_groups[0]), f"Momentum mask should not change after loading checkpoint" # Test whether worker&server error is reset assert len(optimizer_3.optimizer.worker_errors) == 0, f"Incorrect worker error" assert len(optimizer_3.optimizer.server_errors) == 0, f"Incorrect server error" @@ -1145,13 +1092,8 @@ class TestOneBitLambCheckpointing(DistributedTest): hidden_dim = 10 model = SimpleModel(hidden_dim) - model, _, _, _ = deepspeed.initialize( - config=config_dict, model=model, model_parameters=model.parameters() - ) - data_loader = random_dataloader(model=model, - total_samples=100, - hidden_dim=hidden_dim, - device=model.device) + model, _, _, _ = deepspeed.initialize(config=config_dict, model=model, model_parameters=model.parameters()) + data_loader = random_dataloader(model=model, total_samples=100, hidden_dim=hidden_dim, device=model.device) save_folder = os.path.join(tmpdir, "saved_checkpoint") for n, batch in enumerate(data_loader): loss = model(batch[0], batch[1]) @@ -1193,8 +1135,7 @@ class TestOneBitLambFP16Pipeline(DistributedTest): "type": "OneBitLamb", "params": { "lr": 0.00001, - "betas": [0.9, - 0.999], + "betas": [0.9, 0.999], "eps": 1e-8, "weight_decay": 3e-7, "freeze_step": 200, @@ -1224,9 +1165,7 @@ class TestOneBitLambFP16Pipeline(DistributedTest): init_net = AlexNetPipe() test_net = copy.deepcopy(init_net) - test_model = PipelineModule(layers=test_net.to_layers(), - topology=topo, - loss_fn=nn.CrossEntropyLoss()) + test_model = PipelineModule(layers=test_net.to_layers(), topology=topo, loss_fn=nn.CrossEntropyLoss()) test_losses = train_cifar( test_model, @@ -1258,15 +1197,11 @@ class TestCompressedAllReduceBasic(DistributedTest): worker_error = a - a_compressed dist.all_reduce(a_compressed) a_compressed.mul_(1 / dist.get_world_size()) - a_server_sign = ( - a_compressed.sign().add_(1).bool().float().add_(-0.5).mul_(2.0)) + a_server_sign = (a_compressed.sign().add_(1).bool().float().add_(-0.5).mul_(2.0)) a_list = torch.chunk(a_compressed, chunks=dist.get_world_size()) - server_scale = [ - chunk_a.norm() / np.sqrt(chunk_a.numel()) for chunk_a in a_list - ] + server_scale = [chunk_a.norm() / np.sqrt(chunk_a.numel()) for chunk_a in a_list] a_sign_list = torch.chunk(a_server_sign, dist.get_world_size()) - a_server_compressed = torch.cat( - [server_scale[i] * a_sign_list[i] for i in range(dist.get_world_size())]) + a_server_compressed = torch.cat([server_scale[i] * a_sign_list[i] for i in range(dist.get_world_size())]) rank = dist.get_rank() server_error = a_list[rank] - server_scale[rank] * a_sign_list[rank] get_accelerator().synchronize() diff --git a/tests/unit/runtime/half_precision/test_bf16.py b/tests/unit/runtime/half_precision/test_bf16.py index 3bc5cb138c9b9ab2d8a9df132349b880c06738d5..740fa30641a1b00f7c174c852c3561ed178ac048 100644 --- a/tests/unit/runtime/half_precision/test_bf16.py +++ b/tests/unit/runtime/half_precision/test_bf16.py @@ -1,4 +1,7 @@ -'''Copyright The Microsoft DeepSpeed Team''' +# Copyright (c) Microsoft Corporation. 
+# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team import torch import deepspeed @@ -59,9 +62,7 @@ class TestAdamBF16ZeroOneCycleCompatibility(DistributedTest): hidden_dim = 10 model = SimpleModel(hidden_dim) - model, _, _, _ = deepspeed.initialize(config=config_dict, - model=model, - model_parameters=model.parameters()) + model, _, _, _ = deepspeed.initialize(config=config_dict, model=model, model_parameters=model.parameters()) data_loader = random_dataloader(model=model, total_samples=50, hidden_dim=hidden_dim, @@ -154,9 +155,7 @@ class TestZeroEmptyPartition(DistributedTest): # Ensure model has 2 parameters, to cause empty partition with DP=3 assert len(list(model.parameters())) == 2 - model, _, _, _ = deepspeed.initialize(config=config_dict, - model=model, - model_parameters=model.parameters()) + model, _, _, _ = deepspeed.initialize(config=config_dict, model=model, model_parameters=model.parameters()) # Now make sure things work.. data_loader = random_dataloader(model=model, @@ -197,9 +196,7 @@ class TestZeroSupportedClientOptimizer(DistributedTest): model = SimpleModel(hidden_dim) client_optimizer = optimizer_constructor(params=model.parameters()) - model, _, _, _ = deepspeed.initialize(config=config_dict, - model=model, - optimizer=client_optimizer) + model, _, _, _ = deepspeed.initialize(config=config_dict, model=model, optimizer=client_optimizer) class TestZero2ReduceScatterOff(DistributedTest): @@ -239,9 +236,7 @@ class TestZero2ReduceScatterOff(DistributedTest): hidden_dim = 10 model = SimpleModel(hidden_dim) - model, _, _, _ = deepspeed.initialize(config=config_dict, - model=model, - model_parameters=model.parameters()) + model, _, _, _ = deepspeed.initialize(config=config_dict, model=model, model_parameters=model.parameters()) data_loader = random_dataloader(model=model, total_samples=50, hidden_dim=hidden_dim, @@ -279,9 +274,7 @@ class TestZeroEmptyGrad(DistributedTest): model = SimpleModel(hidden_dim) optimizer = torch.optim.Adam(model.parameters()) - model, _, _, _ = deepspeed.initialize(config=config_dict, - model=model, - optimizer=optimizer) + model, _, _, _ = deepspeed.initialize(config=config_dict, model=model, optimizer=optimizer) data_loader = random_dataloader(model=model, total_samples=50, hidden_dim=hidden_dim, @@ -293,18 +286,8 @@ class TestZeroEmptyGrad(DistributedTest): model.step() -@pytest.mark.parametrize("comp_type", - [torch.float16, - torch.bfloat16, - torch.float], - ids=["fp16", - "bfp16", - "fp32"]) -@pytest.mark.parametrize("comm_type", - [torch.float16, - torch.bfloat16], - ids=["fp16", - "bfp16"]) +@pytest.mark.parametrize("comp_type", [torch.float16, torch.bfloat16, torch.float], ids=["fp16", "bfp16", "fp32"]) +@pytest.mark.parametrize("comm_type", [torch.float16, torch.bfloat16, None], ids=["fp16", "bfp16", "default"]) class TestZeroDtypeCocktail(DistributedTest): world_size = 2 @@ -329,15 +312,16 @@ class TestZeroDtypeCocktail(DistributedTest): "zero_optimization": { "stage": 2 }, - "communication_data_type": type_str[comm_type] } + if comm_type is not None: + config_dict["communication_data_type"] = type_str[comm_type] + else: + comm_type = comp_type hidden_dim = 10 model = SimpleModel(hidden_dim) optimizer = torch.optim.Adam(model.parameters()) - model, _, _, _ = deepspeed.initialize(config=config_dict, - model=model, - optimizer=optimizer) + model, _, _, _ = deepspeed.initialize(config=config_dict, model=model, optimizer=optimizer) data_loader = random_dataloader(model=model, total_samples=2, hidden_dim=hidden_dim, diff --git 
a/tests/unit/runtime/half_precision/test_dynamic_loss_scale.py b/tests/unit/runtime/half_precision/test_dynamic_loss_scale.py index 3052c4ee117acad6f465e4afa17b10902d454f7a..2a58fd6b4a573c83a1cdad79a0e0abfd59e9bb80 100644 --- a/tests/unit/runtime/half_precision/test_dynamic_loss_scale.py +++ b/tests/unit/runtime/half_precision/test_dynamic_loss_scale.py @@ -1,4 +1,7 @@ -'''Copyright The Microsoft DeepSpeed Team''' +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team import torch import deepspeed @@ -37,9 +40,7 @@ class TestFused(DistributedTest): } hidden_dim = 1 model = SimpleModel(hidden_dim) - model, optim, _, _ = deepspeed.initialize(config=config_dict, - model=model, - model_parameters=model.parameters()) + model, optim, _, _ = deepspeed.initialize(config=config_dict, model=model, model_parameters=model.parameters()) expected_loss_scale = 2**8 expected_scale_window = 2 @@ -74,9 +75,7 @@ class TestFused(DistributedTest): } hidden_dim = 1 model = SimpleModel(hidden_dim) - model, optim, _, _ = deepspeed.initialize(config=config_dict, - model=model, - model_parameters=model.parameters()) + model, optim, _, _ = deepspeed.initialize(config=config_dict, model=model, model_parameters=model.parameters()) expected_loss_scale = 2**4 # Ensure the dynamic loss scaler is correctly configured. @@ -109,9 +108,7 @@ class TestFused(DistributedTest): } hidden_dim = 1 model = SimpleModel(hidden_dim) - model, optim, _, _ = deepspeed.initialize(config=config_dict, - model=model, - model_parameters=model.parameters()) + model, optim, _, _ = deepspeed.initialize(config=config_dict, model=model, model_parameters=model.parameters()) expected_loss_scale = 2**8 expected_scale_window = 2 @@ -168,9 +165,7 @@ class TestUnfused(DistributedTest): } hidden_dim = 1 model = SimpleModel(hidden_dim) - model, optim, _, _ = deepspeed.initialize(config=config_dict, - model=model, - model_parameters=model.parameters()) + model, optim, _, _ = deepspeed.initialize(config=config_dict, model=model, model_parameters=model.parameters()) expected_loss_scale = 2**8 expected_scale_window = 2 # Ensure the dynamic loss scaler is correctly configured. @@ -205,9 +200,7 @@ class TestUnfused(DistributedTest): } hidden_dim = 1 model = SimpleModel(hidden_dim) - model, optim, _, _ = deepspeed.initialize(config=config_dict, - model=model, - model_parameters=model.parameters()) + model, optim, _, _ = deepspeed.initialize(config=config_dict, model=model, model_parameters=model.parameters()) expected_loss_scale = 2**4 expected_min_loss_scale = 0.25 @@ -242,9 +235,7 @@ class TestUnfused(DistributedTest): } hidden_dim = 1 model = SimpleModel(hidden_dim) - model, optim, _, _ = deepspeed.initialize(config=config_dict, - model=model, - model_parameters=model.parameters()) + model, optim, _, _ = deepspeed.initialize(config=config_dict, model=model, model_parameters=model.parameters()) expected_loss_scale = 2**8 expected_scale_window = 2 diff --git a/tests/unit/runtime/half_precision/test_fp16.py b/tests/unit/runtime/half_precision/test_fp16.py index c3c933fca144ced19bd0d9bff6dc032b4a22183d..6d88af00078ae0794dae5d87e453da7df975bdb1 100644 --- a/tests/unit/runtime/half_precision/test_fp16.py +++ b/tests/unit/runtime/half_precision/test_fp16.py @@ -1,4 +1,7 @@ -'''Copyright The Microsoft DeepSpeed Team''' +# Copyright (c) Microsoft Corporation. 
+# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team import torch import deepspeed.comm as dist @@ -16,8 +19,7 @@ try: _amp_available = True except ImportError: _amp_available = False -amp_available = pytest.mark.skipif(not _amp_available, - reason="apex/amp is not installed") +amp_available = pytest.mark.skipif(not _amp_available, reason="apex/amp is not installed") class TestLambFP32GradClip(DistributedTest): @@ -38,9 +40,7 @@ class TestLambFP32GradClip(DistributedTest): hidden_dim = 10 model = SimpleModel(hidden_dim) - model, _, _, _ = deepspeed.initialize(config=config_dict, - model=model, - model_parameters=model.parameters()) + model, _, _, _ = deepspeed.initialize(config=config_dict, model=model, model_parameters=model.parameters()) data_loader = random_dataloader(model=model, total_samples=50, hidden_dim=hidden_dim, @@ -73,13 +73,8 @@ class TestLambFP16(DistributedTest): hidden_dim = 10 model = SimpleModel(hidden_dim) - model, _, _, _ = deepspeed.initialize(config=config_dict, - model=model, - model_parameters=model.parameters()) - data_loader = random_dataloader(model=model, - total_samples=50, - hidden_dim=hidden_dim, - device=model.device) + model, _, _, _ = deepspeed.initialize(config=config_dict, model=model, model_parameters=model.parameters()) + data_loader = random_dataloader(model=model, total_samples=50, hidden_dim=hidden_dim, device=model.device) for n, batch in enumerate(data_loader): loss = model(batch[0], batch[1]) model.backward(loss) @@ -103,13 +98,8 @@ class TestLambFP16(DistributedTest): hidden_dim = 10 model = SimpleModel(hidden_dim, empty_grad=True) - model, _, _, _ = deepspeed.initialize(config=config_dict, - model=model, - model_parameters=model.parameters()) - data_loader = random_dataloader(model=model, - total_samples=50, - hidden_dim=hidden_dim, - device=model.device) + model, _, _, _ = deepspeed.initialize(config=config_dict, model=model, model_parameters=model.parameters()) + data_loader = random_dataloader(model=model, total_samples=50, hidden_dim=hidden_dim, device=model.device) for n, batch in enumerate(data_loader): loss = model(batch[0], batch[1]) model.backward(loss) @@ -137,9 +127,7 @@ class TestAdamFP32EmptyGrad(DistributedTest): hidden_dim = 10 model = SimpleModel(hidden_dim, empty_grad=True) - model, _, _, _ = deepspeed.initialize(config=config_dict, - model=model, - model_parameters=model.parameters()) + model, _, _, _ = deepspeed.initialize(config=config_dict, model=model, model_parameters=model.parameters()) data_loader = random_dataloader(model=model, total_samples=50, hidden_dim=hidden_dim, @@ -155,24 +143,13 @@ class TestAdamwFP16Basic(DistributedTest): world_size = 1 def test(self): - config_dict = { - "train_batch_size": 1, - "steps_per_print": 1, - "fp16": { - "enabled": True - } - } + config_dict = {"train_batch_size": 1, "steps_per_print": 1, "fp16": {"enabled": True}} hidden_dim = 10 model = SimpleModel(hidden_dim) optimizer = torch.optim.AdamW(params=model.parameters()) - model, _, _, _ = deepspeed.initialize(config=config_dict, - model=model, - optimizer=optimizer) - data_loader = random_dataloader(model=model, - total_samples=50, - hidden_dim=hidden_dim, - device=model.device) + model, _, _, _ = deepspeed.initialize(config=config_dict, model=model, optimizer=optimizer) + data_loader = random_dataloader(model=model, total_samples=50, hidden_dim=hidden_dim, device=model.device) for n, batch in enumerate(data_loader): loss = model(batch[0], batch[1]) model.backward(loss) @@ -186,20 +163,12 @@ class 
TestFP16OptimizerForMoE(DistributedTest): if not required_torch_version(): pytest.skip("DeepSpeed MoE tests need torch 1.8 or higher to run correctly") - config_dict = { - "train_batch_size": 2, - "steps_per_print": 1, - "fp16": { - "enabled": True - } - } + config_dict = {"train_batch_size": 2, "steps_per_print": 1, "fp16": {"enabled": True}} hidden_dim = 10 def mock_unscale_and_clip_grads(total_norm, apply_scale=True): torch_norm_tensor = get_accelerator().FloatTensor([total_norm]) - all_gather_results = [ - torch.zeros_like(torch_norm_tensor) for _ in range(dist.get_world_size()) - ] + all_gather_results = [torch.zeros_like(torch_norm_tensor) for _ in range(dist.get_world_size())] dist.all_gather(all_gather_results, torch_norm_tensor) assert len(set([x.item() for x in all_gather_results])) == 1 return 1.0 @@ -208,16 +177,11 @@ class TestFP16OptimizerForMoE(DistributedTest): model = SimpleMoEModel(hidden_dim, ep_size=2) optimizer = torch.optim.AdamW(params=model.parameters()) engine, optimizer, _, _ = deepspeed.initialize(config=config_dict, - model=model, - optimizer=optimizer, - dist_init_required=False) - monkeypatch.setattr(optimizer, - 'unscale_and_clip_grads', - mock_unscale_and_clip_grads) - data_loader = sequence_dataloader(model=engine, - total_samples=50, - hidden_dim=hidden_dim, - device=engine.device) + model=model, + optimizer=optimizer, + dist_init_required=False) + monkeypatch.setattr(optimizer, 'unscale_and_clip_grads', mock_unscale_and_clip_grads) + data_loader = sequence_dataloader(model=engine, total_samples=50, hidden_dim=hidden_dim, device=engine.device) for n, batch in enumerate(data_loader): loss = engine(batch[0], batch[1]) engine.backward(loss) @@ -227,20 +191,12 @@ class TestFP16OptimizerForMoE(DistributedTest): if not required_torch_version(): pytest.skip("DeepSpeed MoE tests need torch 1.8 or higher to run correctly") - config_dict = { - "train_batch_size": 2, - "steps_per_print": 1, - "fp16": { - "enabled": True - } - } + config_dict = {"train_batch_size": 2, "steps_per_print": 1, "fp16": {"enabled": True}} hidden_dim = 10 def mock_unscale_and_clip_grads(grads_groups_flat, total_norm, apply_scale=True): torch_norm_tensor = get_accelerator().FloatTensor([total_norm]) - all_gather_results = [ - torch.zeros_like(torch_norm_tensor) for _ in range(dist.get_world_size()) - ] + all_gather_results = [torch.zeros_like(torch_norm_tensor) for _ in range(dist.get_world_size())] dist.all_gather(all_gather_results, torch_norm_tensor) assert len(set([x.item() for x in all_gather_results])) == 1 return 1.0 @@ -250,16 +206,11 @@ class TestFP16OptimizerForMoE(DistributedTest): # optimizer = torch.optim.AdamW(params=model.parameters()) optimizer = FusedAdam(params=model.parameters()) engine, optimizer, _, _ = deepspeed.initialize(config=config_dict, - model=model, - optimizer=optimizer, - dist_init_required=False) - monkeypatch.setattr(optimizer, - 'unscale_and_clip_grads', - mock_unscale_and_clip_grads) - data_loader = sequence_dataloader(model=engine, - total_samples=50, - hidden_dim=hidden_dim, - device=engine.device) + model=model, + optimizer=optimizer, + dist_init_required=False) + monkeypatch.setattr(optimizer, 'unscale_and_clip_grads', mock_unscale_and_clip_grads) + data_loader = sequence_dataloader(model=engine, total_samples=50, hidden_dim=hidden_dim, device=engine.device) for n, batch in enumerate(data_loader): loss = engine(batch[0], batch[1]) engine.backward(loss) @@ -287,9 +238,7 @@ class TestFP16OptimizerForMoE(DistributedTest): def 
mock_unscale_and_clip_grads(total_norm, apply_scale=True): torch_norm_tensor = get_accelerator().FloatTensor([total_norm]) - all_gather_results = [ - torch.zeros_like(torch_norm_tensor) for _ in range(dist.get_world_size()) - ] + all_gather_results = [torch.zeros_like(torch_norm_tensor) for _ in range(dist.get_world_size())] dist.all_gather(all_gather_results, torch_norm_tensor) assert len(set([x.item() for x in all_gather_results])) == 1 return 1.0 @@ -297,17 +246,12 @@ class TestFP16OptimizerForMoE(DistributedTest): # initialize MoE model = SimpleMoEModel(hidden_dim, ep_size=2) engine, optimizer, _, _ = deepspeed.initialize(config=config_dict, - model=model, - model_parameters=model.parameters(), - dist_init_required=False) - monkeypatch.setattr(optimizer, - 'unscale_and_clip_grads', - mock_unscale_and_clip_grads) + model=model, + model_parameters=model.parameters(), + dist_init_required=False) + monkeypatch.setattr(optimizer, 'unscale_and_clip_grads', mock_unscale_and_clip_grads) optimizer.fused_lamb_legacy = fused_lamb_legacy - data_loader = sequence_dataloader(model=engine, - total_samples=50, - hidden_dim=hidden_dim, - device=engine.device) + data_loader = sequence_dataloader(model=engine, total_samples=50, hidden_dim=hidden_dim, device=engine.device) for n, batch in enumerate(data_loader): loss = engine(batch[0], batch[1]) engine.backward(loss) @@ -318,24 +262,13 @@ class TestAdamwFP16EmptyGrad(DistributedTest): world_size = 1 def test(self): - config_dict = { - "train_batch_size": 1, - "steps_per_print": 1, - "fp16": { - "enabled": True - } - } + config_dict = {"train_batch_size": 1, "steps_per_print": 1, "fp16": {"enabled": True}} hidden_dim = 10 model = SimpleModel(hidden_dim) optimizer = torch.optim.AdamW(params=model.parameters()) - model, _, _, _ = deepspeed.initialize(config=config_dict, - model=model, - optimizer=optimizer) - data_loader = random_dataloader(model=model, - total_samples=50, - hidden_dim=hidden_dim, - device=model.device) + model, _, _, _ = deepspeed.initialize(config=config_dict, model=model, optimizer=optimizer) + data_loader = random_dataloader(model=model, total_samples=50, hidden_dim=hidden_dim, device=model.device) for n, batch in enumerate(data_loader): loss = model(batch[0], batch[1]) model.backward(loss) @@ -385,13 +318,8 @@ class TestAdamFP16ZeroOneCycleCompatibility(DistributedTest): hidden_dim = 10 model = SimpleModel(hidden_dim) - model, _, _,_ = deepspeed.initialize(config=config_dict, - model=model, - model_parameters=model.parameters()) - data_loader = random_dataloader(model=model, - total_samples=50, - hidden_dim=hidden_dim, - device=model.device) + model, _, _, _ = deepspeed.initialize(config=config_dict, model=model, model_parameters=model.parameters()) + data_loader = random_dataloader(model=model, total_samples=50, hidden_dim=hidden_dim, device=model.device) for n, batch in enumerate(data_loader): loss = model(batch[0], batch[1]) model.backward(loss) @@ -428,19 +356,14 @@ class TestZeroStaticScale(DistributedTest): } model = SimpleModel(hidden_dim) - model, optim, _, _ = deepspeed.initialize(config=config_dict, - model=model, - model_parameters=model.parameters()) + model, optim, _, _ = deepspeed.initialize(config=config_dict, model=model, model_parameters=model.parameters()) # Ensure the static scaler is configured. assert optim.dynamic_loss_scale == False assert optim.loss_scaler.loss_scale == 138. # Now make sure things work.. 
- data_loader = random_dataloader(model=model, - total_samples=10, - hidden_dim=hidden_dim, - device=model.device) + data_loader = random_dataloader(model=model, total_samples=10, hidden_dim=hidden_dim, device=model.device) for n, batch in enumerate(data_loader): loss = model(batch[0], batch[1]) model.backward(loss) @@ -466,7 +389,8 @@ class TestZeroAllowUntestedOptimizer(DistributedTest): "stage": zero_stage, "cpu_offload": use_cpu_offload }, - "zero_allow_untested_optimizer": False + "zero_allow_untested_optimizer": False, + "zero_force_ds_cpu_optimizer": False } hidden_dim = 10 @@ -516,15 +440,10 @@ class TestZeroEmptyPartition(DistributedTest): # Ensure model has 2 parameters, to cause empty partition with DP=3 assert len(list(model.parameters())) == 2 - model, _, _, _ = deepspeed.initialize(config=config_dict, - model=model, - model_parameters=model.parameters()) + model, _, _, _ = deepspeed.initialize(config=config_dict, model=model, model_parameters=model.parameters()) # Now make sure things work.. - data_loader = random_dataloader(model=model, - total_samples=1, - hidden_dim=hidden_dim, - device=model.device) + data_loader = random_dataloader(model=model, total_samples=1, hidden_dim=hidden_dim, device=model.device) for n, batch in enumerate(data_loader): loss = model(batch[0], batch[1]) model.backward(loss) @@ -536,24 +455,13 @@ class TestAmp(DistributedTest): world_size = 2 def test_adam_basic(self): - config_dict = { - "train_batch_size": 2, - "steps_per_print": 1, - "amp": { - "enabled": True - } - } + config_dict = {"train_batch_size": 2, "steps_per_print": 1, "amp": {"enabled": True}} hidden_dim = 10 model = SimpleModel(hidden_dim) optimizer = torch.optim.Adam(params=model.parameters()) - model, _, _, _ = deepspeed.initialize(config=config_dict, - model=model, - optimizer=optimizer) - data_loader = random_dataloader(model=model, - total_samples=50, - hidden_dim=hidden_dim, - device=model.device) + model, _, _, _ = deepspeed.initialize(config=config_dict, model=model, optimizer=optimizer) + data_loader = random_dataloader(model=model, total_samples=50, hidden_dim=hidden_dim, device=model.device) for n, batch in enumerate(data_loader): loss = model(batch[0], batch[1]) model.backward(loss) @@ -577,13 +485,8 @@ class TestAmp(DistributedTest): hidden_dim = 10 model = SimpleModel(hidden_dim) - model, _, _, _ = deepspeed.initialize(config=config_dict, - model=model, - model_parameters=model.parameters()) - data_loader = random_dataloader(model=model, - total_samples=50, - hidden_dim=hidden_dim, - device=model.device) + model, _, _, _ = deepspeed.initialize(config=config_dict, model=model, model_parameters=model.parameters()) + data_loader = random_dataloader(model=model, total_samples=50, hidden_dim=hidden_dim, device=model.device) for n, batch in enumerate(data_loader): loss = model(batch[0], batch[1]) model.backward(loss) @@ -608,13 +511,8 @@ class TestAmp(DistributedTest): hidden_dim = 10 model = SimpleModel(hidden_dim) - model, _, _, _ = deepspeed.initialize(config=config_dict, - model=model, - model_parameters=model.parameters()) - data_loader = random_dataloader(model=model, - total_samples=50, - hidden_dim=hidden_dim, - device=model.device) + model, _, _, _ = deepspeed.initialize(config=config_dict, model=model, model_parameters=model.parameters()) + data_loader = random_dataloader(model=model, total_samples=50, hidden_dim=hidden_dim, device=model.device) for n, batch in enumerate(data_loader): loss = model(batch[0], batch[1]) model.backward(loss) @@ -639,13 +537,8 @@ class 
TestAmp(DistributedTest): hidden_dim = 10 model = SimpleModel(hidden_dim) - model, _, _, _ = deepspeed.initialize(config=config_dict, - model=model, - model_parameters=model.parameters()) - data_loader = random_dataloader(model=model, - total_samples=50, - hidden_dim=hidden_dim, - device=model.device) + model, _, _, _ = deepspeed.initialize(config=config_dict, model=model, model_parameters=model.parameters()) + data_loader = random_dataloader(model=model, total_samples=50, hidden_dim=hidden_dim, device=model.device) for n, batch in enumerate(data_loader): loss = model(batch[0], batch[1]) model.backward(loss) @@ -672,9 +565,7 @@ class TestZeroSupportedClientOptimizer(DistributedTest): model = SimpleModel(hidden_dim) client_optimizer = optimizer_constructor(params=model.parameters()) - model, _, _, _ = deepspeed.initialize(config=config_dict, - model=model, - optimizer=client_optimizer) + model, _, _, _ = deepspeed.initialize(config=config_dict, model=model, optimizer=client_optimizer) class TestZero2ReduceScatterOff(DistributedTest): @@ -706,13 +597,8 @@ class TestZero2ReduceScatterOff(DistributedTest): hidden_dim = 10 model = SimpleModel(hidden_dim) - model, _, _, _ = deepspeed.initialize(config=config_dict, - model=model, - model_parameters=model.parameters()) - data_loader = random_dataloader(model=model, - total_samples=50, - hidden_dim=hidden_dim, - device=model.device) + model, _, _, _ = deepspeed.initialize(config=config_dict, model=model, model_parameters=model.parameters()) + data_loader = random_dataloader(model=model, total_samples=50, hidden_dim=hidden_dim, device=model.device) for n, batch in enumerate(data_loader): loss = model(batch[0], batch[1]) model.backward(loss) @@ -743,14 +629,9 @@ class TestFP16AdamTypes(DistributedTest): hidden_dim = 10 model = SimpleModel(hidden_dim) - model, _, _, _ = deepspeed.initialize(config=config_dict, - model=model, - model_parameters=model.parameters()) + model, _, _, _ = deepspeed.initialize(config=config_dict, model=model, model_parameters=model.parameters()) - data_loader = random_dataloader(model=model, - total_samples=10, - hidden_dim=hidden_dim, - device=model.device) + data_loader = random_dataloader(model=model, total_samples=10, hidden_dim=hidden_dim, device=model.device) for _, batch in enumerate(data_loader): loss = model(batch[0], batch[1]) @@ -782,14 +663,9 @@ class TestZero3LazyScatter(DistributedTest): hidden_dim = 10 model = SimpleModel(hidden_dim) - model, _, _, _ = deepspeed.initialize(config=config_dict, - model=model, - model_parameters=model.parameters()) + model, _, _, _ = deepspeed.initialize(config=config_dict, model=model, model_parameters=model.parameters()) - data_loader = random_dataloader(model=model, - total_samples=10, - hidden_dim=hidden_dim, - device=model.device) + data_loader = random_dataloader(model=model, total_samples=10, hidden_dim=hidden_dim, device=model.device) for _, batch in enumerate(data_loader): loss = model(batch[0], batch[1]) @@ -816,13 +692,8 @@ class TestZeroEmptyGrad(DistributedTest): model = SimpleModel(hidden_dim) optimizer = torch.optim.Adam(model.parameters()) - model, _, _, _ = deepspeed.initialize(config=config_dict, - model=model, - optimizer=optimizer) - data_loader = random_dataloader(model=model, - total_samples=50, - hidden_dim=hidden_dim, - device=model.device) + model, _, _, _ = deepspeed.initialize(config=config_dict, model=model, optimizer=optimizer) + data_loader = random_dataloader(model=model, total_samples=50, hidden_dim=hidden_dim, device=model.device) for n, batch in 
enumerate(data_loader): loss = model(batch[0], batch[1]) model.backward(loss) diff --git a/tests/unit/runtime/pipe/test_pipe.py b/tests/unit/runtime/pipe/test_pipe.py index 2c4d3aef1f636a55c3866db1ab5a083597f3ac24..c4958b721f2c0ae5f75b4f27f82ff0fe0b52ba84 100644 --- a/tests/unit/runtime/pipe/test_pipe.py +++ b/tests/unit/runtime/pipe/test_pipe.py @@ -1,4 +1,7 @@ -'''Copyright The Microsoft DeepSpeed Team''' +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team import copy import torch.nn as nn @@ -9,6 +12,7 @@ from deepspeed.runtime.pipe.topology import PipeDataParallelTopology from deepspeed.runtime.pipe.module import PipelineModule from unit.alexnet_model import AlexNetPipe, train_cifar from unit.common import DistributedTest +from unit.util import skip_on_arch PipeTopo = PipeDataParallelTopology @@ -17,25 +21,26 @@ def rel_diff(A, B): return abs(A - B) / abs(A) -@pytest.mark.parametrize('topo_config', - [ - { - "num_pp": 1, - "num_dp": 4 - }, - { - "num_pp": 2, - "num_dp": 2 - }, - { - "num_pp": 4, - "num_dp": 1 - }, - ]) +@pytest.mark.parametrize('topo_config', [ + { + "num_pp": 1, + "num_dp": 4 + }, + { + "num_pp": 2, + "num_dp": 2 + }, + { + "num_pp": 4, + "num_dp": 1 + }, +]) class TestPipeCifar10(DistributedTest): world_size = 4 def test(self, topo_config): + skip_on_arch(min_arch=7) + config_dict = { "train_batch_size": 16, "train_micro_batch_size_per_gpu": 4, @@ -44,8 +49,7 @@ class TestPipeCifar10(DistributedTest): "type": "Adam", "params": { "lr": 0.001, - "betas": [0.9, - 0.999], + "betas": [0.9, 0.999], "eps": 1e-8, "weight_decay": 3e-7 } @@ -69,38 +73,22 @@ class TestPipeCifar10(DistributedTest): init_net = AlexNetPipe() base_net = copy.deepcopy(init_net) - base_model = PipelineModule(layers=base_net.to_layers(), - num_stages=1, - loss_fn=nn.CrossEntropyLoss()) + base_model = PipelineModule(layers=base_net.to_layers(), num_stages=1, loss_fn=nn.CrossEntropyLoss()) # Train with just data parallelism - base_losses = train_cifar(base_model, - config=config_dict, - num_steps=steps, - fp16=config_dict['fp16']['enabled']) + base_losses = train_cifar(base_model, config=config_dict, num_steps=steps, fp16=config_dict['fp16']['enabled']) test_net = copy.deepcopy(init_net) - test_model = PipelineModule(layers=test_net.to_layers(), - topology=topo, - loss_fn=nn.CrossEntropyLoss()) + test_model = PipelineModule(layers=test_net.to_layers(), topology=topo, loss_fn=nn.CrossEntropyLoss()) - test_losses = train_cifar(test_model, - config=config_dict, - num_steps=steps, - fp16=config_dict['fp16']['enabled']) + test_losses = train_cifar(test_model, config=config_dict, num_steps=steps, fp16=config_dict['fp16']['enabled']) abs_diffs = [l0 - l1 for l0, l1 in zip(base_losses, test_losses)] rel_diffs = [rel_diff(l0, l1) for l0, l1 in zip(base_losses, test_losses)] if dist.get_rank() == 0: - print( - f'abs min={min(abs_diffs)} max={max(abs_diffs)} avg={sum(abs_diffs)/len(abs_diffs)}' - ) - print( - f'rel min={min(rel_diffs)} max={max(rel_diffs)} avg={sum(rel_diffs)/len(rel_diffs)}' - ) - print( - f'first: base={base_losses[0]} test={test_losses[0]} abs={abs_diffs[0]} rel={rel_diffs[0]}' - ) + print(f'abs min={min(abs_diffs)} max={max(abs_diffs)} avg={sum(abs_diffs)/len(abs_diffs)}') + print(f'rel min={min(rel_diffs)} max={max(rel_diffs)} avg={sum(rel_diffs)/len(rel_diffs)}') + print(f'first: base={base_losses[0]} test={test_losses[0]} abs={abs_diffs[0]} rel={rel_diffs[0]}') for lastX in [1, 10, 100]: base_avg = sum(base_losses[-lastX:]) / lastX @@ -114,6 
+102,4 @@ class TestPipeCifar10(DistributedTest): base_avg = sum(base) / len(base) test = test_losses[-lastX:] test_avg = sum(test) / len(test) - assert rel_diff( - base_avg, - test_avg) < 0.05 # Originally 0.03, but seeing instability with AMD results + assert rel_diff(base_avg, test_avg) < 0.05 # Originally 0.03, but seeing instability with AMD results diff --git a/tests/unit/runtime/pipe/test_pipe_schedule.py b/tests/unit/runtime/pipe/test_pipe_schedule.py index 5ca3dfe1d2a0cf5b82752fd669f59349ed2dc1ad..7af7d734e4300d2028475852c2992abbac9a747a 100644 --- a/tests/unit/runtime/pipe/test_pipe_schedule.py +++ b/tests/unit/runtime/pipe/test_pipe_schedule.py @@ -1,4 +1,7 @@ -'''Copyright The Microsoft DeepSpeed Team''' +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team import pytest import deepspeed.runtime.pipe.schedule as schedule @@ -38,9 +41,7 @@ def test_pipe_train_schedule_singlestage(): @pytest.mark.parametrize('micro_batches', [1, 3, 8, 10]) def test_pipe_inference_schedule_firststage(micro_batches, stages=3): - sched = schedule.InferenceSchedule(micro_batches=micro_batches, - stages=stages, - stage_id=0) + sched = schedule.InferenceSchedule(micro_batches=micro_batches, stages=stages, stage_id=0) assert sched.num_micro_batches == micro_batches full = list(iter(sched)) for idx, cmds in enumerate(full): @@ -73,9 +74,7 @@ def test_pipe_inference_schedule_firststage(micro_batches, stages=3): @pytest.mark.parametrize('micro_batches', [1, 3, 8, 10]) def test_pipe_inference_schedule_midstage(micro_batches, stages=3): - sched = schedule.InferenceSchedule(micro_batches=micro_batches, - stages=stages, - stage_id=1) + sched = schedule.InferenceSchedule(micro_batches=micro_batches, stages=stages, stage_id=1) full = list(iter(sched)) for idx, cmds in enumerate(full): @@ -99,9 +98,7 @@ def test_pipe_inference_schedule_midstage(micro_batches, stages=3): @pytest.mark.parametrize('micro_batches', [1, 3, 8, 10]) def test_pipe_inference_schedule_laststage(micro_batches, stages=3): - sched = schedule.InferenceSchedule(micro_batches=micro_batches, - stages=stages, - stage_id=2) + sched = schedule.InferenceSchedule(micro_batches=micro_batches, stages=stages, stage_id=2) full = list(iter(sched)) for idx, cmds in enumerate(full): if idx < sched.stage or idx > sched.stage + sched.num_micro_batches: diff --git a/tests/unit/runtime/pipe/test_topology.py b/tests/unit/runtime/pipe/test_topology.py index 4b0cc42d4336b29c182bcce4e0d4dbb25b80f3bc..53bc6f7a01fc04f1257890d8776b47ca2207da47 100644 --- a/tests/unit/runtime/pipe/test_topology.py +++ b/tests/unit/runtime/pipe/test_topology.py @@ -1,4 +1,7 @@ -'''Copyright The Microsoft DeepSpeed Team''' +# Copyright (c) Microsoft Corporation. 
+# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team import pytest @@ -52,9 +55,7 @@ def test_topology_rank_repr(): assert topo.get_rank_repr(rank=3) == 'a_01-b_01' assert topo.get_rank_repr(rank=3, inner_sep='+') == 'a+01-b+01' - assert topo.get_rank_repr(rank=3, - inner_sep='🤗', - outer_sep='_JEFF_') == 'a🤗01_JEFF_b🤗01' + assert topo.get_rank_repr(rank=3, inner_sep='🤗', outer_sep='_JEFF_') == 'a🤗01_JEFF_b🤗01' topo = Topo(axes=['pipe', 'data'], dims=[2, 2]) assert topo.get_rank_repr(rank=0) == '' @@ -132,26 +133,26 @@ def test_topology_comm_list(): assert topo.get_rank(pipe=1, data=1, model=1) == 7 pipe_list = [ - [0, 4], # data=0, model=0 - [1, 5], # data=0, model=1 - [2, 6], # data=1, model=0 - [3, 7], # data=1, model=1 + [0, 4], # data=0, model=0 + [1, 5], # data=0, model=1 + [2, 6], # data=1, model=0 + [3, 7], # data=1, model=1 ] assert topo.get_axis_comm_lists('pipe') == pipe_list data_list = [ - [0, 2], # pipe=0, model=0 - [1, 3], # pipe=0, model=1 - [4, 6], # pipe=1, model=0 - [5, 7], # pipe=1, model=1 + [0, 2], # pipe=0, model=0 + [1, 3], # pipe=0, model=1 + [4, 6], # pipe=1, model=0 + [5, 7], # pipe=1, model=1 ] assert topo.get_axis_comm_lists('data') == data_list model_list = [ - [0, 1], # pipe=0, data=0 - [2, 3], # pipe=0, data=1 - [4, 5], # pipe=1, data=0 - [6, 7], # pipe=1, data=1 + [0, 1], # pipe=0, data=0 + [2, 3], # pipe=0, data=1 + [4, 5], # pipe=1, data=0 + [6, 7], # pipe=1, data=1 ] assert topo.get_axis_comm_lists('model') == model_list @@ -172,8 +173,7 @@ class TestDistributedTopology(DistributedTest): rank = dist.get_rank() assert grid.is_first_stage == (grid.get_stage_id() == 0) - assert grid.is_last_stage == ( - grid.get_stage_id() == grid.get_pipe_parallel_world_size() - 1) + assert grid.is_last_stage == (grid.get_stage_id() == grid.get_pipe_parallel_world_size() - 1) # Test collectives along the pipeline parallel process groups rank_tensor = torch.LongTensor(data=[rank]).to(get_accelerator().device_name()) @@ -209,6 +209,7 @@ class TestDistributedTopology(DistributedTest): def test_primes(): """ Test prime factorizations. """ + def _product(ps): p = 1 for num in ps: diff --git a/tests/unit/runtime/sparse_tensor/test_averaging_sparse_gradients.py b/tests/unit/runtime/sparse_tensor/test_averaging_sparse_gradients.py index 638a17bad2ff9ad8684fcbd10d6a3ee8450aaba1..92da2257bdb07107411b2148cfe0e77a6f79302f 100644 --- a/tests/unit/runtime/sparse_tensor/test_averaging_sparse_gradients.py +++ b/tests/unit/runtime/sparse_tensor/test_averaging_sparse_gradients.py @@ -1,11 +1,16 @@ -'''Copyright The Microsoft DeepSpeed Team''' +# Copyright (c) Microsoft Corporation. 
+# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team import torch import deepspeed from unit.common import DistributedTest +from unit.util import skip_on_arch class Model(torch.nn.Module): + def __init__(self): super().__init__() self.emb = torch.nn.EmbeddingBag(10, 3, mode="sum", sparse=True) @@ -16,6 +21,7 @@ class Model(torch.nn.Module): class Adam(torch.optim.Optimizer): + def __init__(self, dense_params, sparse_params): super().__init__(dense_params + sparse_params, defaults={}) self.adam = torch.optim.Adam(dense_params) @@ -49,16 +55,12 @@ class TestSparseAdam(DistributedTest): world_size = 2 def test(self): - config_dict = { - "train_batch_size": 2, - "steps_per_print": 1, - "sparse_gradients": True - } + skip_on_arch(min_arch=7) + + config_dict = {"train_batch_size": 2, "steps_per_print": 1, "sparse_gradients": True} model, optimizer = get_model_optimizer() loss = torch.nn.BCEWithLogitsLoss() - engine, _, _, _ = deepspeed.initialize(model=model, - optimizer=optimizer, - config=config_dict) + engine, _, _, _ = deepspeed.initialize(model=model, optimizer=optimizer, config=config_dict) x, offsets, y = get_data(engine.device) diff --git a/tests/unit/runtime/sparse_tensor/test_csr.py b/tests/unit/runtime/sparse_tensor/test_csr.py index 1e4f81b986e870c1a605379bf5c43b30c6be2a4b..937b981735a5a06a5dcabab6e149abfa594e7884 100644 --- a/tests/unit/runtime/sparse_tensor/test_csr.py +++ b/tests/unit/runtime/sparse_tensor/test_csr.py @@ -1,4 +1,7 @@ -'''Copyright The Microsoft DeepSpeed Team''' +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team import torch import random diff --git a/tests/unit/runtime/sparse_tensor/test_sparse_grads.py b/tests/unit/runtime/sparse_tensor/test_sparse_grads.py index ba9a6b0282235b94692fb47c60419c85c0fb3c5b..0689adc08670dcce02e15a0c29f1bc712b0905f5 100644 --- a/tests/unit/runtime/sparse_tensor/test_sparse_grads.py +++ b/tests/unit/runtime/sparse_tensor/test_sparse_grads.py @@ -1,4 +1,7 @@ -'''Copyright The Microsoft DeepSpeed Team''' +# Copyright (c) Microsoft Corporation. 
+# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team import torch import deepspeed @@ -8,6 +11,7 @@ import deepspeed.utils.groups as groups class Model(torch.nn.Module): + def __init__(self): super().__init__() self.emb = torch.nn.EmbeddingBag(10, 3, mode="sum", sparse=True) @@ -18,6 +22,7 @@ class Model(torch.nn.Module): class Adam(torch.optim.Optimizer): + def __init__(self, dense_params, sparse_params): super().__init__(dense_params + sparse_params, defaults={}) self.adam = torch.optim.Adam(dense_params) @@ -37,38 +42,19 @@ class TestSparseAdam(DistributedTest): world_size = 2 def test(self): - config_dict = { - "train_batch_size": 2, - "steps_per_print": 1, - "sparse_gradients": True - } + config_dict = {"train_batch_size": 2, "steps_per_print": 1, "sparse_gradients": True} model = Model() optimizer = Adam(list(model.linear.parameters()), list(model.emb.parameters())) - engine, _, _, _ = deepspeed.initialize(model=model, - optimizer=optimizer, - config=config_dict) + engine, _, _, _ = deepspeed.initialize(model=model, optimizer=optimizer, config=config_dict) loss = torch.nn.BCEWithLogitsLoss() - x = torch.tensor([1, - 2, - 4, - 5, - 4, - 3, - 2, - 9], - dtype=torch.long, - device=engine.device) + x = torch.tensor([1, 2, 4, 5, 4, 3, 2, 9], dtype=torch.long, device=engine.device) offsets = torch.tensor([0, 4], dtype=torch.long, device=engine.device) y = torch.tensor([[1.0], [0.0]], device=engine.device) res = engine(x, offsets) engine.backward(loss(res, y)) engine.step() - results = [ - engine.all_gather_scalar(i, - groups._get_data_parallel_group()) - for i in model.emb.parameters() - ] + results = [engine.all_gather_scalar(i, groups._get_data_parallel_group()) for i in model.emb.parameters()] for res in results: assert torch.allclose(res[0], res[1]) diff --git a/tests/unit/runtime/test_autocast.py b/tests/unit/runtime/test_autocast.py index b0d8d8696cb8e5611e4f19d9fed98ed6c8483511..9176770afda798de1735d0b217a3f887578b4fb1 100644 --- a/tests/unit/runtime/test_autocast.py +++ b/tests/unit/runtime/test_autocast.py @@ -1,4 +1,7 @@ -'''Copyright The Microsoft DeepSpeed Team''' +# Copyright (c) Microsoft Corporation. 
+# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team import pytest import torch @@ -9,18 +12,15 @@ from unit.common import DistributedTest @pytest.mark.parametrize('half_op', [False, True]) class TestAutoCastDisable(DistributedTest): + def test_missing_amp_autocast(self, half_op): hidden_dim = 4 if half_op: input = torch.randn(hidden_dim).to(get_accelerator().device_name()).half() - ds_linear = LinearModuleForZeroStage3( - hidden_dim, - hidden_dim).to(get_accelerator().device_name()).half() + ds_linear = LinearModuleForZeroStage3(hidden_dim, hidden_dim).to(get_accelerator().device_name()).half() else: input = torch.randn(hidden_dim).to(get_accelerator().device_name()) - ds_linear = LinearModuleForZeroStage3(hidden_dim, - hidden_dim).to( - get_accelerator().device_name()) + ds_linear = LinearModuleForZeroStage3(hidden_dim, hidden_dim).to(get_accelerator().device_name()) output = ds_linear(input) assert output.dtype == ds_linear.weight.dtype @@ -31,14 +31,10 @@ class TestAutoCastDisable(DistributedTest): hidden_dim = 4 if half_op: input = torch.randn(hidden_dim).to(get_accelerator().device_name()).half() - ds_linear = LinearModuleForZeroStage3( - hidden_dim, - hidden_dim).to(get_accelerator().device_name()).half() + ds_linear = LinearModuleForZeroStage3(hidden_dim, hidden_dim).to(get_accelerator().device_name()).half() else: input = torch.randn(hidden_dim).to(get_accelerator().device_name()) - ds_linear = LinearModuleForZeroStage3(hidden_dim, - hidden_dim).to( - get_accelerator().device_name()) + ds_linear = LinearModuleForZeroStage3(hidden_dim, hidden_dim).to(get_accelerator().device_name()) with amp.autocast(False): output = ds_linear(input) @@ -46,24 +42,15 @@ class TestAutoCastDisable(DistributedTest): @pytest.mark.skipif(get_accelerator().amp() is None, reason='amp is not installed') -@pytest.mark.parametrize('half_input, half_weight', - [(False, - False), - (False, - True), - (True, - False), - (True, - True)]) +@pytest.mark.parametrize('half_input, half_weight', [(False, False), (False, True), (True, False), (True, True)]) class TestAutoCastEnable(DistributedTest): + def test_autocast_linear(self, tmpdir, half_input, half_weight): amp = get_accelerator().amp() hidden_dim = 4 input = torch.randn(hidden_dim).to(get_accelerator().device_name()) - ds_linear = LinearModuleForZeroStage3(hidden_dim, - hidden_dim).to( - get_accelerator().device_name()) + ds_linear = LinearModuleForZeroStage3(hidden_dim, hidden_dim).to(get_accelerator().device_name()) if half_input: input = input.half() diff --git a/tests/unit/runtime/test_data.py b/tests/unit/runtime/test_data.py index ed2fee950bc3b193c3dc7f1470c38771cae02a08..8f71ca979b4df47600c2b67666d6c57402262270 100644 --- a/tests/unit/runtime/test_data.py +++ b/tests/unit/runtime/test_data.py @@ -1,4 +1,7 @@ -'''Copyright The Microsoft DeepSpeed Team''' +# Copyright (c) Microsoft Corporation. 
+# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team from deepspeed.utils import RepeatingLoader import torch @@ -19,24 +22,12 @@ def test_repeating_loader(): assert next(loader) == 3 -@pytest.mark.parametrize('train_batch_size, drop_last', - [(1, - True), - (4, - True), - (1, - False), - (4, - False)]) +@pytest.mark.parametrize('train_batch_size, drop_last', [(1, True), (4, True), (1, False), (4, False)]) class TestDataLoaderDropLast(DistributedTest): world_size = 1 def test(self, train_batch_size, drop_last): - config_dict = { - "train_batch_size": train_batch_size, - "dataloader_drop_last": drop_last, - "steps_per_print": 1 - } + config_dict = {"train_batch_size": train_batch_size, "dataloader_drop_last": drop_last, "steps_per_print": 1} hidden_dim = 10 model = SimpleModel(hidden_dim) diff --git a/tests/unit/runtime/test_data_efficiency.py b/tests/unit/runtime/test_data_efficiency.py index 993e4aa66e20d6806d30d90ad9d0487f10091289..b9bd9c3aa56ed241b06119527d2b99bf27119309 100644 --- a/tests/unit/runtime/test_data_efficiency.py +++ b/tests/unit/runtime/test_data_efficiency.py @@ -1,4 +1,7 @@ -'''Copyright The Microsoft DeepSpeed Team''' +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team import torch import os @@ -9,6 +12,7 @@ from unit.simple_model import Curriculum_SimpleModel, SimpleModel, random_datalo class MPU(): + def __init__(self, tp_world_size): self.rank = deepspeed.comm.get_rank() self.world_size = deepspeed.comm.get_world_size() @@ -103,10 +107,10 @@ class TestDataEfficiency(DistributedTest): model = SimpleModel(hidden_dim) dataset = random_dataset(20, hidden_dim, torch.device('cpu'), dtype=torch.half) model, _, data_loader, _ = deepspeed.initialize(config=config_dict, - model=model, - training_data=dataset, - model_parameters=model.parameters(), - mpu=MPU(1)) + model=model, + training_data=dataset, + model_parameters=model.parameters(), + mpu=MPU(1)) if model.mpu.get_data_parallel_rank() == 0 and not os.path.exists('/tmp'): os.makedirs('/tmp') model.set_data_post_process_func(data_post_process) @@ -147,15 +151,8 @@ class TestLegacyCurriculumScheduler(DistributedTest): "max_difficulty": 5, "schedule_type": "fixed_discrete", "schedule_config": { - "difficulty": [1, - 2, - 3, - 4, - 5], - "max_step": [2, - 4, - 6, - 8] + "difficulty": [1, 2, 3, 4, 5], + "max_step": [2, 4, 6, 8] } } } @@ -163,13 +160,8 @@ class TestLegacyCurriculumScheduler(DistributedTest): ground_truths = {1: 1, 2: 1, 3: 2, 4: 2, 5: 3, 6: 3, 7: 4, 8: 4} model = Curriculum_SimpleModel(hidden_dim) - model, _, _, _ = deepspeed.initialize(config=config_dict, - model=model, - model_parameters=model.parameters()) - data_loader = random_dataloader(model=model, - total_samples=20, - hidden_dim=hidden_dim, - device=model.device) + model, _, _, _ = deepspeed.initialize(config=config_dict, model=model, model_parameters=model.parameters()) + data_loader = random_dataloader(model=model, total_samples=20, hidden_dim=hidden_dim, device=model.device) for n, batch in enumerate(data_loader): loss, seqlen = model(batch[0], batch[1]) model.backward(loss) @@ -212,13 +204,8 @@ class TestLegacyCurriculumScheduler(DistributedTest): ground_truths = {1: 2, 2: 4, 3: 4, 4: 6, 5: 6, 6: 8, 7: 8, 8: 10, 9: 10, 10: 10} model = Curriculum_SimpleModel(hidden_dim) - model, _, _, _ = deepspeed.initialize(config=config_dict, - model=model, - model_parameters=model.parameters()) - data_loader = random_dataloader(model=model, - total_samples=20, - hidden_dim=hidden_dim, - device=model.device) + 
model, _, _, _ = deepspeed.initialize(config=config_dict, model=model, model_parameters=model.parameters()) + data_loader = random_dataloader(model=model, total_samples=20, hidden_dim=hidden_dim, device=model.device) for n, batch in enumerate(data_loader): loss, seqlen = model(batch[0], batch[1]) model.backward(loss) diff --git a/tests/unit/runtime/test_ds_config_dict.py b/tests/unit/runtime/test_ds_config_dict.py index 54c91a6fc3e6884e16d63d46ab4cd77b0073e11e..6cd01644fad53253105f9f512c9189bc903b89d6 100644 --- a/tests/unit/runtime/test_ds_config_dict.py +++ b/tests/unit/runtime/test_ds_config_dict.py @@ -1,4 +1,7 @@ -'''Copyright The Microsoft DeepSpeed Team''' +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team # A test on its own import os @@ -93,10 +96,7 @@ class TestBatchConfig(DistributedTest): ds_config = DeepSpeedConfig(ds_batch_config) #test cases when all parameters are provided - status = _run_batch_config(ds_config, - train_batch=batch, - micro_batch=micro_batch, - gas=gas) + status = _run_batch_config(ds_config, train_batch=batch, micro_batch=micro_batch, gas=gas) _batch_assert(status, ds_config, batch, micro_batch, gas, success) #test cases when two out of three parameters are provided @@ -139,10 +139,7 @@ def test_temp_config_json(tmpdir): @pytest.mark.parametrize("gather_weights_key", - [ - "stage3_gather_16bit_weights_on_model_save", - "stage3_gather_fp16_weights_on_model_save" - ]) + ["stage3_gather_16bit_weights_on_model_save", "stage3_gather_fp16_weights_on_model_save"]) def test_gather_16bit_params_on_model_save(gather_weights_key): config_dict = { gather_weights_key: True, @@ -168,9 +165,7 @@ class TestConfigLoad(DistributedTest): def test_dict(self, base_config): hidden_dim = 10 model = SimpleModel(hidden_dim) - model, _, _, _ = deepspeed.initialize(config=base_config, - model=model, - model_parameters=model.parameters()) + model, _, _, _ = deepspeed.initialize(config=base_config, model=model, model_parameters=model.parameters()) def test_json(self, base_config, tmpdir): config_path = os.path.join(tmpdir, "config.json") @@ -178,9 +173,7 @@ class TestConfigLoad(DistributedTest): json.dump(base_config, fp) hidden_dim = 10 model = SimpleModel(hidden_dim) - model, _, _, _ = deepspeed.initialize(config=config_path, - model=model, - model_parameters=model.parameters()) + model, _, _, _ = deepspeed.initialize(config=config_path, model=model, model_parameters=model.parameters()) def test_hjson(self, base_config, tmpdir): config_path = os.path.join(tmpdir, "config.json") @@ -188,9 +181,7 @@ class TestConfigLoad(DistributedTest): hjson.dump(base_config, fp) hidden_dim = 10 model = SimpleModel(hidden_dim) - model, _, _, _ = deepspeed.initialize(config=config_path, - model=model, - model_parameters=model.parameters()) + model, _, _, _ = deepspeed.initialize(config=config_path, model=model, model_parameters=model.parameters()) class TestDeprecatedDeepScaleConfig(DistributedTest): @@ -206,13 +197,8 @@ class TestDeprecatedDeepScaleConfig(DistributedTest): hidden_dim = 10 model = SimpleModel(hidden_dim) - model, _, _,_ = deepspeed.initialize(args=args, - model=model, - model_parameters=model.parameters()) - data_loader = random_dataloader(model=model, - total_samples=5, - hidden_dim=hidden_dim, - device=model.device) + model, _, _, _ = deepspeed.initialize(args=args, model=model, model_parameters=model.parameters()) + data_loader = random_dataloader(model=model, total_samples=5, hidden_dim=hidden_dim, device=model.device) for n, batch 
in enumerate(data_loader): loss = model(batch[0], batch[1]) model.backward(loss) @@ -226,14 +212,11 @@ class TestDistInit(DistributedTest): hidden_dim = 10 model = SimpleModel(hidden_dim) - model, _, _,_ = deepspeed.initialize(config=base_config, - model=model, - model_parameters=model.parameters(), - dist_init_required=True) - data_loader = random_dataloader(model=model, - total_samples=5, - hidden_dim=hidden_dim, - device=model.device) + model, _, _, _ = deepspeed.initialize(config=base_config, + model=model, + model_parameters=model.parameters(), + dist_init_required=True) + data_loader = random_dataloader(model=model, total_samples=5, hidden_dim=hidden_dim, device=model.device) for n, batch in enumerate(data_loader): loss = model(batch[0], batch[1]) model.backward(loss) @@ -250,10 +233,7 @@ class TestInitNoOptimizer(DistributedTest): model = SimpleModel(hidden_dim=hidden_dim) model, _, _, _ = deepspeed.initialize(config=base_config, model=model) - data_loader = random_dataloader(model=model, - total_samples=5, - hidden_dim=hidden_dim, - device=model.device) + data_loader = random_dataloader(model=model, total_samples=5, hidden_dim=hidden_dim, device=model.device) for n, batch in enumerate(data_loader): loss = model(batch[0], batch[1]) with pytest.raises(AssertionError): @@ -268,20 +248,14 @@ class TestArgs(DistributedTest): def test_none_args(self, base_config): model = SimpleModel(hidden_dim=10) model, _, _, _ = deepspeed.initialize(args=None, model=model, config=base_config) - data_loader = random_dataloader(model=model, - total_samples=5, - hidden_dim=10, - device=model.device) + data_loader = random_dataloader(model=model, total_samples=5, hidden_dim=10, device=model.device) for n, batch in enumerate(data_loader): loss = model(batch[0], batch[1]) def test_no_args(self, base_config): model = SimpleModel(hidden_dim=10) model, _, _, _ = deepspeed.initialize(model=model, config=base_config) - data_loader = random_dataloader(model=model, - total_samples=5, - hidden_dim=10, - device=model.device) + data_loader = random_dataloader(model=model, total_samples=5, hidden_dim=10, device=model.device) for n, batch in enumerate(data_loader): loss = model(batch[0], batch[1]) diff --git a/tests/unit/runtime/test_ds_config_model.py b/tests/unit/runtime/test_ds_config_model.py index 24343a999f695ea3ad7ef22fde50dc0fdb219269..b9c67c9a30dd293a3c8a95be37d24d4841503e86 100644 --- a/tests/unit/runtime/test_ds_config_model.py +++ b/tests/unit/runtime/test_ds_config_model.py @@ -1,4 +1,7 @@ -'''Copyright The Microsoft DeepSpeed Team''' +# Copyright (c) Microsoft Corporation. 
+# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team import pytest import os @@ -11,10 +14,7 @@ from deepspeed.runtime.config_utils import DeepSpeedConfigModel class SimpleConf(DeepSpeedConfigModel): param_1: int = 0 - param_2_old: str = Field(None, - deprecated=True, - new_param="param_2", - new_param_fn=(lambda x: [x])) + param_2_old: str = Field(None, deprecated=True, new_param="param_2", new_param_fn=(lambda x: [x])) param_2: List[str] = None param_3: int = Field(0, alias="param_3_alias") @@ -68,16 +68,7 @@ def test_config_base_aliasfield(): assert config.param_3 == 10 -@pytest.mark.parametrize("config_dict", - [{ - "param_1": "DS" - }, - { - "param_2": "DS" - }, - { - "param_1_typo": 0 - }]) +@pytest.mark.parametrize("config_dict", [{"param_1": "DS"}, {"param_2": "DS"}, {"param_1_typo": 0}]) def test_config_base_literalfail(config_dict): with pytest.raises(ValidationError): config = SimpleConf(**config_dict) diff --git a/tests/unit/runtime/test_ds_initialize.py b/tests/unit/runtime/test_ds_initialize.py index c7eeef863bdaf12bab84a66597c17263ba6a512f..4ff64dea96ef5c1c93c66cfe0c20723f18bfe8f1 100644 --- a/tests/unit/runtime/test_ds_initialize.py +++ b/tests/unit/runtime/test_ds_initialize.py @@ -1,4 +1,7 @@ -'''Copyright The Microsoft DeepSpeed Team''' +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team import pytest from typing import Callable @@ -61,6 +64,7 @@ class TestClientOptimizer(DistributedTest): world_size = 1 def test(self, optimizer_type): + def _optimizer_callable(params) -> Optimizer: return AdamW(params=params) @@ -77,9 +81,9 @@ class TestClientOptimizer(DistributedTest): client_optimizer = _optimizer_callable _, ds_optimizer, _, _ = deepspeed.initialize(config=config_dict, - model=model, - model_parameters=list(model.parameters()), - optimizer=client_optimizer) + model=model, + model_parameters=list(model.parameters()), + optimizer=client_optimizer) if client_optimizer is None: assert isinstance(ds_optimizer, FusedAdam) elif isinstance(client_optimizer, Optimizer): @@ -93,15 +97,7 @@ class TestConfigOptimizer(DistributedTest): world_size = 1 def test(self, client_parameters): - ds_config = { - "train_batch_size": 1, - "optimizer": { - "type": "Adam", - "params": { - "lr": 0.001 - } - } - } + ds_config = {"train_batch_size": 1, "optimizer": {"type": "Adam", "params": {"lr": 0.001}}} hidden_dim = 10 model = SimpleModel(hidden_dim) @@ -111,9 +107,7 @@ class TestConfigOptimizer(DistributedTest): else: model_parameters = None - _, ds_optimizer, _, _ = deepspeed.initialize(config=ds_config, - model=model, - model_parameters=model_parameters) + _, ds_optimizer, _, _ = deepspeed.initialize(config=ds_config, model=model, model_parameters=model_parameters) assert isinstance(ds_optimizer, FusedAdam) @@ -205,14 +199,14 @@ class TestOptimizerImplementation(DistributedTest): if key in is_supported: _, ds_optimizer, _, _ = deepspeed.initialize(config=ds_config, - model=model, - model_parameters=model_parameters) + model=model, + model_parameters=model_parameters) assert True else: with pytest.raises(NotImplementedError): _, ds_optimizer, _, _ = deepspeed.initialize(config=ds_config, - model=model, - model_parameters=model_parameters) + model=model, + model_parameters=model_parameters) @pytest.mark.parametrize("scheduler_type", [None, _LRScheduler, Callable]) @@ -221,6 +215,7 @@ class TestClientLrScheduler(DistributedTest): world_size = 1 def test(self, scheduler_type, optimizer_type): + def _my_lambda(epoch): return epoch // 10 @@ 
-252,14 +247,11 @@ class TestClientLrScheduler(DistributedTest): client_scheduler = LambdaLR(client_optimizer, _my_lambda) else: # Verify invalid combination is correctly handled - client_scheduler = LambdaLR(torch.optim.Adam(model.parameters()), - _my_lambda) + client_scheduler = LambdaLR(torch.optim.Adam(model.parameters()), _my_lambda) else: client_scheduler = _lr_scheduler_callable - if isinstance(client_scheduler, - _LRScheduler) and not isinstance(client_optimizer, - Optimizer): + if isinstance(client_scheduler, _LRScheduler) and not isinstance(client_optimizer, Optimizer): with pytest.raises(AssertionError): _, _, _, _ = deepspeed.initialize(config=config_dict, model=model, diff --git a/tests/unit/runtime/test_lr_schedulers.py b/tests/unit/runtime/test_lr_schedulers.py index 7afcad5426c4e5c7132ab9a97a3459681eac9289..2393891e28df8f09a8c47453f865d899a8a3c565 100644 --- a/tests/unit/runtime/test_lr_schedulers.py +++ b/tests/unit/runtime/test_lr_schedulers.py @@ -1,4 +1,7 @@ -'''Copyright The Microsoft DeepSpeed Team''' +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team import torch import deepspeed @@ -29,21 +32,14 @@ def _verify_staircase_increase(values, step_size): assert all([values[i] == v for v in values[i:j]]) -@pytest.mark.parametrize("scheduler_type,params", - [(WARMUP_LR, - {}), - (WARMUP_DECAY_LR, - { - WARMUP_NUM_STEPS: 10, - TOTAL_NUM_STEPS: 20 - }), - (ONE_CYCLE, - { - CYCLE_MIN_LR: 0, - CYCLE_MAX_LR: 0.1 - }), - (LR_RANGE_TEST, - {})]) +@pytest.mark.parametrize("scheduler_type,params", [(WARMUP_LR, {}), + (WARMUP_DECAY_LR, { + WARMUP_NUM_STEPS: 10, + TOTAL_NUM_STEPS: 20 + }), (ONE_CYCLE, { + CYCLE_MIN_LR: 0, + CYCLE_MAX_LR: 0.1 + }), (LR_RANGE_TEST, {})]) class TestGetLrBeforeTrain(DistributedTest): world_size = 1 @@ -198,26 +194,21 @@ class TestLrSchedule(DistributedTest): previous_lr = lr -@pytest.mark.parametrize("scheduler_type,params", - [(WARMUP_LR, - {}), - (WARMUP_DECAY_LR, - { - WARMUP_NUM_STEPS: 5, - TOTAL_NUM_STEPS: 10 - }), - (ONE_CYCLE, - { - CYCLE_MIN_LR: 0, - CYCLE_MAX_LR: 0.1, - CYCLE_FIRST_STEP_SIZE: 5, - DECAY_STEP_SIZE: 5 - }), - (LR_RANGE_TEST, - { - LR_RANGE_TEST_MIN_LR: 1e-4, - LR_RANGE_TEST_STEP_SIZE: 1 - })]) +@pytest.mark.parametrize("scheduler_type,params", [(WARMUP_LR, {}), + (WARMUP_DECAY_LR, { + WARMUP_NUM_STEPS: 5, + TOTAL_NUM_STEPS: 10 + }), + (ONE_CYCLE, { + CYCLE_MIN_LR: 0, + CYCLE_MAX_LR: 0.1, + CYCLE_FIRST_STEP_SIZE: 5, + DECAY_STEP_SIZE: 5 + }), + (LR_RANGE_TEST, { + LR_RANGE_TEST_MIN_LR: 1e-4, + LR_RANGE_TEST_STEP_SIZE: 1 + })]) class TestSchedulerOptimizerParity(DistributedTest): world_size = 1 @@ -294,8 +285,7 @@ class TestLrRange(DistributedTest): model=model, model_parameters=model.parameters()) data_loader = random_dataloader(model=model, - total_samples=max(50, - step_size * 2), + total_samples=max(50, step_size * 2), hidden_dim=hidden_dim, device=model.device, dtype=torch.float) @@ -358,8 +348,7 @@ class TestOneCycle(DistributedTest): model=model, model_parameters=model.parameters()) data_loader = random_dataloader(model=model, - total_samples=max(50, - cycle_step_size * 3), + total_samples=max(50, cycle_step_size * 3), hidden_dim=hidden_dim, device=model.device, dtype=torch.float) @@ -425,8 +414,7 @@ class TestOneCycle(DistributedTest): model=model, model_parameters=model.parameters()) data_loader = random_dataloader(model=model, - total_samples=max(50, - step_size * 3), + total_samples=max(50, step_size * 3), hidden_dim=hidden_dim, device=model.device, dtype=torch.float) diff 
--git a/tests/unit/runtime/test_multi_output_model.py b/tests/unit/runtime/test_multi_output_model.py index 0a802373a67a04f394def10958cd0a966251d7c4..d9aba419b1588a89c05c959b54535542947b45c5 100644 --- a/tests/unit/runtime/test_multi_output_model.py +++ b/tests/unit/runtime/test_multi_output_model.py @@ -1,4 +1,7 @@ -'''Copyright The Microsoft DeepSpeed Team''' +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team import torch import deepspeed @@ -34,18 +37,14 @@ class TestTwoOutputModel(DistributedTest): weight_value = 0.1 model = MultiOutputModel(hidden_dim, weight_value) - model, _, _, _ = deepspeed.initialize(config=config_dict, - model=model, - model_parameters=model.parameters()) + model, _, _, _ = deepspeed.initialize(config=config_dict, model=model, model_parameters=model.parameters()) total_samples = 4 data_loader = multi_output_dataloader(model=model, total_samples=total_samples, hidden_dim=hidden_dim, device=model.device, - inputs=[1.0, - 2.0], - targets=[1, - 2]) + inputs=[1.0, 2.0], + targets=[1, 2]) for n, batch in enumerate(data_loader): assert len(batch) % 2 == 0, \ f"multi_output_dataloader failed to return even number of data samples (input+target)" @@ -54,9 +53,7 @@ class TestTwoOutputModel(DistributedTest): inputs, targets = batch[:midpoint], batch[midpoint:] loss_tuple = model(inputs, targets) - expected_loss = torch.tensor(2.302734375, - dtype=torch.half, - device=model.device) + expected_loss = torch.tensor(2.302734375, dtype=torch.half, device=model.device) for loss in loss_tuple: assert loss.shape == torch.Size([]) assert loss.item() == approx(expected_loss.item()) @@ -96,21 +93,15 @@ class TestThreeOutputModel(DistributedTest): weight_value = 0.1 model = MultiOutputModel(hidden_dim, weight_value) - model, _, _, _ = deepspeed.initialize(config=config_dict, - model=model, - model_parameters=model.parameters()) + model, _, _, _ = deepspeed.initialize(config=config_dict, model=model, model_parameters=model.parameters()) total_samples = grad_accumulation_steps * micro_batch_size * 2 data_loader = multi_output_dataloader(model=model, total_samples=total_samples, hidden_dim=hidden_dim, device=model.device, - inputs=[1.0, - 2.0, - 3.0], - targets=[1, - 2, - 3]) + inputs=[1.0, 2.0, 3.0], + targets=[1, 2, 3]) for n, batch in enumerate(data_loader): assert len(batch) % 2 == 0, \ f"multi_output_dataloader failed to return even number of data samples (input+target)" @@ -120,9 +111,7 @@ class TestThreeOutputModel(DistributedTest): loss_tuple = model(inputs, targets) assert len(loss_tuple) == 3 - expected_loss = torch.tensor(2.302734375, - dtype=torch.half, - device=model.device) + expected_loss = torch.tensor(2.302734375, dtype=torch.half, device=model.device) for loss in loss_tuple: assert loss.shape == torch.Size([]) diff --git a/tests/unit/runtime/test_pld.py b/tests/unit/runtime/test_pld.py index 8b8ed2365d77678fda2eb55bd74c9715dbaa1129..1f602db73b2f2f5f1cc5ce456a41126729ff958a 100644 --- a/tests/unit/runtime/test_pld.py +++ b/tests/unit/runtime/test_pld.py @@ -1,4 +1,7 @@ -'''Copyright The Microsoft DeepSpeed Team''' +# Copyright (c) Microsoft Corporation. 
+# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team import numpy as np import deepspeed @@ -48,14 +51,9 @@ class TestPLDModel(DistributedTest): hidden_dim = 10 model = PLD_SimpleModel(hidden_dim, empty_grad=False) - model, _, _, _ = deepspeed.initialize(config=config_dict, - model=model, - model_parameters=model.parameters()) + model, _, _, _ = deepspeed.initialize(config=config_dict, model=model, model_parameters=model.parameters()) - data_loader = random_dataloader(model=model, - total_samples=50, - hidden_dim=hidden_dim, - device=model.device) + data_loader = random_dataloader(model=model, total_samples=50, hidden_dim=hidden_dim, device=model.device) for i, batch in enumerate(data_loader): loss = model(batch[0], batch[1]) @@ -94,14 +92,9 @@ class TestNonPLDModel(DistributedTest): hidden_dim = 10 model = SimpleModel(hidden_dim, empty_grad=False) - model, _, _, _ = deepspeed.initialize(config=config_dict, - model=model, - model_parameters=model.parameters()) - - data_loader = random_dataloader(model=model, - total_samples=1, - hidden_dim=hidden_dim, - device=model.device) + model, _, _, _ = deepspeed.initialize(config=config_dict, model=model, model_parameters=model.parameters()) + + data_loader = random_dataloader(model=model, total_samples=1, hidden_dim=hidden_dim, device=model.device) for i, batch in enumerate(data_loader): with pytest.raises(TypeError): diff --git a/tests/unit/runtime/test_runtime_utils.py b/tests/unit/runtime/test_runtime_utils.py index 18a8bb77a5b6a63fe644b288571f808ca4c00e82..5d8478b249becb788f728a722bc056710d0ee817 100644 --- a/tests/unit/runtime/test_runtime_utils.py +++ b/tests/unit/runtime/test_runtime_utils.py @@ -1,4 +1,7 @@ -'''Copyright The Microsoft DeepSpeed Team''' +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team import torch from torch._utils import _flatten_dense_tensors @@ -41,9 +44,7 @@ class TestClibGradNorm(DistributedTest): norm = ds_utils.clip_grad_norm_(parameters, max_norm=0.1) norm = torch.Tensor([norm]).to(get_accelerator().device_name(dist.get_rank())) world_size = dist.get_world_size() - gathered_norm = [ - torch.zeros(1).to(get_accelerator().device_name()) for i in range(world_size) - ] + gathered_norm = [torch.zeros(1).to(get_accelerator().device_name()) for i in range(world_size)] dist.all_gather(gathered_norm, norm) diff --git a/tests/unit/runtime/utils/test_partition.py b/tests/unit/runtime/utils/test_partition.py index 58b62825de3f1de6e97b9a152bd9e59e19527936..e7085ee2c4bd9c162068721d6d86accf6e3e7a42 100644 --- a/tests/unit/runtime/utils/test_partition.py +++ b/tests/unit/runtime/utils/test_partition.py @@ -1,4 +1,7 @@ -'''Copyright The Microsoft DeepSpeed Team''' +# Copyright (c) Microsoft Corporation. 
+# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team import pytest @@ -164,33 +167,9 @@ def test_float_midheavy(): def test_balance_bert(): # Parameters per layer for a transformer model with 24 transformers and hidden dim 1024 weights = [ - 52559872, - 12596224, - 12596224, - 12596224, - 12596224, - 12596224, - 12596224, - 12596224, - 12596224, - 12596224, - 12596224, - 12596224, - 12596224, - 12596224, - 12596224, - 12596224, - 12596224, - 12596224, - 12596224, - 12596224, - 12596224, - 12596224, - 12596224, - 12596224, - 12596224, - 0, - 52559872 + 52559872, 12596224, 12596224, 12596224, 12596224, 12596224, 12596224, 12596224, 12596224, 12596224, 12596224, + 12596224, 12596224, 12596224, 12596224, 12596224, 12596224, 12596224, 12596224, 12596224, 12596224, 12596224, + 12596224, 12596224, 12596224, 0, 52559872 ] P = 8 parts = partition_balanced(weights, P) diff --git a/tests/unit/runtime/zero/test_ignore_unused_parameters.py b/tests/unit/runtime/zero/test_ignore_unused_parameters.py index efd4949c94606384c59dcf76d1f35e7d140eefdf..aade488fde42fbff54411459fe5cc7b9b22db2a3 100644 --- a/tests/unit/runtime/zero/test_ignore_unused_parameters.py +++ b/tests/unit/runtime/zero/test_ignore_unused_parameters.py @@ -1,4 +1,7 @@ -'''Copyright The Microsoft DeepSpeed Team''' +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team import pytest from unit.common import DistributedTest @@ -41,14 +44,9 @@ class TestStage2IgnoreUnusedParameters(DistributedTest): hidden_dim = 4 model = UnusedParametersModel(hidden_dim=hidden_dim) - model, _, _, _ = deepspeed.initialize(config=config_dict, - model=model, - model_parameters=model.parameters()) + model, _, _, _ = deepspeed.initialize(config=config_dict, model=model, model_parameters=model.parameters()) - data_loader = random_dataloader(model=model, - total_samples=10, - hidden_dim=hidden_dim, - device=model.device) + data_loader = random_dataloader(model=model, total_samples=10, hidden_dim=hidden_dim, device=model.device) def _loop(): for n, batch in enumerate(data_loader): diff --git a/tests/unit/runtime/zero/test_zero.py b/tests/unit/runtime/zero/test_zero.py index 5de3ffca27df16cfe0b9f15d66bbcaa3d41d2cf2..5773c060cf74f6f31bcffc9c117482db8cfe6c43 100644 --- a/tests/unit/runtime/zero/test_zero.py +++ b/tests/unit/runtime/zero/test_zero.py @@ -1,7 +1,11 @@ -'''Copyright The Microsoft DeepSpeed Team''' +# Copyright (c) Microsoft Corporation. 
+# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team import math -from typing import Dict, List, Set +from collections import namedtuple +from typing import Dict, List, NamedTuple, Set, Tuple import pytest import deepspeed.comm as dist import torch @@ -18,10 +22,12 @@ import deepspeed from deepspeed.runtime.engine import DeepSpeedEngine from deepspeed.runtime.zero.partition_parameters import ZeroParamStatus from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint +from deepspeed.runtime.zero.utils import ZeRORuntimeException from deepspeed.accelerator import get_accelerator def run_unbalanced_gradients(model, data_loader): + def drop_some_gradients(model, iter): odd_iteration = iter % 2 for i, p in enumerate(model.parameters()): @@ -72,13 +78,8 @@ class TestZeroUnbalancedGradients(DistributedTest): hidden_dim = 4 model = SimpleModel(hidden_dim=hidden_dim) - model, _, _, _ = deepspeed.initialize(config=config_dict, - model=model, - model_parameters=model.parameters()) - data_loader = random_dataloader(model=model, - total_samples=16, - hidden_dim=hidden_dim, - device=model.device) + model, _, _, _ = deepspeed.initialize(config=config_dict, model=model, model_parameters=model.parameters()) + data_loader = random_dataloader(model=model, total_samples=16, hidden_dim=hidden_dim, device=model.device) run_unbalanced_gradients(model, data_loader) @@ -111,6 +112,7 @@ class TestZero3RepeatForwardLoop(DistributedTest): hidden_dim = 4 class AlbertLikeModel(torch.nn.Module): + def __init__(self, hidden_dim): super().__init__() self.linear = torch.nn.Linear(hidden_dim, hidden_dim) @@ -124,13 +126,8 @@ class TestZero3RepeatForwardLoop(DistributedTest): return self.cross_entropy_loss(hidden, y) model = AlbertLikeModel(hidden_dim=hidden_dim) - model, _, _, _ = deepspeed.initialize(config=config_dict, - model=model, - model_parameters=model.parameters()) - data_loader = random_dataloader(model=model, - total_samples=16, - hidden_dim=hidden_dim, - device=model.device) + model, _, _, _ = deepspeed.initialize(config=config_dict, model=model, model_parameters=model.parameters()) + data_loader = random_dataloader(model=model, total_samples=16, hidden_dim=hidden_dim, device=model.device) for i, batch in enumerate(data_loader): loss = model(batch[0], batch[1]) @@ -141,10 +138,11 @@ class TestZero3RepeatForwardLoop(DistributedTest): # testing the fix https://github.com/microsoft/DeepSpeed/pull/1227 # also reproduces the https://github.com/microsoft/DeepSpeed/pull/1372 @pytest.mark.parametrize('zero_stage', [2, 3]) +@pytest.mark.parametrize('freeze_params', [True, False]) class TestZeroToFP32(DistributedTest): world_size = 2 - def test_1_param_group(self, tmpdir, zero_stage): + def test_1_param_group(self, tmpdir, zero_stage, freeze_params): # XXX: ideally refactor with the 2_param_group test as 75% is the same # force all params to be partitioned by forcing threshold=0 config_dict = { @@ -168,18 +166,20 @@ class TestZeroToFP32(DistributedTest): } class MyModel(torch.nn.Module): - def __init__(self, hidden_dim, n_layers): + + def __init__(self, hidden_dim, n_layers, freeze_params): super().__init__() # to reproduce https://github.com/microsoft/DeepSpeed/pull/1372 it is important that # the number of total elements is uneven: # (1) 4 layers of 3*(3+1)=12 elements each, 48 in total - self.ll = torch.nn.ModuleList( - torch.nn.Linear(hidden_dim, - hidden_dim) for i in range(n_layers)) + self.ll = torch.nn.ModuleList(torch.nn.Linear(hidden_dim, hidden_dim) for i in range(n_layers)) # (2) the 
following adds 4+1=5 elements self.classifier = torch.nn.Linear(4, 1) # total 48+5=53 (uneven as desired) elements self.cross_entropy_loss = torch.nn.CrossEntropyLoss() + if freeze_params: + self.ll[0].weight.requires_grad = False + self.ll[0].bias.requires_grad = False def forward(self, x, y): hidden = x @@ -192,21 +192,20 @@ class TestZeroToFP32(DistributedTest): world_size = dist.get_world_size() # we want at least 2x layers as there are gpus to trigger round_robin_fp16_groups reshuffle in zero2 n_layers = world_size * 2 - model = MyModel(hidden_dim=hidden_dim, n_layers=n_layers) + model = MyModel(hidden_dim=hidden_dim, n_layers=n_layers, freeze_params=freeze_params) - model, _, _, _ = deepspeed.initialize(config=config_dict, - model=model, - model_parameters=model.parameters()) - data_loader = random_dataloader(model=model, - total_samples=16, - hidden_dim=hidden_dim, - device=model.device) + model, _, _, _ = deepspeed.initialize(config=config_dict, model=model, model_parameters=model.parameters()) + # Flush zero stage 3 cache + model.empty_partition_cache() + + data_loader = random_dataloader(model=model, total_samples=16, hidden_dim=hidden_dim, device=model.device) for i, batch in enumerate(data_loader): loss = model(batch[0], batch[1]) model.backward(loss) model.step() + model.empty_partition_cache() model.save_checkpoint(tmpdir) # make sure all sides saved it @@ -221,8 +220,7 @@ class TestZeroToFP32(DistributedTest): orig_state_dict[name] = param.detach().cpu() if zero_stage == 3: - with deepspeed.zero.GatheredParameters(model.parameters(), - modifier_rank=None): + with deepspeed.zero.GatheredParameters(model.parameters(), modifier_rank=None): fp32_model = load_state_dict_from_zero_checkpoint(model.module, tmpdir) fp32_state_dict = fp32_model.state_dict() else: @@ -234,10 +232,9 @@ class TestZeroToFP32(DistributedTest): if dist.get_rank() == 0: for name in orig_state_dict.keys(): # float() workaround for torch<1.6 - assert torch.allclose(orig_state_dict[name].float(), - fp32_state_dict[name].float()) + assert torch.allclose(orig_state_dict[name].float(), fp32_state_dict[name].float()) - def test_2_param_groups(self, tmpdir, zero_stage): + def test_2_param_groups(self, tmpdir, zero_stage, freeze_params): # TODO: # - need to test with multiple param groups # force all params to be partitioned by forcing threshold=0 @@ -263,12 +260,14 @@ class TestZeroToFP32(DistributedTest): } class MyModel(torch.nn.Module): - def __init__(self, hidden_dim, n_layers): + + def __init__(self, hidden_dim, n_layers, freeze_params): super().__init__() - self.ll = torch.nn.ModuleList( - torch.nn.Linear(hidden_dim, - hidden_dim) for i in range(n_layers)) + self.ll = torch.nn.ModuleList(torch.nn.Linear(hidden_dim, hidden_dim) for i in range(n_layers)) self.cross_entropy_loss = torch.nn.CrossEntropyLoss() + if freeze_params: + self.ll[0].weight.requires_grad = False + self.ll[0].bias.requires_grad = False def forward(self, x, y): hidden = x @@ -280,7 +279,7 @@ class TestZeroToFP32(DistributedTest): world_size = dist.get_world_size() n_layers = world_size * 2 - model = MyModel(hidden_dim=hidden_dim, n_layers=n_layers) + model = MyModel(hidden_dim=hidden_dim, n_layers=n_layers, freeze_params=freeze_params) optim_groups = [ { @@ -297,18 +296,17 @@ class TestZeroToFP32(DistributedTest): model, _, _, _ = deepspeed.initialize(model=model, model_parameters=model.parameters(), optimizer=optim, - config=config_dict - ) - data_loader = random_dataloader(model=model, - total_samples=16, - hidden_dim=hidden_dim, - 
device=model.device) + config=config_dict) + model.empty_partition_cache() + + data_loader = random_dataloader(model=model, total_samples=16, hidden_dim=hidden_dim, device=model.device) for i, batch in enumerate(data_loader): loss = model(batch[0], batch[1]) model.backward(loss) model.step() + model.empty_partition_cache() model.save_checkpoint(tmpdir) # make sure all sides saved it @@ -325,8 +323,7 @@ class TestZeroToFP32(DistributedTest): orig_state_dict[name] = param.detach().cpu() if zero_stage == 3: - with deepspeed.zero.GatheredParameters(model.parameters(), - modifier_rank=None): + with deepspeed.zero.GatheredParameters(model.parameters(), modifier_rank=None): fp32_model = load_state_dict_from_zero_checkpoint(model.module, tmpdir) fp32_state_dict = fp32_model.state_dict() else: @@ -338,8 +335,7 @@ class TestZeroToFP32(DistributedTest): if dist.get_rank() == 0: for name in orig_state_dict.keys(): # float() workaround for torch<1.6 - assert torch.allclose(orig_state_dict[name].float(), - fp32_state_dict[name].float()) + assert torch.allclose(orig_state_dict[name].float(), fp32_state_dict[name].float()) @pytest.mark.parametrize("allgather_bucket_size", [1000, 1001]) @@ -370,16 +366,13 @@ class TestIncorectAllgatherBucketSize(DistributedTest): model = SimpleModel(hidden_dim=hidden_dim) if allgather_bucket_size % 2 == 0: - model, _, _, _ = deepspeed.initialize(config=config_dict, - model=model, - model_parameters=model.parameters()) + model, _, _, _ = deepspeed.initialize(config=config_dict, model=model, model_parameters=model.parameters()) else: with pytest.raises(AssertionError) as assertinfo: model, _, _, _ = deepspeed.initialize(config=config_dict, - model=model, - model_parameters=model.parameters()) - assert "allgather_bucket_size must be a multiple of nccl_start_alignment_factor" in str( - assertinfo) + model=model, + model_parameters=model.parameters()) + assert "allgather_bucket_size must be a multiple of nccl_start_alignment_factor" in str(assertinfo) class TestPartitionNcclAlignment(DistributedTest): @@ -407,9 +400,7 @@ class TestPartitionNcclAlignment(DistributedTest): hidden_dim = 4 model = SimpleModel(hidden_dim=hidden_dim) - model, _, _, _ = deepspeed.initialize(config=config_dict, - model=model, - model_parameters=model.parameters()) + model, _, _, _ = deepspeed.initialize(config=config_dict, model=model, model_parameters=model.parameters()) # get nccl all-gather send buffers alignment factor nccl_start_alignment_factor = model.optimizer.nccl_start_alignment_factor @@ -418,23 +409,16 @@ class TestPartitionNcclAlignment(DistributedTest): for data_parallel_partitions in parallel_partitioned_bit16_groups: for partition_id, partitioned_data in enumerate(data_parallel_partitions): # verify that data partition start locations are 4-byte aligned - assert (partitioned_data.data_ptr() % - (2 * nccl_start_alignment_factor) == 0) + assert (partitioned_data.data_ptr() % (2 * nccl_start_alignment_factor) == 0) -def _ds_initialize_for_param_partitioning_testing(model: Module, - cfg: dict) -> DeepSpeedEngine: - ds_engine, _, _, _ = deepspeed.initialize( - config=cfg, - model=model, - model_parameters=model.parameters() - ) +def _ds_initialize_for_param_partitioning_testing(model: Module, cfg: dict) -> DeepSpeedEngine: + ds_engine, _, _, _ = deepspeed.initialize(config=cfg, model=model, model_parameters=model.parameters()) return ds_engine -def _assert_partition_status(model: Module, - valid_statuses: Set[ZeroParamStatus]) -> None: +def _assert_partition_status(model: Module, 
valid_statuses: Set[ZeroParamStatus]) -> None: for _, param in model.named_parameters(): assert param.ds_status in valid_statuses, param.ds_summary() @@ -445,6 +429,7 @@ def _assert_fully_available(model: Module) -> None: class EltwiseMultiplicationModule(Module): + def __init__(self, weight: Parameter) -> None: super().__init__() self.weight = weight @@ -456,8 +441,9 @@ class EltwiseMultiplicationModule(Module): return result -class EltwiseMultiplicationTestNetwork(Module): +class EltwiseMultiplicationTestNetwork_Dict(Module): """used for testing purposes""" + def __init__( self, weight1: Parameter, @@ -471,29 +457,18 @@ class EltwiseMultiplicationTestNetwork(Module): self.loss = L1Loss(reduction="none") - def forward(self, - x: Tensor, - y: Tensor, - use_module_trace: bool, - param_prefetching: bool) -> Dict[str, - Tensor]: - _assert_partition_status( - self, - { - ZeroParamStatus.NOT_AVAILABLE, - ZeroParamStatus.INFLIGHT, - ZeroParamStatus.AVAILABLE - } if use_module_trace else {ZeroParamStatus.NOT_AVAILABLE}) + def forward(self, x: Tensor, y: Tensor, use_module_trace: bool, param_prefetching: bool) -> Dict[str, Tensor]: + _assert_partition_status(self, + {ZeroParamStatus.NOT_AVAILABLE, ZeroParamStatus.INFLIGHT, ZeroParamStatus.AVAILABLE} + if use_module_trace else {ZeroParamStatus.NOT_AVAILABLE}) pre_layer_expected_states = { - ZeroParamStatus.INFLIGHT - if param_prefetching else ZeroParamStatus.NOT_AVAILABLE, + ZeroParamStatus.INFLIGHT if param_prefetching else ZeroParamStatus.NOT_AVAILABLE, ZeroParamStatus.AVAILABLE, } post_layer_expected_states = { - ZeroParamStatus.AVAILABLE - if param_prefetching else ZeroParamStatus.NOT_AVAILABLE, + ZeroParamStatus.AVAILABLE if param_prefetching else ZeroParamStatus.NOT_AVAILABLE, } _assert_partition_status(self.__layer1, pre_layer_expected_states) @@ -510,13 +485,9 @@ class EltwiseMultiplicationTestNetwork(Module): loss = self.loss(y_hat, y) - _assert_partition_status( - self, - { - ZeroParamStatus.NOT_AVAILABLE, - ZeroParamStatus.INFLIGHT, - ZeroParamStatus.AVAILABLE - } if use_module_trace else {ZeroParamStatus.NOT_AVAILABLE}) + _assert_partition_status(self, + {ZeroParamStatus.NOT_AVAILABLE, ZeroParamStatus.INFLIGHT, ZeroParamStatus.AVAILABLE} + if use_module_trace else {ZeroParamStatus.NOT_AVAILABLE}) return { "hidden1": hidden1, @@ -525,6 +496,91 @@ class EltwiseMultiplicationTestNetwork(Module): "loss": loss, } + @staticmethod + def to_dict(outputs: Dict[str, Tensor]) -> Dict[str, Tensor]: + return outputs + + +class EltwiseMultiplicationNamedTuple(NamedTuple): + hidden1: Tensor + hidden2: Tensor + y_hat: Tensor + loss: Tensor + + +class EltwiseMultiplicationTestNetwork_NamedTuple(EltwiseMultiplicationTestNetwork_Dict): + + def forward(self, *args, **kwargs) -> EltwiseMultiplicationNamedTuple: + outputs_dicts = super().forward(*args, **kwargs) + return EltwiseMultiplicationNamedTuple(hidden1=outputs_dicts['hidden1'], + hidden2=outputs_dicts['hidden2'], + y_hat=outputs_dicts['y_hat'], + loss=outputs_dicts['loss']) + + @staticmethod + def to_dict(outputs: EltwiseMultiplicationNamedTuple) -> Dict[str, Tensor]: + return { + "hidden1": outputs.hidden1, + "hidden2": outputs.hidden2, + "y_hat": outputs.y_hat, + "loss": outputs.loss, + } + + +EltwiseMultiplication_namedtuple = namedtuple('EltwiseMultiplication_namedtuple', + ['hidden1', 'hidden2', 'y_hat', 'loss']) + + +class EltwiseMultiplicationTestNetwork_namedtuple(EltwiseMultiplicationTestNetwork_Dict): + + def forward(self, *args, **kwargs) -> EltwiseMultiplication_namedtuple: + 
outputs_dicts = super().forward(*args, **kwargs) + return EltwiseMultiplication_namedtuple(hidden1=outputs_dicts['hidden1'], + hidden2=outputs_dicts['hidden2'], + y_hat=outputs_dicts['y_hat'], + loss=outputs_dicts['loss']) + + @staticmethod + def to_dict(outputs: EltwiseMultiplicationNamedTuple) -> Dict[str, Tensor]: + return { + "hidden1": outputs.hidden1, + "hidden2": outputs.hidden2, + "y_hat": outputs.y_hat, + "loss": outputs.loss, + } + + +class EltwiseMultiplicationTestNetwork_Tuple(EltwiseMultiplicationTestNetwork_Dict): + + def forward(self, *args, **kwargs) -> Tuple[Tensor, Tensor, Tensor, Tensor]: + outputs_dicts = super().forward(*args, **kwargs) + return (outputs_dicts['hidden1'], outputs_dicts['hidden2'], outputs_dicts['y_hat'], outputs_dicts['loss']) + + @staticmethod + def to_dict(outputs: Tuple[Tensor, Tensor, Tensor, Tensor]) -> Dict[str, Tensor]: + return { + "hidden1": outputs[0], + "hidden2": outputs[1], + "y_hat": outputs[2], + "loss": outputs[3], + } + + +class EltwiseMultiplicationTestNetwork_List(EltwiseMultiplicationTestNetwork_Dict): + + def forward(self, *args, **kwargs) -> List[Tensor]: + outputs_dicts = super().forward(*args, **kwargs) + return [outputs_dicts['hidden1'], outputs_dicts['hidden2'], outputs_dicts['y_hat'], outputs_dicts['loss']] + + @staticmethod + def to_dict(outputs: List[Tensor]) -> Dict[str, Tensor]: + return { + "hidden1": outputs[0], + "hidden2": outputs[1], + "y_hat": outputs[2], + "loss": outputs[3], + } + @pytest.mark.parametrize("param_persistence_threshold", [0, 10]) @pytest.mark.parametrize("fp16_enabled", [True, False]) @@ -532,6 +588,11 @@ class EltwiseMultiplicationTestNetwork(Module): @pytest.mark.parametrize("offload_optimizer", [True, False]) @pytest.mark.parametrize("zero_grad", [True, False]) @pytest.mark.parametrize("prefetching", [True, False]) +@pytest.mark.parametrize("model_class", [ + EltwiseMultiplicationTestNetwork_Dict, EltwiseMultiplicationTestNetwork_NamedTuple, + EltwiseMultiplicationTestNetwork_namedtuple, EltwiseMultiplicationTestNetwork_Tuple, + EltwiseMultiplicationTestNetwork_List +]) class TestZero3ParamPartitioningBase(DistributedTest): world_size = 2 @@ -543,6 +604,7 @@ class TestZero3ParamPartitioningBase(DistributedTest): offload_optimizer: bool, zero_grad: bool, prefetching: bool, + model_class: EltwiseMultiplicationTestNetwork_Dict, ) -> None: if offload_optimizer and not contiguous_gradients: return @@ -550,7 +612,7 @@ class TestZero3ParamPartitioningBase(DistributedTest): m = 3 n = 5 weights = [Parameter(torch.zeros((m, n), dtype=torch.float32)) for _ in range(3)] - model = EltwiseMultiplicationTestNetwork(*weights) + model = model_class(*weights) prefetch_bucket_size = sum([p.numel() for p in model.parameters(recurse=True)]) cfg = { "train_micro_batch_size_per_gpu": 1, @@ -581,95 +643,39 @@ class TestZero3ParamPartitioningBase(DistributedTest): ds_engine = _ds_initialize_for_param_partitioning_testing(model, cfg) for i, weight in enumerate(weights): - weight.ds_tensor.data = torch.full_like(weight.ds_tensor.data, - (i + 1) * (1 + dist.get_rank())) + weight.ds_tensor.data = torch.full_like(weight.ds_tensor.data, (i + 1) * (1 + dist.get_rank())) def create_tensor(vals, dtype: torch.dtype = None) -> Tensor: return torch.as_tensor(vals, - dtype=dtype - or (torch.float16 if fp16_enabled else torch.float32), + dtype=dtype or (torch.float16 if fp16_enabled else torch.float32), device=ds_engine.device) expected_hidden1 = create_tensor([ - [1, - 1, - 1, - 1, - 1], - [1, - 1, - 1, - 2, - 2], - [2, - 2, - 2, - 
2, - 2], + [1, 1, 1, 1, 1], + [1, 1, 1, 2, 2], + [2, 2, 2, 2, 2], ]) expected_hidden2 = create_tensor([ - [2, - 2, - 2, - 2, - 2], - [2, - 2, - 2, - 8, - 8], - [8, - 8, - 8, - 8, - 8], + [2, 2, 2, 2, 2], + [2, 2, 2, 8, 8], + [8, 8, 8, 8, 8], ]) - expected_yhat = create_tensor([[6, - 6, - 6, - 6, - 6], - [6, - 6, - 6, - 48, - 48], - [48, - 48, - 48, - 48, - 48]]) + expected_yhat = create_tensor([[6, 6, 6, 6, 6], [6, 6, 6, 48, 48], [48, 48, 48, 48, 48]]) expected_loss = create_tensor([ - [5, - 5, - 5, - 5, - 5], - [5, - 5, - 5, - 47, - 47], - [47, - 47, - 47, - 47, - 47], + [5, 5, 5, 5, 5], + [5, 5, 5, 47, 47], + [47, 47, 47, 47, 47], ]) for train_iter in range(3): activations = ds_engine( - x=torch.ones((m, - n), - dtype=torch.float16 if fp16_enabled else torch.float32, - device=ds_engine.device), - y=torch.ones((m, - n), - dtype=torch.float16 if fp16_enabled else torch.float32, - device=ds_engine.device), + x=torch.ones((m, n), dtype=torch.float16 if fp16_enabled else torch.float32, device=ds_engine.device), + y=torch.ones((m, n), dtype=torch.float16 if fp16_enabled else torch.float32, device=ds_engine.device), use_module_trace=train_iter > 0, param_prefetching=prefetching and train_iter > 0, ) + # for ease in testing convert outputs to dict. + activations = model_class.to_dict(activations) assert torch.allclose(activations["hidden1"], expected_hidden1) assert torch.allclose(activations["hidden2"], expected_hidden2) assert torch.allclose(activations["y_hat"], expected_yhat) @@ -679,7 +685,8 @@ class TestZero3ParamPartitioningBase(DistributedTest): # check the gradients grad_partitions = ds_engine.optimizer.get_fp32_grad_partitions() - assert set(grad_partitions.keys()) == {0}, f"should have one parameter group but got {len(grad_partitions)}" + assert set(grad_partitions.keys()) == {0 + }, f"should have one parameter group but got {len(grad_partitions)}" assert set(grad_partitions[0].keys()) == {0, 1, 2} dloss_wrt_layer1 = grad_partitions[0][0] dloss_wrt_layer2 = grad_partitions[0][1] @@ -698,33 +705,21 @@ class TestZero3ParamPartitioningBase(DistributedTest): grad_multiplier = 1 if zero_grad else (train_iter + 1) if dist.get_rank() == 0: - assert torch.allclose( - dloss_wrt_layer3.to(get_accelerator().device_name()), - grad_multiplier * create_tensor([2] * 8, - torch.float)) - assert torch.allclose( - dloss_wrt_layer2.to(get_accelerator().device_name()), - grad_multiplier * create_tensor([3 * 1] * 8, - torch.float)) - assert torch.allclose( - dloss_wrt_layer1.to(get_accelerator().device_name()), - grad_multiplier * create_tensor([3 * 2 * 1] * 8, - torch.float)) + assert torch.allclose(dloss_wrt_layer3.to(get_accelerator().device_name()), + grad_multiplier * create_tensor([2] * 8, torch.float)) + assert torch.allclose(dloss_wrt_layer2.to(get_accelerator().device_name()), + grad_multiplier * create_tensor([3 * 1] * 8, torch.float)) + assert torch.allclose(dloss_wrt_layer1.to(get_accelerator().device_name()), + grad_multiplier * create_tensor([3 * 2 * 1] * 8, torch.float)) elif dist.get_rank() == 1: # parameters dont split evenly across ranks so rank 1 has a zero-padded # partition - assert torch.allclose( - dloss_wrt_layer3.to(get_accelerator().device_name()), - grad_multiplier * create_tensor(([8] * 7) + [0], - torch.float)) - assert torch.allclose( - dloss_wrt_layer2.to(get_accelerator().device_name()), - grad_multiplier * create_tensor(([6 * 2] * 7) + [0], - torch.float)) - assert torch.allclose( - dloss_wrt_layer1.to(get_accelerator().device_name()), - grad_multiplier * create_tensor(([6 
* 4 * 1] * 7) + [0], - torch.float)) + assert torch.allclose(dloss_wrt_layer3.to(get_accelerator().device_name()), + grad_multiplier * create_tensor(([8] * 7) + [0], torch.float)) + assert torch.allclose(dloss_wrt_layer2.to(get_accelerator().device_name()), + grad_multiplier * create_tensor(([6 * 2] * 7) + [0], torch.float)) + assert torch.allclose(dloss_wrt_layer1.to(get_accelerator().device_name()), + grad_multiplier * create_tensor(([6 * 4 * 1] * 7) + [0], torch.float)) else: raise RuntimeError("test has world size of two") @@ -745,7 +740,9 @@ class TestZero3ParamPartitioningLargeParam(DistributedTest): world_size = 4 def test(self, init_context_manager: bool, param_sz: int = 8100) -> None: + class LargeParamModel(Module): + def __init__(self): super().__init__() self.param = Parameter(torch.zeros((param_sz, ), dtype=torch.float32)) @@ -782,25 +779,17 @@ class TestZero3ParamPartitioningLargeParam(DistributedTest): "loss_scale": 1., } } - with deepspeed.zero.Init(mem_efficient_linear=False, - enabled=init_context_manager): + with deepspeed.zero.Init(mem_efficient_linear=False, enabled=init_context_manager): model = LargeParamModel() ds_engine = _ds_initialize_for_param_partitioning_testing(model, ds_config) for train_iter in range(3): # test multiple iterations to cover prefetching - activation: Tensor = ds_engine( - torch.ones(param_sz, - dtype=torch.float16, - device=ds_engine.device)) + activation: Tensor = ds_engine(torch.ones(param_sz, dtype=torch.float16, device=ds_engine.device)) partition_sz = math.ceil(param_sz / self.world_size) for rank_idx, start_idx in enumerate(range(0, param_sz, partition_sz)): - activation_from_partition = activation[start_idx:start_idx + - partition_sz] - assert torch.allclose( - activation_from_partition, - torch.full_like(activation_from_partition, - rank_idx)) + activation_from_partition = activation[start_idx:start_idx + partition_sz] + assert torch.allclose(activation_from_partition, torch.full_like(activation_from_partition, rank_idx)) ds_engine.backward(activation.sum()) ds_engine.allreduce_gradients() @@ -808,9 +797,7 @@ class TestZero3ParamPartitioningLargeParam(DistributedTest): avgd_gradients = ds_engine.optimizer.averaged_gradients assert set(avgd_gradients.keys()) == {0}, "should only have one parameter group" weight_gradient, = avgd_gradients[0] - expected_weight_gradient = (train_iter + 1) * torch.full_like( - weight_gradient, - 1) + expected_weight_gradient = (train_iter + 1) * torch.full_like(weight_gradient, 1) assert torch.allclose(weight_gradient, expected_weight_gradient) @@ -822,27 +809,24 @@ class TestZero3ParamPartitioningManyParams(DistributedTest): world_size = 4 def test(self, param_sz: int, n_layers: int, init_context_manager: bool) -> None: + class ManyParamModel(Module): + def __init__(self) -> None: super().__init__() self.modulelist = ModuleList( - EltwiseMultiplicationModule( - weight=Parameter(torch.empty((param_sz, - ), - dtype=torch.float32))) + EltwiseMultiplicationModule(weight=Parameter(torch.empty((param_sz, ), dtype=torch.float32))) for _ in range(n_layers)) for layer_num, module in enumerate(self.modulelist): - with deepspeed.zero.GatheredParameters(module.weight, - modifier_rank=0): + with deepspeed.zero.GatheredParameters(module.weight, modifier_rank=0): param: Parameter = module.weight partition_sz = math.ceil(param.numel() / dist.get_world_size()) offset = 0 for rank in range(dist.get_world_size()): with torch.no_grad(): - param[offset:offset + partition_sz].fill_(2 * layer_num * - rank) + param[offset:offset 
+ partition_sz].fill_(2 * layer_num * rank) offset += partition_sz def forward(self, x: Tensor) -> Tensor: @@ -874,28 +858,20 @@ class TestZero3ParamPartitioningManyParams(DistributedTest): } } - with deepspeed.zero.Init(config=ds_cfg, - mem_efficient_linear=False, - enabled=init_context_manager): + with deepspeed.zero.Init(config=ds_cfg, mem_efficient_linear=False, enabled=init_context_manager): model = ManyParamModel() ds_engine = _ds_initialize_for_param_partitioning_testing(model, ds_cfg) for _ in range(3): # test multiple iterations to cover prefetching activations: List[Tensor] = ds_engine( - torch.ones((param_sz, - ), - dtype=torch.float16, - device=ds_engine.device)) + torch.ones((param_sz, ), dtype=torch.float16, device=ds_engine.device)) assert len(activations) == n_layers partition_sz = math.ceil(param_sz / self.world_size) - expected_activations = torch.empty(param_sz, - dtype=torch.float16, - device=ds_engine.device) + expected_activations = torch.empty(param_sz, dtype=torch.float16, device=ds_engine.device) for start_idx in range(0, param_sz, partition_sz): - expected_activations[start_idx:start_idx + - partition_sz] = dist.get_rank() + expected_activations[start_idx:start_idx + partition_sz] = dist.get_rank() for layer_num, activation in enumerate(activations): expected_activations *= 2 * layer_num @@ -916,7 +892,9 @@ class TestZero3InitForParentWeightInitialization(DistributedTest): world_size = 4 def test(self): + class ModelWhereParentInitializesChildWeights(Module): + def __init__(self) -> None: super().__init__() @@ -949,15 +927,11 @@ class TestZero3InitForParentWeightInitialization(DistributedTest): } } - with deepspeed.zero.Init(config=ds_cfg, - mem_efficient_linear=False, - enabled=True): + with deepspeed.zero.Init(config=ds_cfg, mem_efficient_linear=False, enabled=True): model = ModelWhereParentInitializesChildWeights() assert model.linear.weight.ds_tensor.numel() == math.ceil(12 / self.world_size) - assert torch.allclose(model.linear.weight.ds_tensor, - torch.full_like(model.linear.weight.ds_tensor, - 1)) + assert torch.allclose(model.linear.weight.ds_tensor, torch.full_like(model.linear.weight.ds_tensor, 1)) @pytest.mark.skip("not working") @@ -966,24 +940,23 @@ class TestZero3InitForParentWeightInitialization(DistributedTest): @pytest.mark.parametrize("offload_optimizer", [True, False]) @pytest.mark.parametrize("zero_grad", [True, False]) @pytest.mark.parametrize("prefetching", [True, False]) +@pytest.mark.parametrize("model_class", [ + EltwiseMultiplicationTestNetwork_Dict, EltwiseMultiplicationTestNetwork_NamedTuple, + EltwiseMultiplicationTestNetwork_namedtuple, EltwiseMultiplicationTestNetwork_Tuple, + EltwiseMultiplicationTestNetwork_List +]) class TestZero3ParamPartitioningBaseBF16(DistributedTest): world_size = 2 - def test( - self, - param_persistence_threshold: int, - contiguous_gradients: bool, - offload_optimizer: bool, - zero_grad: bool, - prefetching: bool, - ) -> None: + def test(self, param_persistence_threshold: int, contiguous_gradients: bool, offload_optimizer: bool, + zero_grad: bool, prefetching: bool, model_class: EltwiseMultiplicationTestNetwork_Dict) -> None: if offload_optimizer and not contiguous_gradients: return m = 3 n = 5 weights = [Parameter(torch.zeros((m, n), dtype=torch.float32)) for _ in range(3)] - model = EltwiseMultiplicationTestNetwork(*weights) + model = model_class(*weights) prefetch_bucket_size = sum([p.numel() for p in model.parameters(recurse=True)]) cfg = { "train_micro_batch_size_per_gpu": 1, @@ -1014,93 +987,38 @@ 
class TestZero3ParamPartitioningBaseBF16(DistributedTest): ds_engine = _ds_initialize_for_param_partitioning_testing(model, cfg) for i, weight in enumerate(weights): - weight.ds_tensor.data = torch.full_like(weight.ds_tensor.data, - (i + 1) * (1 + dist.get_rank())) + weight.ds_tensor.data = torch.full_like(weight.ds_tensor.data, (i + 1) * (1 + dist.get_rank())) def create_tensor(vals): return torch.as_tensor(vals, dtype=torch.bfloat16, device=ds_engine.device) expected_hidden1 = create_tensor([ - [1, - 1, - 1, - 1, - 1], - [1, - 1, - 1, - 2, - 2], - [2, - 2, - 2, - 2, - 2], + [1, 1, 1, 1, 1], + [1, 1, 1, 2, 2], + [2, 2, 2, 2, 2], ]) expected_hidden2 = create_tensor([ - [2, - 2, - 2, - 2, - 2], - [2, - 2, - 2, - 8, - 8], - [8, - 8, - 8, - 8, - 8], + [2, 2, 2, 2, 2], + [2, 2, 2, 8, 8], + [8, 8, 8, 8, 8], ]) - expected_yhat = create_tensor([[6, - 6, - 6, - 6, - 6], - [6, - 6, - 6, - 48, - 48], - [48, - 48, - 48, - 48, - 48]]) + expected_yhat = create_tensor([[6, 6, 6, 6, 6], [6, 6, 6, 48, 48], [48, 48, 48, 48, 48]]) expected_loss = create_tensor([ - [5, - 5, - 5, - 5, - 5], - [5, - 5, - 5, - 47, - 47], - [47, - 47, - 47, - 47, - 47], + [5, 5, 5, 5, 5], + [5, 5, 5, 47, 47], + [47, 47, 47, 47, 47], ]) for train_iter in range(3): _assert_partition_status(ds_engine, {ZeroParamStatus.NOT_AVAILABLE}) activations = ds_engine( - x=torch.ones((m, - n), - dtype=torch.bfloat16, - device=ds_engine.device), - y=torch.ones((m, - n), - dtype=torch.bfloat16, - device=ds_engine.device), + x=torch.ones((m, n), dtype=torch.bfloat16, device=ds_engine.device), + y=torch.ones((m, n), dtype=torch.bfloat16, device=ds_engine.device), use_module_trace=train_iter > 0, param_prefetching=prefetching and train_iter > 0, ) + # for ease in testing convert outputs to dict. + activations = model_class.to_dict(activations) assert torch.allclose(activations["hidden1"], expected_hidden1) assert torch.allclose(activations["hidden2"], expected_hidden2) assert torch.allclose(activations["y_hat"], expected_yhat) @@ -1111,7 +1029,8 @@ class TestZero3ParamPartitioningBaseBF16(DistributedTest): # check the gradients grad_partitions = ds_engine.optimizer.get_fp32_grad_partitions() - assert set(grad_partitions.keys()) == {0}, f"should have one parameter group but got {len(grad_partitions)}" + assert set(grad_partitions.keys()) == {0 + }, f"should have one parameter group but got {len(grad_partitions)}" assert set(grad_partitions[0].keys()) == {0, 1, 2} dloss_wrt_layer1 = grad_partitions[0][0] dloss_wrt_layer2 = grad_partitions[0][1] @@ -1128,31 +1047,21 @@ class TestZero3ParamPartitioningBaseBF16(DistributedTest): grad_multiplier = 1 if zero_grad else (train_iter + 1) if dist.get_rank() == 0: - assert torch.allclose( - dloss_wrt_layer3.to(get_accelerator().device_name()), - grad_multiplier * create_tensor([2] * 8).to(expected_grad_dtype)) - assert torch.allclose( - dloss_wrt_layer2.to(get_accelerator().device_name()), - grad_multiplier * create_tensor([3 * 1] * 8).to(expected_grad_dtype)) - assert torch.allclose( - dloss_wrt_layer1.to(get_accelerator().device_name()), - grad_multiplier * - create_tensor([3 * 2 * 1] * 8).to(expected_grad_dtype)) + assert torch.allclose(dloss_wrt_layer3.to(get_accelerator().device_name()), + grad_multiplier * create_tensor([2] * 8).to(expected_grad_dtype)) + assert torch.allclose(dloss_wrt_layer2.to(get_accelerator().device_name()), + grad_multiplier * create_tensor([3 * 1] * 8).to(expected_grad_dtype)) + assert torch.allclose(dloss_wrt_layer1.to(get_accelerator().device_name()), + grad_multiplier * 
create_tensor([3 * 2 * 1] * 8).to(expected_grad_dtype)) elif dist.get_rank() == 1: # parameters dont split evenly across ranks so rank 1 has a zero-padded # partition - assert torch.allclose( - dloss_wrt_layer3.to(get_accelerator().device_name()), - grad_multiplier * - create_tensor(([8] * 7) + [0]).to(expected_grad_dtype)) - assert torch.allclose( - dloss_wrt_layer2.to(get_accelerator().device_name()), - grad_multiplier * - create_tensor(([6 * 2] * 7) + [0]).to(expected_grad_dtype)) - assert torch.allclose( - dloss_wrt_layer1.to(get_accelerator().device_name()), - grad_multiplier * - create_tensor(([6 * 4 * 1] * 7) + [0]).to(expected_grad_dtype)) + assert torch.allclose(dloss_wrt_layer3.to(get_accelerator().device_name()), + grad_multiplier * create_tensor(([8] * 7) + [0]).to(expected_grad_dtype)) + assert torch.allclose(dloss_wrt_layer2.to(get_accelerator().device_name()), + grad_multiplier * create_tensor(([6 * 2] * 7) + [0]).to(expected_grad_dtype)) + assert torch.allclose(dloss_wrt_layer1.to(get_accelerator().device_name()), + grad_multiplier * create_tensor(([6 * 4 * 1] * 7) + [0]).to(expected_grad_dtype)) else: raise RuntimeError("test has world size of two") @@ -1192,13 +1101,8 @@ class TestZeroOffloadStage1(DistributedTest): hidden_dim = 10 model = SimpleModel(hidden_dim) - model, _, _, _ = deepspeed.initialize(model=model, - model_parameters=model.parameters(), - config=config_dict) - data_loader = random_dataloader(model=model, - total_samples=50, - hidden_dim=hidden_dim, - device=model.device) + model, _, _, _ = deepspeed.initialize(model=model, model_parameters=model.parameters(), config=config_dict) + data_loader = random_dataloader(model=model, total_samples=50, hidden_dim=hidden_dim, device=model.device) dist.barrier() for n, batch in enumerate(data_loader): loss = model(batch[0], batch[1]) @@ -1230,6 +1134,7 @@ class TestZero3DictFwd(DistributedTest): hidden_dim = 10 class MyModel(torch.nn.Module): + def __init__(self, hidden_dim): super(MyModel, self).__init__() self.l1 = torch.nn.Linear(hidden_dim, hidden_dim) @@ -1251,13 +1156,8 @@ class TestZero3DictFwd(DistributedTest): with deepspeed.zero.Init(): model = MyModel(hidden_dim) - model, _, _, _ = deepspeed.initialize(model=model, - model_parameters=model.parameters(), - config=config_dict) - data_loader = random_dataloader(model=model, - total_samples=50, - hidden_dim=hidden_dim, - device=model.device) + model, _, _, _ = deepspeed.initialize(model=model, model_parameters=model.parameters(), config=config_dict) + data_loader = random_dataloader(model=model, total_samples=50, hidden_dim=hidden_dim, device=model.device) dist.barrier() for n, batch in enumerate(data_loader): loss = model(batch[0], batch[1]) @@ -1301,10 +1201,7 @@ class TestZeroAdamOptimizerStepCount(DistributedTest): model, optimizer, _, _ = deepspeed.initialize(config=config_dict, model=model, model_parameters=model.parameters()) - data_loader = random_dataloader(model=model, - total_samples=16, - hidden_dim=hidden_dim, - device=model.device) + data_loader = random_dataloader(model=model, total_samples=16, hidden_dim=hidden_dim, device=model.device) for i, batch in enumerate(data_loader): loss = model(batch[0], batch[1]) @@ -1349,6 +1246,7 @@ class TestZeroFrozenWeights(DistributedTest): hidden_dim = 10 class MyModel(torch.nn.Module): + def __init__(self, hidden_dim): super(MyModel, self).__init__() self.l1 = torch.nn.Linear(hidden_dim, hidden_dim) @@ -1371,16 +1269,90 @@ class TestZeroFrozenWeights(DistributedTest): with 
deepspeed.zero.Init(config_dict_or_path=config_dict): model = MyModel(hidden_dim) - model, _, _, _ = deepspeed.initialize(model=model, - model_parameters=model.parameters(), - config=config_dict) - data_loader = random_dataloader(model=model, - total_samples=50, - hidden_dim=hidden_dim, - device=model.device) + model, _, _, _ = deepspeed.initialize(model=model, model_parameters=model.parameters(), config=config_dict) + data_loader = random_dataloader(model=model, total_samples=50, hidden_dim=hidden_dim, device=model.device) dist.barrier() for n, batch in enumerate(data_loader): loss = model(batch[0], batch[1]) loss = loss[1] model.backward(loss) model.step() + + +@pytest.mark.parametrize('force_ds_optim', [True, False]) +class TestZeroOffloadOptim(DistributedTest): + world_size = 1 + + def test(self, force_ds_optim): + config_dict = { + "train_batch_size": 4, + "gradient_accumulation_steps": 2, + "steps_per_print": 1, + "fp16": { + "enabled": True + }, + "zero_optimization": { + "stage": 1, + "offload_optimizer": { + "device": "cpu" + } + }, + "zero_force_ds_cpu_optimizer": force_ds_optim, + } + hidden_dim = 10 + + model = SimpleModel(hidden_dim) + + optimizer = torch.optim.Adam(model.parameters()) + + if force_ds_optim: + with pytest.raises(ZeRORuntimeException): + model, _, _, _ = deepspeed.initialize(model=model, optimizer=optimizer, config=config_dict) + else: + model, _, _, _ = deepspeed.initialize(model=model, optimizer=optimizer, config=config_dict) + + +@pytest.mark.parametrize('training', [True, False]) +class TestZeroPartitionCache(DistributedTest): + world_size = 1 + + def test_training_partition_cache(self, training): + hidden_dim = 10 + config_dict = { + "train_batch_size": 2, + "fp16": { + "enabled": True, + "initial_scale_power": 8 + }, + "zero_optimization": { + "stage": 3, + "stage3_param_persistence_threshold": hidden_dim + } + } + if training: + config_dict["optimizer"] = {"type": "Adam"} + + with deepspeed.zero.Init(config_dict_or_path=config_dict): + model = SimpleModel(hidden_dim, empty_grad=False) + + model, _, _, _ = deepspeed.initialize(model=model, config=config_dict) + + dtype = torch.half + data_loader = random_dataloader(model=model, + total_samples=6, + hidden_dim=hidden_dim, + device=model.device, + dtype=dtype) + + for _, batch in enumerate(data_loader): + loss = model(batch[0], batch[1]) + if training: + model.backward(loss) + model.step() + + persist_param_size = sum([p.numel() for p in model.parameters() if p.ds_persist]) + + assert persist_param_size >= sum([p.numel() for p in model.parameters()]) + + model.empty_partition_cache() + assert sum([p.numel() for p in model.parameters()]) == 0 diff --git a/tests/unit/runtime/zero/test_zero_config.py b/tests/unit/runtime/zero/test_zero_config.py index 84852ec2e6f812a8ae393413dfc8867766e5254d..a5bd96c411e0549e9e359a33715c246900aa3600 100644 --- a/tests/unit/runtime/zero/test_zero_config.py +++ b/tests/unit/runtime/zero/test_zero_config.py @@ -1,4 +1,7 @@ -'''Copyright The Microsoft DeepSpeed Team''' +# Copyright (c) Microsoft Corporation. 
+# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team from deepspeed.runtime.zero.config import DeepSpeedZeroConfig, DeepSpeedZeroOffloadParamConfig, DeepSpeedZeroOffloadOptimizerConfig @@ -55,20 +58,11 @@ def test_zero_offload_optimizer_config_pipeline(): config = DeepSpeedZeroOffloadOptimizerConfig() assert config.pipeline == False - config = DeepSpeedZeroOffloadOptimizerConfig(**{ - "pipeline_read": True, - "pipeline_write": False - }) + config = DeepSpeedZeroOffloadOptimizerConfig(**{"pipeline_read": True, "pipeline_write": False}) assert config.pipeline == True - config = DeepSpeedZeroOffloadOptimizerConfig(**{ - "pipeline_read": False, - "pipeline_write": True - }) + config = DeepSpeedZeroOffloadOptimizerConfig(**{"pipeline_read": False, "pipeline_write": True}) assert config.pipeline == True - config = DeepSpeedZeroOffloadOptimizerConfig(**{ - "pipeline_read": True, - "pipeline_write": True - }) + config = DeepSpeedZeroOffloadOptimizerConfig(**{"pipeline_read": True, "pipeline_write": True}) assert config.pipeline == True diff --git a/tests/unit/runtime/zero/test_zero_context.py b/tests/unit/runtime/zero/test_zero_context.py index a88db44888efd35546ff09f38bcac27f44f5040d..aabe7f0b7f15a8d38ed0d1869fa692d7f07dea2e 100644 --- a/tests/unit/runtime/zero/test_zero_context.py +++ b/tests/unit/runtime/zero/test_zero_context.py @@ -1,4 +1,7 @@ -'''Copyright The Microsoft DeepSpeed Team''' +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team from types import SimpleNamespace @@ -14,6 +17,7 @@ from utils import setup_serial_env # Test that no sub-class or super-class is missed class ConvX(torch.nn.Conv1d): + def __init__(self, *args): super().__init__(*args) # This would not be partitioned before bugfix 5ca8167 @@ -24,6 +28,7 @@ class ConvX(torch.nn.Conv1d): class ConvNet(torch.nn.Module): + def __init__(self): super().__init__() self.conv1 = ConvX(1, 3, 4) @@ -61,6 +66,7 @@ class TestZeroGatheredParametersFree(DistributedTest): hidden_dim = 10 class MyModel(torch.nn.Module): + def __init__(self, hidden_dim): super(MyModel, self).__init__() self.l1 = torch.nn.Linear(hidden_dim, hidden_dim) @@ -126,9 +132,9 @@ class TestSerialContext(DistributedTest): args = SimpleNamespace(local_rank=0) net = SimpleModel(hidden_dim=4) engine, _, _, _ = deepspeed.initialize(args=args, - config=config_dict, - model=net, - model_parameters=net.parameters()) + config=config_dict, + model=net, + model_parameters=net.parameters()) assert engine.tput_timer.batch_size == train_micro_batch_size_per_gpu * gradient_accumulation_steps assert not engine.tput_timer.initialized @@ -167,11 +173,9 @@ class TestSerialContext(DistributedTest): assert engine.tput_timer.total_elapsed_time == 0 # calling start()/stop() to increment the step counter until start_step - while engine.tput_timer.micro_step_count < (gradient_accumulation_steps * - engine.tput_timer.start_step): + while engine.tput_timer.micro_step_count < (gradient_accumulation_steps * engine.tput_timer.start_step): engine.tput_timer.start() - global_step = (engine.tput_timer.micro_step_count + - 1) % gradient_accumulation_steps == 0 + global_step = (engine.tput_timer.micro_step_count + 1) % gradient_accumulation_steps == 0 engine.tput_timer.stop(global_step=global_step) assert engine.tput_timer.global_step_count == engine.tput_timer.start_step assert engine.tput_timer.total_elapsed_time == 0 @@ -182,20 +186,20 @@ class TestSerialContext(DistributedTest): current_duration = engine.tput_timer.step_elapsed_time 
total_duration = engine.tput_timer.total_elapsed_time - global_step = (engine.tput_timer.micro_step_count + - 1) % gradient_accumulation_steps == 0 + global_step = (engine.tput_timer.micro_step_count + 1) % gradient_accumulation_steps == 0 engine.tput_timer.stop(global_step=global_step) duration = engine.tput_timer.end_time - engine.tput_timer.start_time # step elapsed time is reset after gradient accumulation steps assert engine.tput_timer.step_elapsed_time == ( - 0 if engine.tput_timer.global_step_count != engine.tput_timer.start_step - else current_duration + duration) + 0 if engine.tput_timer.global_step_count != engine.tput_timer.start_step else current_duration + + duration) assert engine.tput_timer.total_elapsed_time == total_duration + duration def test_ext_param_getattr(self): setup_serial_env() class ExtLinear(torch.nn.Module): + def __init__(self, dim=16): super().__init__() self.dim = dim @@ -214,9 +218,9 @@ class TestSerialContext(DistributedTest): args = SimpleNamespace(local_rank=0) engine, optim, _, _ = deepspeed.initialize(args=args, - model=net, - model_parameters=net.parameters(), - config=config) + model=net, + model_parameters=net.parameters(), + config=config) with deepspeed.zero.GatheredParameters(net.linear1.weight): assert net.linear1.weight.numel() == net.dim**2 diff --git a/tests/unit/runtime/zero/test_zero_context_ancestry.py b/tests/unit/runtime/zero/test_zero_context_ancestry.py index 38ae524906d5acbcf366773ec93ec028e3a2b77f..21955f5df152b5ab6ea311e39c46e283f58d1509 100644 --- a/tests/unit/runtime/zero/test_zero_context_ancestry.py +++ b/tests/unit/runtime/zero/test_zero_context_ancestry.py @@ -1,4 +1,7 @@ -'''Copyright The Microsoft DeepSpeed Team''' +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team import torch import deepspeed @@ -31,32 +34,30 @@ config = { # test that sub-classes get params that aren't prematurely partitioned and thus requiring gathering # fixed by https://github.com/microsoft/DeepSpeed/pull/1202 class GrandPa(torch.nn.Module): + def __init__(self, *args): super().__init__(*args) self.param_grandpa = torch.nn.Parameter(torch.ones(5)) - self.param_grandpa.data = (self.param_grandpa.data + - 1).data # test param is not yet partitioned + self.param_grandpa.data = (self.param_grandpa.data + 1).data # test param is not yet partitioned class Pa(GrandPa): + def __init__(self, *args): super().__init__(*args) self.param_pa = torch.nn.Parameter(torch.ones(5)) - self.param_pa.data = (self.param_pa.data + - 1).data # test param is not yet partitioned - self.param_grandpa.data = (self.param_grandpa.data + - 1).data # test param is not yet partitioned + self.param_pa.data = (self.param_pa.data + 1).data # test param is not yet partitioned + self.param_grandpa.data = (self.param_grandpa.data + 1).data # test param is not yet partitioned class Son(Pa): + def __init__(self): super().__init__() self.param = torch.nn.Parameter(torch.ones(5)) self.param.data = (self.param.data + 1).data # test param is not yet partitioned - self.param_pa.data = (self.param_pa.data + - 1).data # test param is not yet partitioned - self.param_grandpa.data = (self.param_grandpa.data + - 1).data # test param is not yet partitioned + self.param_pa.data = (self.param_pa.data + 1).data # test param is not yet partitioned + self.param_grandpa.data = (self.param_grandpa.data + 1).data # test param is not yet partitioned class TestSerialParamInit(DistributedTest): @@ -98,6 +99,7 @@ class TestDSInitWZinit(DistributedTest): } class 
Model(torch.nn.Module): + def __init__(self): super(Model, self).__init__() self.linear = torch.nn.Linear(4, 4) diff --git a/tests/unit/runtime/zero/test_zero_context_return.py b/tests/unit/runtime/zero/test_zero_context_return.py index 68329cb886c277849319046a59eef6cd022b5d2a..874a8ea3b676338e55360ee6bafc57b879fd708e 100644 --- a/tests/unit/runtime/zero/test_zero_context_return.py +++ b/tests/unit/runtime/zero/test_zero_context_return.py @@ -1,4 +1,7 @@ -'''Copyright The Microsoft DeepSpeed Team''' +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team from types import SimpleNamespace import torch @@ -11,6 +14,7 @@ from unit.common import DistributedTest class DanglingBias(torch.nn.Linear): + def forward(self, *inputs): out = super().forward(*inputs) # return the bias to trigger a dangling external param @@ -19,18 +23,21 @@ class DanglingBias(torch.nn.Linear): class DataClass: """Just wraps data in an object. """ + def __init__(self, out=None, bias=None): self.out = out self.bias = bias class DanglingBiasClass(DanglingBias): + def forward(self, *inputs): out, bias = super().forward(*inputs) return DataClass(out=out, bias=bias) class DanglingAttention(torch.nn.Linear): + def __init__(self, dim=16, return_obj=False): super().__init__(dim, dim) self.dim = dim @@ -56,6 +63,7 @@ class DanglingAttention(torch.nn.Linear): class ModelContainer(torch.nn.Module): + def __init__(self, dim=16, return_obj=False): super().__init__() self.dim = dim @@ -70,6 +78,7 @@ class ModelContainer(torch.nn.Module): class DanglingExt(torch.nn.Module): + def __init__(self, dim=16): super().__init__() self.dim = dim @@ -86,6 +95,7 @@ class DanglingExt(torch.nn.Module): class ModelContainerVariableOutputType(ModelContainer): + def __init__(self, dim=16, output_type=dict): super().__init__() self.output_type = output_type @@ -129,10 +139,7 @@ class TestReturnParam(DistributedTest): net = DanglingExt() args = SimpleNamespace(local_rank=0) - engine, _, _, _ = deepspeed.initialize(args=args, - model=net, - model_parameters=net.parameters(), - config=config) + engine, _, _, _ = deepspeed.initialize(args=args, model=net, model_parameters=net.parameters(), config=config) for _ in range(5): input = torch.rand(net.dim).to(engine.device).half() @@ -148,10 +155,7 @@ class TestReturnParam(DistributedTest): net = ModelContainer(return_obj=True) args = SimpleNamespace(local_rank=0) - engine, _, _, _ = deepspeed.initialize(args=args, - model=net, - model_parameters=net.parameters(), - config=config) + engine, _, _, _ = deepspeed.initialize(args=args, model=net, model_parameters=net.parameters(), config=config) for _ in range(5): input = torch.rand(net.dim).to(engine.device).half() @@ -169,10 +173,7 @@ class TestReturnParam(DistributedTest): net = ModelContainerVariableOutputType(output_type=output_type) args = SimpleNamespace(local_rank=0) - engine, _, _, _ = deepspeed.initialize(args=args, - model=net, - model_parameters=net.parameters(), - config=config) + engine, _, _, _ = deepspeed.initialize(args=args, model=net, model_parameters=net.parameters(), config=config) for _ in range(1): input = torch.rand(net.dim).to(engine.device).half() diff --git a/tests/unit/runtime/zero/test_zero_dynamic_class.py b/tests/unit/runtime/zero/test_zero_dynamic_class.py new file mode 100644 index 0000000000000000000000000000000000000000..bb57c87f84b84ea1ae82468e36881f5910973bd0 --- /dev/null +++ b/tests/unit/runtime/zero/test_zero_dynamic_class.py @@ -0,0 +1,56 @@ +# Copyright (c) Microsoft Corporation. 
+# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +import torch + +from unit.common import DistributedTest + +import deepspeed + + +class TestNewClassDeclaredInsideInit(DistributedTest): + world_size = 1 + + def test_new_class_declared_inside_init(self): + ds_config = dict(train_batch_size=1, zero_optimization=dict(stage=3)) + + with deepspeed.zero.Init(config_dict_or_path=ds_config): + + class MyModel(torch.nn.Module): + + def __init__(self): + super().__init__() + self.fc = torch.nn.Linear(4, 4) + + with deepspeed.zero.Init(config_dict_or_path=ds_config): + model = MyModel() + + deepspeed_engine, *_ = deepspeed.initialize(model=model, config_params=ds_config) + # ensure that zero3 processed the parameter + assert hasattr(deepspeed_engine.fc.weight, "ds_id") + + +class TestNewClassDeclaredInsideInitFailure(DistributedTest): + world_size = 1 + + def test_new_class_declared_inside_init_failure(self): + ds_config = dict(train_batch_size=1, zero_optimization=dict(stage=3)) + + try: + with deepspeed.zero.Init(config_dict_or_path=ds_config): + + class MyModel(torch.nn.Module): + + def __init__(self): + super().__init__() + self.fc = torch.nn.Linear(1, 1) + + model = MyModel() + + assert False, "Should have failed. A subclass of torch.nn.Module must be defined before zero.Init() where an instance of the class is created." + except RuntimeError as e: + pass + except: + assert False, "Should have failed. Runtime error is expected." diff --git a/tests/unit/runtime/zero/test_zero_nesting_init.py b/tests/unit/runtime/zero/test_zero_nesting_init.py new file mode 100644 index 0000000000000000000000000000000000000000..5b796511cb9cf0a63fec31a050004111d5b6ff5b --- /dev/null +++ b/tests/unit/runtime/zero/test_zero_nesting_init.py @@ -0,0 +1,25 @@ +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +import torch + +from unit.common import DistributedTest + +import deepspeed + + +class TestNestingInit(DistributedTest): + world_size = 1 + + def test_nesting_init(self): + ds_config = dict(train_batch_size=1, zero_optimization=dict(stage=3)) + + with deepspeed.zero.Init(config_dict_or_path=ds_config): + with deepspeed.zero.Init(config_dict_or_path=ds_config): + model = torch.nn.Linear(4, 4) + + deepspeed_engine, *_ = deepspeed.initialize(model=model, config_params=ds_config) + # ensure that zero3 processed the parameter + assert hasattr(deepspeed_engine.weight, "ds_id") diff --git a/tests/unit/runtime/zero/test_zero_tensor_fragment.py b/tests/unit/runtime/zero/test_zero_tensor_fragment.py index 20caf05dd9d5b1d623af38fa8619d6068b502d47..459d41f98eea1079fb0e8b8a098fe581aa8b6ea0 100644 --- a/tests/unit/runtime/zero/test_zero_tensor_fragment.py +++ b/tests/unit/runtime/zero/test_zero_tensor_fragment.py @@ -1,4 +1,7 @@ -'''Copyright The Microsoft DeepSpeed Team''' +# Copyright (c) Microsoft Corporation. 
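# Illustrative sketch of the rule checked by test_zero_dynamic_class.py and
# test_zero_nesting_init.py above: the nn.Module subclass must already be defined when
# the zero.Init() context that creates the instance is entered; defining and
# instantiating the class inside the same zero.Init() context raises a RuntimeError,
# while nesting zero.Init() contexts is fine. As in those tests, this assumes a
# distributed test harness is already set up.
import torch
import deepspeed

ds_config = dict(train_batch_size=1, zero_optimization=dict(stage=3))


class TinyModel(torch.nn.Module):  # declared *before* zero.Init()

    def __init__(self):
        super().__init__()
        self.fc = torch.nn.Linear(4, 4)


with deepspeed.zero.Init(config_dict_or_path=ds_config):
    model = TinyModel()  # parameters are partitioned at construction time

engine, *_ = deepspeed.initialize(model=model, config_params=ds_config)
assert hasattr(engine.fc.weight, "ds_id")  # ds_id marks a ZeRO-3 managed parameter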
+# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team import pytest import deepspeed.comm as dist @@ -28,18 +31,15 @@ def validate_full_tensors(model): class MyModel(torch.nn.Module): + def __init__(self, hidden_dim, frozen_weights): super(MyModel, self).__init__() self.act = torch.nn.ReLU() self.cel = torch.nn.CrossEntropyLoss() - self.linears = torch.nn.ModuleList([ - torch.nn.Linear(hidden_dim, - 1), - torch.nn.Linear(1, - 1), - torch.nn.Linear(1, - hidden_dim) - ]) + self.linears = torch.nn.ModuleList( + [torch.nn.Linear(hidden_dim, 1), + torch.nn.Linear(1, 1), + torch.nn.Linear(1, hidden_dim)]) if frozen_weights: self.linears[0].weight.requires_grad = False self.linears[0].bias.requires_grad = False @@ -54,9 +54,7 @@ class MyModel(torch.nn.Module): def run_fragmented_model(model, config_dict, hidden_dim, dtype): - model, _, _, _ = deepspeed.initialize(model=model, - model_parameters=model.parameters(), - config=config_dict) + model, _, _, _ = deepspeed.initialize(model=model, model_parameters=model.parameters(), config=config_dict) data_loader = random_dataloader(model=model, total_samples=10, hidden_dim=hidden_dim, @@ -77,11 +75,7 @@ class TestTensorFragment(DistributedTest): world_size = 2 @pytest.mark.parametrize('zero_stage', [1, 2, 3]) - @pytest.mark.parametrize( - 'offload_device', - [OffloadDeviceEnum.none, - OffloadDeviceEnum.cpu, - OffloadDeviceEnum.nvme]) + @pytest.mark.parametrize('offload_device', [OffloadDeviceEnum.none, OffloadDeviceEnum.cpu, OffloadDeviceEnum.nvme]) def test_zero_fragments(self, tmpdir, zero_stage, offload_device, frozen_weights): if offload_device == OffloadDeviceEnum.nvme: if zero_stage != 3: @@ -108,9 +102,7 @@ class TestTensorFragment(DistributedTest): } if offload_device == OffloadDeviceEnum.cpu: - config_dict["zero_optimization"]["offload_optimizer"] = { - "device": offload_device - } + config_dict["zero_optimization"]["offload_optimizer"] = {"device": offload_device} elif offload_device == OffloadDeviceEnum.nvme: config_dict["zero_optimization"]["offload_optimizer"] = { "device": offload_device, diff --git a/tests/unit/runtime/zero/test_zero_tiled.py b/tests/unit/runtime/zero/test_zero_tiled.py index 5858b59368721d1ca840fcf711b6dcc8e3ec40e6..96b9116126ac16667917dccff0d5b8a81cb0d8ea 100644 --- a/tests/unit/runtime/zero/test_zero_tiled.py +++ b/tests/unit/runtime/zero/test_zero_tiled.py @@ -1,4 +1,7 @@ -'''Copyright The Microsoft DeepSpeed Team''' +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team import copy @@ -120,6 +123,7 @@ class LinearWrapper(torch.nn.Linear): Megatron-LM optionally delays the bias addition to fuse with a proceeding kernel. """ + def forward(self, input): out = super().forward(input) return out, self.bias diff --git a/tests/unit/runtime/zero/utils.py b/tests/unit/runtime/zero/utils.py index 5f0687892d43ac6a78cb51d36ea6c7ad523cad39..ceb594a2a05d835a4a1a5568d20626ae69cf14de 100644 --- a/tests/unit/runtime/zero/utils.py +++ b/tests/unit/runtime/zero/utils.py @@ -1,4 +1,7 @@ -'''Copyright The Microsoft DeepSpeed Team''' +# Copyright (c) Microsoft Corporation. 
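# Illustrative sketch of the config shape that test_zero_fragments above builds: the
# base ZeRO config gains an "offload_optimizer" section when optimizer state is pushed
# to CPU or NVMe (NVMe only with stage 3). The concrete values here, including the
# nvme_path, are hypothetical placeholders rather than values from the diff.
config_dict = {
    "train_micro_batch_size_per_gpu": 1,
    "fp16": {"enabled": True},
    "zero_optimization": {"stage": 3},
}

offload_device = "cpu"  # or "nvme"; mirrors OffloadDeviceEnum in the test
if offload_device == "cpu":
    config_dict["zero_optimization"]["offload_optimizer"] = {"device": "cpu"}
elif offload_device == "nvme":
    config_dict["zero_optimization"]["offload_optimizer"] = {
        "device": "nvme",
        "nvme_path": "/tmp/nvme_offload",  # hypothetical scratch path
    }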
+# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team import os from unit.common import get_master_port diff --git a/tests/unit/simple_model.py b/tests/unit/simple_model.py index 481aae0bfdcd69a62aa5b73f5f8922338cb7546b..330f612e36470b32150410d8ccacc805a3f930ca 100644 --- a/tests/unit/simple_model.py +++ b/tests/unit/simple_model.py @@ -1,4 +1,7 @@ -'''Copyright The Microsoft DeepSpeed Team''' +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team import os import json @@ -13,11 +16,10 @@ import deepspeed.comm as dist class SimpleModel(torch.nn.Module): + def __init__(self, hidden_dim, empty_grad=False, nlayers=1): super(SimpleModel, self).__init__() - self.linears = torch.nn.ModuleList( - [torch.nn.Linear(hidden_dim, - hidden_dim) for i in range(nlayers)]) + self.linears = torch.nn.ModuleList([torch.nn.Linear(hidden_dim, hidden_dim) for i in range(nlayers)]) if empty_grad: self.linear2 = torch.nn.Linear(hidden_dim, hidden_dim) self.cross_entropy_loss = torch.nn.CrossEntropyLoss() @@ -32,7 +34,30 @@ class SimpleModel(torch.nn.Module): return self.cross_entropy_loss(x, y) +class SimpleFrozenModel(torch.nn.Module): + + def __init__(self, hidden_dim, empty_grad=False): + super(SimpleFrozenModel, self).__init__() + self.linears = torch.nn.ModuleList([torch.nn.Linear(hidden_dim, hidden_dim) for i in range(2)]) + if empty_grad: + self.linear2 = torch.nn.Linear(hidden_dim, hidden_dim) + self.cross_entropy_loss = torch.nn.CrossEntropyLoss() + self.empty_grad = empty_grad + # Freeze first layer + self.linears[0].weight.requires_grad = False + self.linears[0].bias.requires_grad = False + + def forward(self, x, y): + if len(self.linears) == 1: + x = self.linears[0](x) + else: + for i, l in enumerate(self.linears): + x = self.linears[i // 2](x) + l(x) + return self.cross_entropy_loss(x, y) + + class Curriculum_SimpleModel(SimpleModel): + def __init__(self, hidden_dim, empty_grad=False): super(Curriculum_SimpleModel, self).__init__(hidden_dim, empty_grad) @@ -43,6 +68,7 @@ class Curriculum_SimpleModel(SimpleModel): class SimpleMoEModel(torch.nn.Module): + def __init__(self, hidden_dim, num_experts=4, ep_size=1, use_residual=False): super(SimpleMoEModel, self).__init__() self.linear = torch.nn.Linear(hidden_dim, hidden_dim) @@ -72,6 +98,7 @@ class SimpleMoEModel(torch.nn.Module): class SimplePRMoEModel(torch.nn.Module): + def __init__(self, hidden_dim, num_experts=2, ep_size=1, use_residual=False): super(SimplePRMoEModel, self).__init__() self.linear = torch.nn.Linear(hidden_dim, hidden_dim) @@ -102,6 +129,7 @@ class SimplePRMoEModel(torch.nn.Module): class UnusedParametersModel(SimpleModel): + def __init__(self, hidden_dim, empty_grad=False): super().__init__(hidden_dim, empty_grad) @@ -109,21 +137,19 @@ class UnusedParametersModel(SimpleModel): class LinearStack(torch.nn.Module): + def __init__(self, input_dim=128, hidden_dim=128, output_dim=128, num_layers=4): super().__init__() self.input_dim = input_dim self.output_dim = output_dim self.hidden_dim = hidden_dim - self.input_layer = torch.nn.Linear(in_features=self.input_dim, - out_features=self.hidden_dim) + self.input_layer = torch.nn.Linear(in_features=self.input_dim, out_features=self.hidden_dim) self.layers = torch.nn.ModuleList([ - torch.nn.Linear(in_features=self.hidden_dim, - out_features=self.hidden_dim, - bias=False) for x in range(num_layers) + torch.nn.Linear(in_features=self.hidden_dim, out_features=self.hidden_dim, bias=False) + for x in range(num_layers) ]) - self.output_layer = 
torch.nn.Linear(in_features=self.hidden_dim, - out_features=self.output_dim) + self.output_layer = torch.nn.Linear(in_features=self.hidden_dim, out_features=self.output_dim) self.cross_entropy_loss = torch.nn.CrossEntropyLoss() @@ -136,12 +162,8 @@ class LinearStack(torch.nn.Module): class LinearStackPipe(PipelineModule): - def __init__(self, - input_dim=128, - hidden_dim=128, - output_dim=128, - num_layers=4, - **kwargs): + + def __init__(self, input_dim=128, hidden_dim=128, output_dim=128, num_layers=4, **kwargs): self.input_dim = input_dim self.output_dim = output_dim self.hidden_dim = hidden_dim @@ -150,11 +172,7 @@ class LinearStackPipe(PipelineModule): layers = [] layers.append(LayerSpec(torch.nn.Linear, self.input_dim, self.hidden_dim)) for x in range(self.num_layers): - layers.append( - LayerSpec(torch.nn.Linear, - self.hidden_dim, - self.hidden_dim, - bias=False)) + layers.append(LayerSpec(torch.nn.Linear, self.hidden_dim, self.hidden_dim, bias=False)) layers.append(lambda x: x) layers.append(LayerSpec(torch.nn.Linear, self.hidden_dim, self.output_dim)) @@ -162,6 +180,7 @@ class LinearStackPipe(PipelineModule): class SimpleOptimizer(torch.optim.Optimizer): + def __init__(self, params, lr=0.11072018): defaults = dict(lr=lr) super(SimpleOptimizer, self).__init__(params, defaults) @@ -185,6 +204,7 @@ class SimpleOptimizer(torch.optim.Optimizer): class HybridStateOptimizer(torch.optim.Optimizer): + def __init__(self, params, lr=0.11072018): defaults = dict(lr=lr) super(HybridStateOptimizer, self).__init__(params, defaults) @@ -216,6 +236,7 @@ class HybridStateOptimizer(torch.optim.Optimizer): class PLD_SimpleModel(SimpleModel): + def __init__(self, hidden_dim, empty_grad=False): super(PLD_SimpleModel, self).__init__(hidden_dim, empty_grad) @@ -228,9 +249,7 @@ class PLD_SimpleModel(SimpleModel): def random_dataset(total_samples, hidden_dim, device, dtype=torch.half): train_data = torch.randn(total_samples, hidden_dim, device=device, dtype=dtype) - train_label = torch.empty(total_samples, - dtype=torch.long, - device=device).random_(hidden_dim) + train_label = torch.empty(total_samples, dtype=torch.long, device=device).random_(hidden_dim) train_dataset = torch.utils.data.TensorDataset(train_data, train_label) return train_dataset @@ -242,21 +261,10 @@ def random_dataloader(model, total_samples, hidden_dim, device, dtype=torch.half return train_loader -def sequence_dataloader(model, - total_samples, - hidden_dim, - device, - seq_len: int = 32, - dtype=torch.half): +def sequence_dataloader(model, total_samples, hidden_dim, device, seq_len: int = 32, dtype=torch.half): batch_size = model.train_micro_batch_size_per_gpu() - train_data = torch.randn(total_samples, - seq_len, - hidden_dim, - device=device, - dtype=dtype) - train_label = torch.empty(total_samples, - dtype=torch.long, - device=device).random_(hidden_dim) + train_data = torch.randn(total_samples, seq_len, hidden_dim, device=device, dtype=dtype) + train_label = torch.empty(total_samples, dtype=torch.long, device=device).random_(hidden_dim) train_dataset = torch.utils.data.TensorDataset(train_data, train_label) train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size) return train_loader diff --git a/tests/unit/test_activation_checkpointing.py b/tests/unit/test_activation_checkpointing.py deleted file mode 100644 index e66f2abf7408b8982fa2cef13b9657d767883d61..0000000000000000000000000000000000000000 --- a/tests/unit/test_activation_checkpointing.py +++ /dev/null @@ -1,290 +0,0 @@ -# TODO: add tests with 
model parallelism for activation partitioning and other features. - -from copy import deepcopy - -import pytest - -import torch - -import deepspeed - -ckpt = deepspeed.checkpointing.checkpoint - -from .common import distributed_test - - -def _compute(module, *inputs, do_checkpoint=False): - if do_checkpoint: - outputs = ckpt(module, *inputs) - else: - outputs = module(*inputs) - - if torch.is_tensor(outputs): - outputs = (outputs, ) - - sum(o.sum() for o in outputs if torch.is_tensor(o) and o.requires_grad).backward() - - grads = [p.grad for p in module.parameters()] - input_grads = [inp.grad for inp in inputs if torch.is_tensor(inp)] - - return { - 'outputs': outputs, - 'module_grads': grads, - 'input_grads': input_grads, - } - - -def _prep_inputs(*inputs): - _inputs = [] - - for inp in inputs: - inp = deepcopy(inp) - if torch.is_tensor(inp): - inp = inp.cuda() - _inputs.append(inp) - - return tuple(_inputs) - - -def _match_outputs(ref, tgt): - assert type(ref) == type(tgt) - if type(ref) in [list, tuple]: - for x, y in zip(ref, tgt): - _match_outputs(x, y) - elif not torch.is_tensor(ref): - assert ref == tgt - elif ref.is_floating_point(): - assert torch.allclose(ref, tgt) - else: - assert torch.equal(ref, tgt) - - -# This is distributed because checkpoint() assumes that torch.distributed is initialized. -# torch.distributed is used with activation partitioning, but not for these simple cases. -@distributed_test(world_size=1) -def _test_activation_checkpoint(module, *inputs): - # Move to device - module.cuda() - - # Get rid of dropouts until we fork the RNG between tests. - module.eval() - - module_ = deepcopy(module) - inputs_ = _prep_inputs(*inputs) - base = _compute(module_, *inputs_, do_checkpoint=False) - - module_ = deepcopy(module) - inputs_ = _prep_inputs(*inputs) - test = _compute(module_, *inputs_, do_checkpoint=True) - - for group in base.keys(): - for b, t in zip(base[group], test[group]): - _match_outputs(b, t) - - -# This is distributed because checkpoint() assumes that torch.distributed is initialized. -# torch.distributed is used with activation partitioning, but not for these simple cases. -@distributed_test(world_size=1) -def _test_activation_checkpoint_ordering(module, expected_ordering, *inputs): - # Move to device - module.cuda() - - # Get rid of dropouts until we fork the RNG between tests. 
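# Illustrative sketch of what the removed test_activation_checkpointing.py verified:
# a module produces matching outputs and gradients whether it is called directly or
# through deepspeed.checkpointing.checkpoint, which recomputes activations during the
# backward pass. As the original file notes, checkpoint() assumes torch.distributed
# has been initialized, so this is only a sketch of the call pattern.
import torch
import deepspeed

ckpt = deepspeed.checkpointing.checkpoint


def forward_with_and_without_ckpt(module, x):
    ref = module(x)           # ordinary forward, activations kept for backward
    test = ckpt(module, x)    # checkpointed forward, activations recomputed later
    # the real test (_compute) runs each variant on a fresh copy of the module and
    # then compares outputs, parameter grads and input grads
    return ref, test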
- module.eval() - - module_ = deepcopy(module) - inputs_ = _prep_inputs(*inputs) - test = _compute(module_, *inputs_, do_checkpoint=True) - - outputs = test['outputs'] - test_ordering = [] - for item in outputs: - if type(item) in [list, tuple]: - test_ordering += [torch.is_tensor(t) for t in item] - else: - test_ordering += [torch.is_tensor(item)] - - assert expected_ordering == test_ordering - - -# -# Helpers -# - - -class MaskedLinear(torch.nn.Linear): - def forward(self, x, mask): - out = super().forward(x) - if mask.is_floating_point(): - out = out * mask - else: - # must cast BoolTensor in older torch versions - out = out * mask.type_as(out) - return out - - -class MaskedLinearSeq(MaskedLinear): - """Tests pipeline modules by also returning the mask.""" - def forward(self, x, mask): - return super().forward(x, mask), mask - - -class MaskedLinearSeqDup(MaskedLinearSeq): - """MaskedLinearSeq, but with more outputs than inputs and in a different order.""" - def forward(self, x, mask): - dup = x.clone().detach() * 1.38 # just an arbitrary scaling - x, mask = super().forward(x, mask) - return dup, x, mask - - -HIDDEN_DIM = 20 - - -def _mixed_mask(size=HIDDEN_DIM): - entries = torch.randn(size) - mask = torch.where(entries > 0, torch.ones(size), torch.zeros(size)) - mask = mask.bool() - return mask - - -def _bool_to_float(btensor, dtype=torch.float32): - """Converts a torch.BoolTensor to an equivalent dtype. """ - ones = torch.ones(size=btensor.size(), dtype=dtype) - zeros = torch.zeros(size=btensor.size(), dtype=dtype) - return torch.where(btensor, ones, zeros) - - -# -# Tests -# - - -def test_ckpt_inputs1_outputs1(): - module = torch.nn.Linear(HIDDEN_DIM, HIDDEN_DIM) - inputs = torch.rand(HIDDEN_DIM) - inputs.requires_grad = True - _test_activation_checkpoint(module, inputs) - - -# both bool and float are important, as bool is not differentiable -@pytest.mark.parametrize('mask', - [ - _mixed_mask(), - _bool_to_float(_mixed_mask()), - ]) -def test_ckpt_inputs2_outputs1(mask): - module = MaskedLinear(HIDDEN_DIM, HIDDEN_DIM) - inputs = torch.rand(HIDDEN_DIM) - inputs.requires_grad = True - _test_activation_checkpoint(module, inputs, mask) - - -@pytest.mark.parametrize('mask', - [ - _mixed_mask(), - _bool_to_float(_mixed_mask()), - ]) -def test_ckpt_inputs2_outputs2(mask): - module = MaskedLinearSeq(HIDDEN_DIM, HIDDEN_DIM) - inputs = torch.rand(HIDDEN_DIM) - inputs.requires_grad = True - _test_activation_checkpoint(module, inputs, mask) - - -@pytest.mark.parametrize('mask', - [ - _mixed_mask(), - _bool_to_float(_mixed_mask()), - ]) -def test_ckpt_inputs2_outputs3(mask): - module = MaskedLinearSeqDup(HIDDEN_DIM, HIDDEN_DIM) - inputs = torch.rand(HIDDEN_DIM) - inputs.requires_grad = True - _test_activation_checkpoint(module, inputs, mask) - - -class DropMaskLinear(torch.nn.Linear): - def forward(self, x, mask): - return super().forward(x) - - -def test_ckpt_arg_none(): - module = DropMaskLinear(HIDDEN_DIM, HIDDEN_DIM) - inputs = (torch.rand(HIDDEN_DIM), None) - inputs[0].requires_grad = True - _test_activation_checkpoint(module, *inputs) - - -class LinearNonTensorInput(torch.nn.Linear): - def forward(self, x, non_tensor_input): - return super().forward(x) - - -@pytest.mark.parametrize( - 'non_tensor_input', - [None, - 2, - True, - (None, - 2.5), - (None, - True, - torch.randn(HIDDEN_DIM))]) -def test_ckpt_non_tensor_input(non_tensor_input): - module = LinearNonTensorInput(HIDDEN_DIM, HIDDEN_DIM) - inputs = torch.rand(HIDDEN_DIM) - inputs.requires_grad = True - 
_test_activation_checkpoint(module, inputs, non_tensor_input) - - -class LinearNonTensorOutput(torch.nn.Linear): - def __init__(self, non_tensor_output): - super().__init__(HIDDEN_DIM, HIDDEN_DIM) - self.non_tensor_output = non_tensor_output - - def forward(self, x): - out = super().forward(x) - return out, self.non_tensor_output - - -@pytest.mark.parametrize( - 'non_tensor_output', - [None, - 2, - True, - (None, - 2.5), - (None, - True, - torch.randn(HIDDEN_DIM))]) -def test_ckpt_non_tensor_output(non_tensor_output): - module = LinearNonTensorOutput(non_tensor_output) - inputs = torch.rand(HIDDEN_DIM) - inputs.requires_grad = True - _test_activation_checkpoint(module, inputs) - - -@pytest.mark.parametrize('non_tensor_output', - [ - None, - (torch.randn(HIDDEN_DIM), - 2.5), - (None, - torch.randn(HIDDEN_DIM), - True), - (None, - True, - torch.randn(HIDDEN_DIM)) - ]) -def test_ckpt_non_tensor_output_ordering(non_tensor_output): - module = LinearNonTensorOutput(non_tensor_output) - inputs = torch.rand(HIDDEN_DIM) - inputs.requires_grad = True - - # First return is a tensor - ordering = [True] - if type(non_tensor_output) in [list, tuple]: - ordering += [torch.is_tensor(t) for t in non_tensor_output] - else: - ordering += [torch.is_tensor(non_tensor_output)] - _test_activation_checkpoint_ordering(module, ordering, inputs) diff --git a/tests/unit/test_adamw.py b/tests/unit/test_adamw.py deleted file mode 100644 index b4bfbf3c260c5286560b117ba5d7b545ffe6259d..0000000000000000000000000000000000000000 --- a/tests/unit/test_adamw.py +++ /dev/null @@ -1,73 +0,0 @@ -import deepspeed -import torch -import pytest - -from deepspeed.ops.adam import FusedAdam -from deepspeed.ops.adam import DeepSpeedCPUAdam -from .common import distributed_test -from .simple_model import SimpleModel, args_from_dict - -# yapf: disable -#'optimizer, zero_offload, torch_adam, adam_w_mode, resulting_optimizer -adam_configs = [["AdamW", False, False, False, (FusedAdam, True)], - ["AdamW", False, True, False, (torch.optim.AdamW, None)], - ["AdamW", True, False, False, (DeepSpeedCPUAdam, True)], - ["AdamW", True, True, False, (torch.optim.AdamW, None)], - ["AdamW", False, False, True, (FusedAdam, True)], - ["AdamW", False, True, True, (torch.optim.AdamW, None)], - ["AdamW", True, False, True, (DeepSpeedCPUAdam, True)], - ["AdamW", True, True, True, (torch.optim.AdamW, None)], - ["Adam", False, False, False, (FusedAdam, False)], - ["Adam", False, True, False, (torch.optim.Adam, None)], - ["Adam", True, False, False, (DeepSpeedCPUAdam, False)], - ["Adam", True, True, False, (torch.optim.Adam, None)], - ["Adam", False, False, True, (FusedAdam, True)], - ["Adam", False, True, True, (torch.optim.AdamW, None)], - ["Adam", True, False, True, (DeepSpeedCPUAdam, True)], - ["Adam", True, True, True, (torch.optim.AdamW, None)]] - -@pytest.mark.parametrize( - 'optimizer, zero_offload, torch_adam, adam_w_mode, resulting_optimizer', - adam_configs) -def test_adam_configs(tmpdir, - optimizer, - zero_offload, - torch_adam, - adam_w_mode, - resulting_optimizer): - config_dict = { - "train_batch_size": 2, - "steps_per_print": 1, - "optimizer": { - "type": optimizer, - "params": { - "lr": 0.00015, - "torch_adam": torch_adam, - "adam_w_mode": adam_w_mode - } - }, - "gradient_clipping": 1.0, - "fp16": { - "enabled": True - }, - "zero_optimization": { - "stage": 2, - "cpu_offload": zero_offload - } - } - args = args_from_dict(tmpdir, config_dict) - - @distributed_test(world_size=[1]) - def helper(args): - model = SimpleModel(10) - model, _, _, _ = 
deepspeed.initialize(args=args, - model=model, - model_parameters=model.parameters()) - # get base optimizer under zero - ds_optimizer = model.optimizer.optimizer - opt_class, adam_w_mode = resulting_optimizer - assert isinstance(ds_optimizer, opt_class) - if adam_w_mode in [True, False]: - assert ds_optimizer.adam_w_mode == adam_w_mode - - helper(args) diff --git a/tests/unit/test_aio.py b/tests/unit/test_aio.py deleted file mode 100644 index fdec95a35ae79ab109be1df97e497bae8c46f219..0000000000000000000000000000000000000000 --- a/tests/unit/test_aio.py +++ /dev/null @@ -1,335 +0,0 @@ -import pytest -import os -import filecmp -import torch -import deepspeed -import torch.distributed as dist -from deepspeed.ops.aio import AsyncIOBuilder -from .common import distributed_test - -MEGA_BYTE = 1024**2 -BLOCK_SIZE = MEGA_BYTE -QUEUE_DEPTH = 2 -IO_SIZE = 16 * MEGA_BYTE -IO_PARALLEL = 2 - - -def _skip_if_no_aio(): - if not deepspeed.ops.__compatible_ops__[AsyncIOBuilder.NAME]: - pytest.skip('Skip tests since async-io is not compatible') - - -def _do_ref_write(tmpdir, index=0): - file_suffix = f'{dist.get_rank()}_{index}' - ref_file = os.path.join(tmpdir, f'_py_random_{file_suffix}.pt') - ref_buffer = os.urandom(IO_SIZE) - with open(ref_file, 'wb') as f: - f.write(ref_buffer) - - return ref_file, ref_buffer - - -def _get_test_file_and_buffer(tmpdir, ref_buffer, cuda_device, index=0): - file_suffix = f'{dist.get_rank()}_{index}' - test_file = os.path.join(tmpdir, f'_aio_write_random_{file_suffix}.pt') - if cuda_device: - test_buffer = torch.cuda.ByteTensor(list(ref_buffer)) - else: - test_buffer = torch.ByteTensor(list(ref_buffer)).pin_memory() - - return test_file, test_buffer - - -def _validate_handle_state(handle, single_submit, overlap_events): - assert handle.get_single_submit() == single_submit - assert handle.get_overlap_events() == overlap_events - assert handle.get_thread_count() == IO_PARALLEL - assert handle.get_block_size() == BLOCK_SIZE - assert handle.get_queue_depth() == QUEUE_DEPTH - - -@pytest.mark.parametrize('single_submit, overlap_events', - [(False, - False), - (False, - True), - (True, - False), - (True, - True)]) -def test_parallel_read(tmpdir, single_submit, overlap_events): - _skip_if_no_aio() - - @distributed_test(world_size=[2]) - def _test_parallel_read(single_submit, overlap_events): - ref_file, _ = _do_ref_write(tmpdir) - - aio_buffer = torch.empty(IO_SIZE, dtype=torch.uint8, device='cpu').pin_memory() - h = AsyncIOBuilder().load().aio_handle(BLOCK_SIZE, - QUEUE_DEPTH, - single_submit, - overlap_events, - IO_PARALLEL) - - _validate_handle_state(h, single_submit, overlap_events) - - read_status = h.sync_pread(aio_buffer, ref_file) - assert read_status == 1 - - with open(ref_file, 'rb') as f: - ref_buffer = list(f.read()) - assert ref_buffer == aio_buffer.tolist() - - _test_parallel_read(single_submit, overlap_events) - - -@pytest.mark.parametrize('single_submit, overlap_events, cuda_device', - [(False, - False, - False), - (False, - True, - False), - (True, - False, - False), - (True, - True, - False), - (False, - False, - True), - (True, - True, - True)]) -def test_async_read(tmpdir, single_submit, overlap_events, cuda_device): - - _skip_if_no_aio() - - @distributed_test(world_size=[2]) - def _test_async_read(single_submit, overlap_events, cuda_device): - ref_file, _ = _do_ref_write(tmpdir) - - if cuda_device: - aio_buffer = torch.empty(IO_SIZE, dtype=torch.uint8, device='cuda') - else: - aio_buffer = torch.empty(IO_SIZE, - dtype=torch.uint8, - device='cpu').pin_memory() 
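# Illustrative sketch of the optimizer selection covered by the removed test_adamw.py
# above: the "torch_adam" and "adam_w_mode" optimizer params, together with ZeRO
# cpu_offload, decide whether DeepSpeed builds FusedAdam, DeepSpeedCPUAdam, or a plain
# torch.optim.Adam/AdamW. After deepspeed.initialize the wrapped optimizer is reachable
# as model.optimizer.optimizer, which is what the test asserts on.
config_dict = {
    "train_batch_size": 2,
    "optimizer": {
        "type": "AdamW",
        "params": {
            "lr": 0.00015,
            "torch_adam": False,   # False -> use DeepSpeed's fused / CPU Adam kernels
            "adam_w_mode": True,   # True  -> decoupled weight decay (AdamW behavior)
        },
    },
    "fp16": {"enabled": True},
    "zero_optimization": {"stage": 2, "cpu_offload": False},
}
# with this row of the config matrix the test expects
# isinstance(model.optimizer.optimizer, FusedAdam) with adam_w_mode == True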
- - h = AsyncIOBuilder().load().aio_handle(BLOCK_SIZE, - QUEUE_DEPTH, - single_submit, - overlap_events, - IO_PARALLEL) - - _validate_handle_state(h, single_submit, overlap_events) - - read_status = h.async_pread(aio_buffer, ref_file) - assert read_status == 0 - - wait_status = h.wait() - assert wait_status == 1 - - with open(ref_file, 'rb') as f: - ref_buffer = list(f.read()) - assert ref_buffer == aio_buffer.tolist() - - _test_async_read(single_submit, overlap_events, cuda_device) - - -@pytest.mark.parametrize('single_submit, overlap_events', - [(False, - False), - (False, - True), - (True, - False), - (True, - True)]) -def test_parallel_write(tmpdir, single_submit, overlap_events): - - _skip_if_no_aio() - - @distributed_test(world_size=[2]) - def _test_parallel_write(single_submit, overlap_events): - ref_file, ref_buffer = _do_ref_write(tmpdir) - - aio_file, aio_buffer = _get_test_file_and_buffer(tmpdir, ref_buffer, False) - - h = AsyncIOBuilder().load().aio_handle(BLOCK_SIZE, - QUEUE_DEPTH, - single_submit, - overlap_events, - IO_PARALLEL) - - _validate_handle_state(h, single_submit, overlap_events) - - write_status = h.sync_pwrite(aio_buffer, aio_file) - assert write_status == 1 - - assert os.path.isfile(aio_file) - - filecmp.clear_cache() - assert filecmp.cmp(ref_file, aio_file, shallow=False) - - _test_parallel_write(single_submit, overlap_events) - - -@pytest.mark.parametrize('single_submit, overlap_events, cuda_device', - [(False, - False, - False), - (False, - True, - False), - (True, - False, - False), - (True, - True, - False), - (False, - False, - True), - (True, - True, - True)]) -def test_async_write(tmpdir, single_submit, overlap_events, cuda_device): - - _skip_if_no_aio() - - @distributed_test(world_size=[2]) - def _test_async_write(single_submit, overlap_events, cuda_device): - ref_file, ref_buffer = _do_ref_write(tmpdir) - - aio_file, aio_buffer = _get_test_file_and_buffer(tmpdir, ref_buffer, cuda_device) - - h = AsyncIOBuilder().load().aio_handle(BLOCK_SIZE, - QUEUE_DEPTH, - single_submit, - overlap_events, - IO_PARALLEL) - - _validate_handle_state(h, single_submit, overlap_events) - - write_status = h.async_pwrite(aio_buffer, aio_file) - assert write_status == 0 - - wait_status = h.wait() - assert wait_status == 1 - - assert os.path.isfile(aio_file) - - filecmp.clear_cache() - assert filecmp.cmp(ref_file, aio_file, shallow=False) - - _test_async_write(single_submit, overlap_events, cuda_device) - - -@pytest.mark.parametrize('async_queue, cuda_device', - [(2, - False), - (4, - False), - (2, - True), - (4, - True)]) -def test_async_queue_read(tmpdir, async_queue, cuda_device): - - _skip_if_no_aio() - - @distributed_test(world_size=[2]) - def _test_async_queue_read(async_queue, cuda_device): - ref_files = [] - for i in range(async_queue): - f, _ = _do_ref_write(tmpdir, i) - ref_files.append(f) - - aio_buffers = [] - for i in range(async_queue): - if cuda_device: - buf = torch.empty(IO_SIZE, dtype=torch.uint8, device='cuda') - else: - buf = torch.empty(IO_SIZE, dtype=torch.uint8, device='cpu').pin_memory() - aio_buffers.append(buf) - - single_submit = True - overlap_events = True - h = AsyncIOBuilder().load().aio_handle(BLOCK_SIZE, - QUEUE_DEPTH, - single_submit, - overlap_events, - IO_PARALLEL) - - _validate_handle_state(h, single_submit, overlap_events) - - for i in range(async_queue): - read_status = h.async_pread(aio_buffers[i], ref_files[i]) - assert read_status == 0 - - wait_status = h.wait() - assert wait_status == async_queue - - for i in range(async_queue): - with 
open(ref_files[i], 'rb') as f: - ref_buffer = list(f.read()) - assert ref_buffer == aio_buffers[i].tolist() - - _test_async_queue_read(async_queue, cuda_device) - - -@pytest.mark.parametrize('async_queue, cuda_device', - [(2, - False), - (7, - False), - (2, - True), - (7, - True)]) -def test_async_queue_write(tmpdir, async_queue, cuda_device): - - _skip_if_no_aio() - - @distributed_test(world_size=[2]) - def _test_async_queue_write(async_queue, cuda_device): - ref_files = [] - ref_buffers = [] - for i in range(async_queue): - f, buf = _do_ref_write(tmpdir, i) - ref_files.append(f) - ref_buffers.append(buf) - - aio_files = [] - aio_buffers = [] - for i in range(async_queue): - f, buf = _get_test_file_and_buffer(tmpdir, ref_buffers[i], cuda_device, i) - aio_files.append(f) - aio_buffers.append(buf) - - single_submit = True - overlap_events = True - h = AsyncIOBuilder().load().aio_handle(BLOCK_SIZE, - QUEUE_DEPTH, - single_submit, - overlap_events, - IO_PARALLEL) - - _validate_handle_state(h, single_submit, overlap_events) - - for i in range(async_queue): - read_status = h.async_pwrite(aio_buffers[i], aio_files[i]) - assert read_status == 0 - - wait_status = h.wait() - assert wait_status == async_queue - - for i in range(async_queue): - assert os.path.isfile(aio_files[i]) - - filecmp.clear_cache() - assert filecmp.cmp(ref_files[i], aio_files[i], shallow=False) - - _test_async_queue_write(async_queue, cuda_device) diff --git a/tests/unit/test_autocast.py b/tests/unit/test_autocast.py deleted file mode 100644 index 004cd8533869e4a1fab7c6cf50cfa784539ca008..0000000000000000000000000000000000000000 --- a/tests/unit/test_autocast.py +++ /dev/null @@ -1,73 +0,0 @@ -import pytest -import torch -import deepspeed -from deepspeed.runtime.zero.linear import LinearModuleForZeroStage3 - - -def _skip_autocast_test(): - try: - from torch.cuda.amp import custom_fwd, custom_bwd - except (ImportError, AttributeError) as exp: - return True - - return False - - -@pytest.mark.parametrize('half_op', [False, True]) -def test_missing_amp_autocast(tmpdir, half_op): - hidden_dim = 4 - if half_op: - input = torch.randn(hidden_dim).cuda().half() - ds_linear = LinearModuleForZeroStage3(hidden_dim, hidden_dim).cuda().half() - else: - input = torch.randn(hidden_dim).cuda() - ds_linear = LinearModuleForZeroStage3(hidden_dim, hidden_dim).cuda() - - output = ds_linear(input) - assert output.dtype == ds_linear.weight.dtype - - -@pytest.mark.parametrize('half_op', [False, True]) -def test_disable_autocast_linear(tmpdir, half_op): - if _skip_autocast_test(): - pytest.skip("amp autocast is not available") - - hidden_dim = 4 - if half_op: - input = torch.randn(hidden_dim).cuda().half() - ds_linear = LinearModuleForZeroStage3(hidden_dim, hidden_dim).cuda().half() - else: - input = torch.randn(hidden_dim).cuda() - ds_linear = LinearModuleForZeroStage3(hidden_dim, hidden_dim).cuda() - - with torch.cuda.amp.autocast(False): - output = ds_linear(input) - assert output.dtype == ds_linear.weight.dtype - - -@pytest.mark.parametrize('half_input, half_weight', - [(False, - False), - (False, - True), - (True, - False), - (True, - True)]) -def test_autocast_linear(tmpdir, half_input, half_weight): - if _skip_autocast_test(): - pytest.skip("amp autocast is not available") - - hidden_dim = 4 - input = torch.randn(hidden_dim).cuda() - ds_linear = LinearModuleForZeroStage3(hidden_dim, hidden_dim).cuda() - - if half_input: - input = input.half() - - if half_weight: - ds_linear = ds_linear.half() - - with torch.cuda.amp.autocast(): - output = 
ds_linear(input) - assert output.dtype == torch.half diff --git a/tests/unit/test_autotuning.py b/tests/unit/test_autotuning.py deleted file mode 100644 index 2a7898b8af0a21159c704d295a85aaa071db8dd2..0000000000000000000000000000000000000000 --- a/tests/unit/test_autotuning.py +++ /dev/null @@ -1,85 +0,0 @@ -import os -import pytest -import torch -from .simple_model import create_config_from_dict -from deepspeed.launcher import runner as dsrun -from deepspeed.autotuning.autotuner import Autotuner -from deepspeed.autotuning.scheduler import ResourceManager - -RUN_OPTION = 'run' -TUNE_OPTION = 'tune' - - -def test_command_line(): - '''Validate handling of command line arguments''' - for opt in [RUN_OPTION, TUNE_OPTION]: - dsrun.parse_args( - args=f"--num_nodes 1 --num_gpus 1 --autotuning {opt} foo.py".split()) - - for error_opts in [ - "--autotuning --num_nodes 1 --num_gpus 1 foo.py".split(), - "--autotuning test --num_nodes 1 -- num_gpus 1 foo.py".split(), - "--autotuning".split() - ]: - with pytest.raises(SystemExit): - dsrun.parse_args(args=error_opts) - - -@pytest.mark.parametrize("arg_mappings", - [ - None, - { - }, - { - "train_micro_batch_size_per_gpu": "--per_device_train_batch_size" - }, - { - "train_micro_batch_size_per_gpu": "--per_device_train_batch_size", - "gradient_accumulation_steps": "--gradient_accumulation_steps" - }, - { - "train_batch_size": "-tbs" - } - ]) # yapf: disable -def test_resource_manager_arg_mappings(arg_mappings): - rm = ResourceManager(args=None, - hosts="worker-0, worker-1", - num_gpus_per_node=4, - results_dir=None, - exps_dir=None, - arg_mappings=arg_mappings) - - if arg_mappings is not None: - for k, v in arg_mappings.items(): - assert k.strip() in rm.arg_mappings.keys() - assert arg_mappings[k.strip()].strip() == rm.arg_mappings[k.strip()] - - -@pytest.mark.parametrize("active_resources", - [ - {"worker-0": [0, 1, 2, 3]}, - {"worker-0": [0, 1, 2, 3], "worker-1": [0, 1, 2, 3]}, - {"worker-0": [0], "worker-1": [0, 1, 2], "worker-2": [0, 1, 2]}, - {"worker-0": [0, 1], "worker-2": [4, 5]} - ] - ) # yapf: disable -def test_autotuner_resources(tmpdir, active_resources): - config_dict = { - "autotuning": { - "enabled": True, - "exps_dir": os.path.join(tmpdir, - 'exps_dir'), - "arg_mappings": {} - } - } - config_path = create_config_from_dict(tmpdir, config_dict) - args = dsrun.parse_args( - args=f'--autotuning {TUNE_OPTION} foo.py --deepspeed_config {config_path}'.split( - )) - tuner = Autotuner(args=args, active_resources=active_resources) - - expected_num_nodes = len(list(active_resources.keys())) - assert expected_num_nodes == tuner.exp_num_nodes - - expected_num_gpus = min([len(v) for v in active_resources.values()]) - assert expected_num_gpus == tuner.exp_num_gpus diff --git a/tests/unit/test_bf16.py b/tests/unit/test_bf16.py deleted file mode 100644 index aa2ab132394c56d069cfdfe752b167c903f901dc..0000000000000000000000000000000000000000 --- a/tests/unit/test_bf16.py +++ /dev/null @@ -1,321 +0,0 @@ -import math -import torch -import deepspeed -import pytest -from deepspeed.ops.adam import FusedAdam -from .common import distributed_test -from deepspeed.ops.op_builder import CPUAdamBuilder -from .simple_model import SimpleModel, SimpleOptimizer, random_dataloader, args_from_dict -from .util import bf16_required_version_check - - -@pytest.mark.parametrize('zero_stage, use_cpu_offload', [(2, False)]) -def test_adam_bf16_zero_onecycle_compatibility(tmpdir, zero_stage, use_cpu_offload): - if not bf16_required_version_check(): - pytest.skip( - " DeepSpeed 
BFloat16 tests need torch >= 1.10, NCCL >= 2.10.3, CUDA > =11.0 and HW support for BFloat16 to run correctly" - ) - - if use_cpu_offload and not deepspeed.ops.__compatible_ops__[CPUAdamBuilder.NAME]: - pytest.skip("cpu-adam is not compatible") - - config_dict = { - "train_batch_size": 1, - "steps_per_print": 1, - "optimizer": { - "type": "Adam", - "params": { - "lr": 0.00015 - } - }, - "scheduler": { - "type": "OneCycle", - "params": { - "cycle_first_step_size": 16000, - "cycle_first_stair_count": 8000, - "decay_step_size": 16000, - "cycle_min_lr": 1e-06, - "cycle_max_lr": 3e-05, - "decay_lr_rate": 1e-07, - "cycle_min_mom": 0.85, - "cycle_max_mom": 0.99, - "decay_mom_rate": 0.0 - } - }, - "fp16": { - "enabled": False - }, - "bf16": { - "enabled": True - }, - "zero_optimization": { - "stage": zero_stage, - "cpu_offload": use_cpu_offload - } - } - - args = args_from_dict(tmpdir, config_dict) - hidden_dim = 10 - - @distributed_test(world_size=[1]) - def _test_adam_bf16_zero_onecycle_compatibility(args, zero_stage, hidden_dim): - model = SimpleModel(hidden_dim) - - model, _, _, _ = deepspeed.initialize(args=args, - model=model, - model_parameters=model.parameters()) - data_loader = random_dataloader(model=model, - total_samples=50, - hidden_dim=hidden_dim, - device=model.device, - dtype=torch.bfloat16) - for n, batch in enumerate(data_loader): - loss = model(batch[0], batch[1]) - model.backward(loss) - model.step() - - _test_adam_bf16_zero_onecycle_compatibility(args=args, - zero_stage=zero_stage, - hidden_dim=hidden_dim) - - -@pytest.mark.parametrize('zero_stage, use_cpu_offload', [(2, False)]) -def test_zero_allow_untested_optimizer(tmpdir, zero_stage, use_cpu_offload): - if not bf16_required_version_check(): - pytest.skip( - " DeepSpeed BFloat16 tests need torch >= 1.10, NCCL >= 2.10.3, CUDA > =11.0 and HW support for BFloat16 to run correctly" - ) - - if use_cpu_offload and not deepspeed.ops.__compatible_ops__[CPUAdamBuilder.NAME]: - pytest.skip("cpu-adam is not compatible") - - config_dict = { - "train_batch_size": 4, - "steps_per_print": 1, - "fp16": { - "enabled": False, - }, - "bf16": { - "enabled": True - }, - "zero_optimization": { - "stage": zero_stage, - "cpu_offload": use_cpu_offload - }, - "zero_allow_untested_optimizer": False - } - args = args_from_dict(tmpdir, config_dict) - - @distributed_test(world_size=[1]) - def _test_zero_allow_untested_optimizer(args, zero_stage): - hidden_dim = 10 - model = SimpleModel(hidden_dim) - optimizer = SimpleOptimizer(model.parameters()) - with pytest.raises(AssertionError): - model, optim, _, _ = deepspeed.initialize(args=args, - model=model, - optimizer=optimizer, - model_parameters=model.parameters()) - - _test_zero_allow_untested_optimizer(args, zero_stage) - - -@pytest.mark.parametrize('zero_stage, use_cpu_offload', [(2, False)]) -def test_zero_empty_partition(tmpdir, zero_stage, use_cpu_offload): - if not bf16_required_version_check(): - pytest.skip( - " DeepSpeed BFloat16 tests need torch >= 1.10, NCCL >= 2.10.3, CUDA > =11.0 and HW support for BFloat16 to run correctly" - ) - - if use_cpu_offload and not deepspeed.ops.__compatible_ops__[CPUAdamBuilder.NAME]: - pytest.skip("cpu-adam is not compatible") - - if zero_stage == 3: - pytest.skip("skip for now") - - config_dict = { - "train_micro_batch_size_per_gpu": 1, - "gradient_accumulation_steps": 1, - "fp16": { - "enabled": False - }, - "bf16": { - "enabled": True - }, - "optimizer": { - "type": "Adam", - "params": { - "lr": 0.00015 - } - }, - "zero_optimization": { - "stage": zero_stage, 
- "cpu_offload": use_cpu_offload, - "reduce_bucket_size": 100, - "allgather_bucket_size": 100 - } - } - args = args_from_dict(tmpdir, config_dict) - - @distributed_test(world_size=[3]) - def _test_zero_empty_partition(args, zero_stage): - hidden_dim = 1 - model = SimpleModel(hidden_dim) - - # Ensure model has 2 parameters, to cause empty partition with DP=3 - assert len(list(model.parameters())) == 2 - model, _, _, _ = deepspeed.initialize(args=args, - model=model, - model_parameters=model.parameters()) - - # Now make sure things work.. - data_loader = random_dataloader(model=model, - total_samples=1, - hidden_dim=hidden_dim, - device=model.device, - dtype=torch.bfloat16) - for n, batch in enumerate(data_loader): - loss = model(batch[0], batch[1]) - model.backward(loss) - model.step() - - _test_zero_empty_partition(args=args, zero_stage=zero_stage) - - -@pytest.mark.parametrize('zero_stage, optimizer_constructor', - [(2, - torch.optim.Adam), - (2, - FusedAdam)]) -def test_zero_supported_client_optimizer(tmpdir, zero_stage, optimizer_constructor): - if not bf16_required_version_check(): - pytest.skip( - " DeepSpeed BFloat16 tests need torch >= 1.10, NCCL >= 2.10.3, CUDA > =11.0 and HW support for BFloat16 to run correctly" - ) - - config_dict = { - "train_batch_size": 2, - "steps_per_print": 1, - "fp16": { - "enabled": False - }, - "bf16": { - "enabled": True - }, - "zero_optimization": { - "stage": zero_stage - } - } - args = args_from_dict(tmpdir, config_dict) - hidden_dim = 10 - - @distributed_test(world_size=[1]) - def _test_zero_supported_client_optimizer(args, zero_stage, optimizer_constructor): - model = SimpleModel(hidden_dim) - - client_optimizer = optimizer_constructor(params=model.parameters()) - model, _, _, _ = deepspeed.initialize(args=args, - model=model, - optimizer=client_optimizer) - - _test_zero_supported_client_optimizer(args=args, - zero_stage=zero_stage, - optimizer_constructor=optimizer_constructor) - - -def test_zero2_reduce_scatter_off(tmpdir): - if not bf16_required_version_check(): - pytest.skip( - " DeepSpeed BFloat16 tests need torch >= 1.10, NCCL >= 2.10.3, CUDA > =11.0 and HW support for BFloat16 to run correctly" - ) - - config_dict = { - "train_batch_size": 2, - "steps_per_print": 1, - "optimizer": { - "type": "Adam", - "params": { - "lr": 0.00015 - } - }, - "gradient_clipping": 1.0, - "zero_optimization": { - "stage": 2, - "contiguous_gradients": True, - "allgather_bucket_size": 2000000000, - "reduce_bucket_size": 200000000, - "overlap_comm": False, - "reduce_scatter": False - }, - "fp16": { - "enabled": False - }, - "bf16": { - "enabled": True - } - } - args = args_from_dict(tmpdir, config_dict) - hidden_dim = 10 - - model = SimpleModel(hidden_dim) - - @distributed_test(world_size=[2]) - def _helper(args, model, hidden_dim): - model, _, _, _ = deepspeed.initialize(args=args, - model=model, - model_parameters=model.parameters()) - data_loader = random_dataloader(model=model, - total_samples=50, - hidden_dim=hidden_dim, - device=model.device, - dtype=torch.bfloat16) - for n, batch in enumerate(data_loader): - loss = model(batch[0], batch[1]) - model.backward(loss) - model.step() - - _helper(args=args, model=model, hidden_dim=hidden_dim) - - -@pytest.mark.parametrize('stage', [2]) -def test_zero_empty_grad(tmpdir, stage): - if not bf16_required_version_check(): - pytest.skip( - " DeepSpeed BFloat16 tests need torch >= 1.10, NCCL >= 2.10.3, CUDA > =11.0 and HW support for BFloat16 to run correctly" - ) - - config_dict = { - "train_batch_size": 1, - 
"steps_per_print": 1, - "fp16": { - "enabled": False - }, - "bf16": { - "enabled": True - }, - "zero_optimization": { - "stage": stage - } - } - args = args_from_dict(tmpdir, config_dict) - hidden_dim = 10 - - model = SimpleModel(hidden_dim) - - @distributed_test(world_size=[1]) - def _go(args, model, hidden_dim): - optimizer = torch.optim.Adam(model.parameters()) - model, _, _, _ = deepspeed.initialize(args=args, - model=model, - optimizer=optimizer) - data_loader = random_dataloader(model=model, - total_samples=50, - hidden_dim=hidden_dim, - device=model.device, - dtype=torch.bfloat16) - for n, batch in enumerate(data_loader): - loss = model(batch[0], batch[1]) - model.backward(loss) - model.step() - - _go(args=args, model=model, hidden_dim=hidden_dim) diff --git a/tests/unit/test_checkpointing.py b/tests/unit/test_checkpointing.py deleted file mode 100644 index c989f226cf2d8884ad6687db7f581558879df984..0000000000000000000000000000000000000000 --- a/tests/unit/test_checkpointing.py +++ /dev/null @@ -1,1403 +0,0 @@ -import torch - -import torch.distributed as dist - -import deepspeed -from deepspeed.runtime.zero.stage_1_and_2 import DeepSpeedZeroOptimizer -from deepspeed.utils import groups -from deepspeed.runtime.fp16.fused_optimizer import FP16_Optimizer -from deepspeed.runtime.fp16.unfused_optimizer import FP16_UnfusedOptimizer -from deepspeed.moe.utils import split_params_into_different_moe_groups_for_optimizer - -from deepspeed.runtime.pipe.topology import * - -PipeTopo = PipeDataParallelTopology - -from deepspeed.ops.op_builder import FusedLambBuilder, CPUAdamBuilder - -from deepspeed.runtime.zero.stage3 import DeepSpeedZeroOptimizer_Stage3 -from .util import required_torch_version - -import itertools -import argparse -import pytest -import json -import os -import numbers -from .common import distributed_test -from .simple_model import * - - -def compare_deepspeed_states(saved_model, loaded_model): - # These are compared in more depth in other places - assert hasattr(loaded_model, 'module') - - assert saved_model.sparse_tensor_module_names == loaded_model.sparse_tensor_module_names - assert saved_model.skipped_steps == loaded_model.skipped_steps - assert saved_model.global_steps == loaded_model.global_steps - - -def compare_model_states(saved_model, - loaded_model, - compare_optimizer=True, - load_module_only=False): - if not load_module_only: - compare_deepspeed_states(saved_model, loaded_model) - - for p0, p1 in zip(saved_model.module.named_parameters(), loaded_model.module.named_parameters()): - np0, p0 = p0 - np1, p1 = p1 - if 'deepspeed_moe.gate.wg' in np0: - # these params are converted to float at runtime, cast to half for comparison - p1 = p1.half() - p0 = p0.half() - assert id(p0) != id(p1), f'Comparing fp16 model state tensor against itself : {id(p0)} <====> {id(p1)}' - try: - assert torch.allclose(p0, p1, atol=1e-07), f"FP16 model state {p0} is not equal to {p1}, names:{np0}, {np1}" - except RuntimeError as err: - print(f"FP16 model state {p0} is not equal to {p1}, names:{np0}, {np1}") - raise err - - if not compare_optimizer: - return - - if DeepSpeedZeroOptimizer_Stage3 is not None and isinstance( - saved_model.optimizer, - DeepSpeedZeroOptimizer_Stage3): - for p0, p1 in zip(saved_model.optimizer.fp32_partitioned_groups_flat, loaded_model.optimizer.fp32_partitioned_groups_flat): - assert torch.allclose(p0, p1, atol=1e-07), f"Fp32 model states {p0} is not equal to {p1}" - - elif isinstance(saved_model.optimizer, DeepSpeedZeroOptimizer): - for p0, p1 in 
zip(saved_model.optimizer.single_partition_of_fp32_groups, loaded_model.optimizer.single_partition_of_fp32_groups): - assert id(p0) != id(p1), f'Comparing fp32 model state tensor against itself: {id(p0)} <====> {id(p1)}' - assert torch.allclose(p0, p1, atol=1e-07), f"Fp32 model states {p0} is not equal to {p1}" - - elif isinstance(saved_model.optimizer, FP16_Optimizer): - for p0, p1 in zip(saved_model.optimizer.fp32_groups_flat, loaded_model.optimizer.fp32_groups_flat): - assert id(p0) != id(p1), f'Comparing fp32 model state tensor against itself: {id(p0)} <====> {id(p1)}' - assert torch.allclose(p0, p1, atol=1e-07), f"FP32 model states {p0} is not equal to {p1}" - - elif isinstance(saved_model.optimizer, FP16_UnfusedOptimizer): - for params0, params1 in zip(saved_model.optimizer.fp32_groups, loaded_model.optimizer.fp32_groups): - for p0, p1 in zip(params0, params1): - assert id(p0) != id(p1), f'Comparing fp32 model state tensor against itself: {id(p0)} <====> {id(p1)}' - assert torch.allclose(p0, p1, atol=1e-07), f"FP32 model states {p0} is not equal to {p1}" - elif isinstance(saved_model.optimizer, torch.optim.Optimizer): - pass - else: - assert False, f'Unexpected Optimizer Type: {saved_model.optimizer}' - - -def compare_optimizer_states(saved_model, loaded_model, hidden_dim, fp16=True): - saved_optimizer = saved_model.optimizer.optimizer if fp16 else saved_model.optimizer - loaded_optimizer = loaded_model.optimizer.optimizer if fp16 else loaded_model.optimizer - - for state0, state1 in zip(saved_optimizer.state.values(), - loaded_optimizer.state.values()): - for s0, s1 in zip(state0.values(), state1.values()): - if isinstance(s0, torch.Tensor) and isinstance(s1, torch.Tensor): - assert id(s0) != id(s1), f'Comparing optimizer state tensor against itself: {id(s0)} <====> {id(s1)}' - assert torch.equal(s0, s1) - else: - assert s0 == s1 - - -def compare_lr_scheduler_states(saved_model, loaded_model): - assert hasattr(saved_model, 'lr_scheduler') - assert hasattr(loaded_model, 'lr_scheduler') - - saved_scheduler = saved_model.lr_scheduler - loaded_scheduler = loaded_model.lr_scheduler - - assert hasattr(saved_scheduler, 'state_dict') - assert hasattr(loaded_scheduler, 'state_dict') - - saved_sd = saved_scheduler.state_dict() - loaded_sd = loaded_scheduler.state_dict() - - print(f"saved_sd = {saved_sd}") - print(f"loaded_sd = {loaded_sd}") - - assert saved_sd.keys() == loaded_sd.keys() - - for state0, state1 in zip(saved_sd.values(), loaded_sd.values()): - if isinstance(state0, numbers.Number) and isinstance(state1, numbers.Number): - assert state0 == state1 - - -def create_deepspeed_model(args, model, base_optimizer): - if base_optimizer is None: - ds_model, _, _, _ = deepspeed.initialize(args=args, - model=model, - model_parameters=model.parameters()) - else: - ds_model, _, _, _ = deepspeed.initialize(args=args, - model=model, - optimizer=base_optimizer) - - return ds_model - - -def checkpoint_correctness_verification(args, - models, - hidden_dim, - tmpdir, - load_optimizer_states=False, - load_lr_scheduler_states=False, - fp16=True, - train_batch=False, - base_optimizers=[None, - None], - empty_tag=False, - seq_dataloader=False, - load_module_only=False): - dtype = torch.half if fp16 else torch.float32 - ds_model = create_deepspeed_model(args=args, - model=models[0], - base_optimizer=base_optimizers[0]) - - if seq_dataloader: - data_loader = sequence_dataloader(model=ds_model, - total_samples=50, - hidden_dim=hidden_dim, - device=ds_model.device, - dtype=dtype) - else: - data_loader = 
random_dataloader(model=ds_model, - total_samples=50, - hidden_dim=hidden_dim, - device=ds_model.device, - dtype=dtype) - - if train_batch: - ds_model.set_dataloader(data_loader) - for n, batch in enumerate(data_loader): - loss = ds_model.train_batch() - else: - for n, batch in enumerate(data_loader): - loss = ds_model(batch[0], batch[1]) - ds_model.backward(loss) - ds_model.step() - - trained_model = ds_model - - save_folder = os.path.join(tmpdir, 'saved_checkpoint') - save_tag = None if empty_tag else '1' - - trained_model.save_checkpoint(save_folder, tag=save_tag) - - dist.barrier() - - loaded_model = create_deepspeed_model(args=args, - model=models[1], - base_optimizer=base_optimizers[1]) - assert list(trained_model.parameters())[0].dtype == list( - loaded_model.parameters())[0].dtype - - loaded_model.load_checkpoint(save_folder, - tag=save_tag, - load_optimizer_states=load_optimizer_states, - load_lr_scheduler_states=load_lr_scheduler_states, - load_module_only=load_module_only) - - compare_model_states(trained_model, - loaded_model, - compare_optimizer=load_optimizer_states, - load_module_only=load_module_only) - - if load_optimizer_states: - compare_optimizer_states(trained_model, loaded_model, hidden_dim, fp16) - - if load_lr_scheduler_states: - compare_lr_scheduler_states(trained_model, loaded_model) - - -@pytest.mark.skipif(not deepspeed.ops.__compatible_ops__[FusedLambBuilder.NAME], - reason="lamb is not compatible") -def test_checkpoint_unfused_optimizer(tmpdir): - config_dict = { - "train_batch_size": 2, - "steps_per_print": 1, - "optimizer": { - "type": "Lamb", - "params": { - "lr": 0.00015 - } - }, - "gradient_clipping": 1.0, - "fp16": { - "enabled": True - }, - "scheduler": { - "type": "OneCycle", - "params": { - "cycle_first_step_size": 1000, - "cycle_first_stair_count": 500, - "cycle_second_step_size": 1000, - "cycle_second_stair_count": 500, - "decay_step_size": 1000, - "cycle_min_lr": 0.0001, - "cycle_max_lr": 0.0010, - "decay_lr_rate": 0.001, - "cycle_min_mom": 0.85, - "cycle_max_mom": 0.99, - "decay_mom_rate": 0.0 - } - } - } - - args = args_from_dict(tmpdir, config_dict) - hidden_dim = 10 - - models = [SimpleModel(hidden_dim, empty_grad=False) for _ in range(2)] - - @distributed_test(world_size=[2]) - def _test_checkpoint_unfused_optimizer(args, - models, - hidden_dim, - load_optimizer_states): - checkpoint_correctness_verification(args, - models=models, - hidden_dim=hidden_dim, - tmpdir=tmpdir, - load_optimizer_states=load_optimizer_states) - - _test_checkpoint_unfused_optimizer(args=args, - models=models, - hidden_dim=hidden_dim, - load_optimizer_states=True) - - _test_checkpoint_unfused_optimizer(args=args, - models=models, - hidden_dim=hidden_dim, - load_optimizer_states=False) - - -def test_checkpoint_fused_optimizer(tmpdir): - config_dict = { - "train_batch_size": 2, - "steps_per_print": 1, - "optimizer": { - "type": "Adam", - "params": { - "lr": 0.00015, - "betas": [0.8, - 0.999], - "eps": 1e-8, - "weight_decay": 3e-7 - } - }, - "fp16": { - "enabled": True - } - } - - args = args_from_dict(tmpdir, config_dict) - hidden_dim = 10 - - models = [SimpleModel(hidden_dim, empty_grad=False) for _ in range(2)] - - @distributed_test(world_size=[2]) - def _test_checkpoint_fused_optimizer(args, - models, - hidden_dim, - load_optimizer_states): - checkpoint_correctness_verification(args, - models=models, - hidden_dim=hidden_dim, - tmpdir=tmpdir, - load_optimizer_states=load_optimizer_states) - - _test_checkpoint_fused_optimizer(args=args, - models=models, - 
hidden_dim=hidden_dim, - load_optimizer_states=True) - - _test_checkpoint_fused_optimizer(args=args, - models=models, - hidden_dim=hidden_dim, - load_optimizer_states=False) - - -@pytest.mark.parametrize('zero_stage, use_cpu_offload, adam_optimizer', - [(1, - False, - 'Adam'), - (2, - False, - 'Adam'), - (2, - True, - 'deepspeed_adam'), - (3, - False, - 'Adam'), - (3, - True, - 'deepspeed_adam')]) -def test_checkpoint_zero_optimizer(tmpdir, zero_stage, use_cpu_offload, adam_optimizer): - if use_cpu_offload and not deepspeed.ops.__compatible_ops__[CPUAdamBuilder.NAME]: - pytest.skip("cpu-adam is not compatible") - - config_dict = { - "train_batch_size": 2, - "steps_per_print": 1, - "optimizer": { - "type": 'Adam', - "params": { - "lr": 0.00015, - "betas": [0.8, - 0.999], - "eps": 1e-8, - "weight_decay": 3e-7 - } - }, - "fp16": { - "enabled": True, - "initial_scale_power": 8 - }, - "wall_clock_breakdown": True, - "zero_optimization": { - "stage": zero_stage, - "cpu_offload": use_cpu_offload - } - } - args = args_from_dict(tmpdir, config_dict) - hidden_dim = 10 - - @distributed_test(world_size=[2]) - def _test_checkpoint_zero_optimizer(args, - zero_stage, - hidden_dim, - load_optimizer_states): - if zero_stage == 3: - with deepspeed.zero.Init(): - models = [SimpleModel(hidden_dim, empty_grad=False) for _ in range(2)] - else: - models = [SimpleModel(hidden_dim, empty_grad=False) for _ in range(2)] - - checkpoint_correctness_verification(args, - models, - hidden_dim, - tmpdir, - load_optimizer_states=load_optimizer_states) - - _test_checkpoint_zero_optimizer(args=args, - zero_stage=zero_stage, - hidden_dim=hidden_dim, - load_optimizer_states=True) - - -@pytest.mark.parametrize('zero_stage, use_cpu_offload, adam_optimizer', - [(1, - False, - "Adam"), - (2, - False, - "Adam"), - (2, - True, - 'deepspeed_adam'), - (3, - False, - 'Adam'), - (3, - True, - 'deepspeed_adam')]) -def test_checkpoint_zero_no_optimizer(tmpdir, - zero_stage, - use_cpu_offload, - adam_optimizer): - if use_cpu_offload and not deepspeed.ops.__compatible_ops__[CPUAdamBuilder.NAME]: - pytest.skip("cpu-adam is not compatible") - - config_dict = { - "train_batch_size": 2, - "steps_per_print": 1, - "optimizer": { - "type": 'Adam', - "params": { - "lr": 0.00015, - "betas": [0.8, - 0.999], - "eps": 1e-8, - "weight_decay": 3e-7 - } - }, - "fp16": { - "enabled": True - }, - "zero_optimization": { - "stage": zero_stage, - "cpu_offload": use_cpu_offload - } - } - args = args_from_dict(tmpdir, config_dict) - hidden_dim = 10 - - @distributed_test(world_size=[1]) - def _test_checkpoint_zero_no_optimizer(args, - zero_stage, - hidden_dim, - load_optimizer_states): - if zero_stage == 3: - global DeepSpeedZeroOptimizer_Stage3 - from deepspeed.runtime.zero.stage3 import DeepSpeedZeroOptimizer_Stage3 - with deepspeed.zero.Init(): - models = [SimpleModel(hidden_dim, empty_grad=False) for _ in range(2)] - else: - models = [SimpleModel(hidden_dim, empty_grad=False) for _ in range(2)] - - checkpoint_correctness_verification(args, - models, - hidden_dim, - tmpdir, - load_optimizer_states=load_optimizer_states) - - _test_checkpoint_zero_no_optimizer(args=args, - zero_stage=zero_stage, - hidden_dim=hidden_dim, - load_optimizer_states=False) - - -@pytest.mark.parametrize('zero_stage, use_cpu_offload, adam_optimizer', - [(0, - False, - 'Adam'), - (1, - False, - 'Adam'), - (2, - False, - 'Adam'), - (2, - True, - 'deepspeed_adam'), - (3, - False, - 'Adam'), - (3, - True, - 'deepspeed_adam')]) -def test_checkpoint_lr_scheduler(tmpdir, zero_stage, 
use_cpu_offload, adam_optimizer): - if use_cpu_offload and not deepspeed.ops.__compatible_ops__[CPUAdamBuilder.NAME]: - pytest.skip("cpu-adam is not compatible") - - config_dict = { - "train_batch_size": 2, - "steps_per_print": 1, - "optimizer": { - "type": 'Adam', - "params": { - "lr": 0.00015, - "betas": [0.8, - 0.999], - "eps": 1e-8, - "weight_decay": 3e-7 - } - }, - "fp16": { - "enabled": True - }, - "zero_optimization": { - "stage": zero_stage, - "cpu_offload": use_cpu_offload - }, - "scheduler": { - "type": "WarmupLR", - "params": { - "warmup_min_lr": 0, - "warmup_max_lr": 0.001, - "warmup_num_steps": 1000 - } - } - } - args = args_from_dict(tmpdir, config_dict) - hidden_dim = 10 - - @distributed_test(world_size=[2]) - def _test_checkpoint_lr_scheduler(args, - zero_stage, - hidden_dim, - load_optimizer_states, - load_lr_scheduler_states): - if zero_stage == 3: - global DeepSpeedZeroOptimizer_Stage3 - from deepspeed.runtime.zero.stage3 import DeepSpeedZeroOptimizer_Stage3 - with deepspeed.zero.Init(): - models = [SimpleModel(hidden_dim, empty_grad=False) for _ in range(2)] - else: - models = [SimpleModel(hidden_dim, empty_grad=False) for _ in range(2)] - - checkpoint_correctness_verification( - args, - models, - hidden_dim, - tmpdir, - load_optimizer_states=load_optimizer_states, - load_lr_scheduler_states=load_lr_scheduler_states) - - _test_checkpoint_lr_scheduler(args=args, - zero_stage=zero_stage, - hidden_dim=hidden_dim, - load_optimizer_states=False, - load_lr_scheduler_states=True) - - -@pytest.mark.parametrize('zero_stage, use_cpu_offload, adam_optimizer', - [(0, - False, - 'Adam'), - (1, - False, - 'Adam'), - (2, - False, - 'Adam'), - (2, - True, - 'deepspeed_adam'), - (3, - False, - 'Adam'), - (3, - True, - 'deepspeed_adam')]) -def test_checkpoint_no_lr_scheduler(tmpdir, zero_stage, use_cpu_offload, adam_optimizer): - if use_cpu_offload and not deepspeed.ops.__compatible_ops__[CPUAdamBuilder.NAME]: - pytest.skip("cpu-adam is not compatible") - - config_dict = { - "train_batch_size": 2, - "steps_per_print": 1, - "optimizer": { - "type": 'Adam', - "params": { - "lr": 1e-5 - } - }, - "fp16": { - "enabled": True - }, - "zero_optimization": { - "stage": zero_stage, - "cpu_offload": use_cpu_offload - }, - "scheduler": { - "type": "WarmupLR", - "params": { - "warmup_min_lr": 0, - "warmup_max_lr": 0.001, - "warmup_num_steps": 1000 - } - }, - } - args = args_from_dict(tmpdir, config_dict) - hidden_dim = 10 - - @distributed_test(world_size=[2]) - def _test_checkpoint_no_lr_scheduler(args, - zero_stage, - hidden_dim, - load_optimizer_states, - load_lr_scheduler_states): - if zero_stage == 3: - with deepspeed.zero.Init(): - models = [SimpleModel(hidden_dim, empty_grad=False) for _ in range(2)] - else: - models = [SimpleModel(hidden_dim, empty_grad=False) for _ in range(2)] - - checkpoint_correctness_verification( - args, - models, - hidden_dim, - tmpdir, - load_optimizer_states=load_optimizer_states, - load_lr_scheduler_states=load_lr_scheduler_states) - - _test_checkpoint_no_lr_scheduler(args=args, - zero_stage=zero_stage, - hidden_dim=hidden_dim, - load_optimizer_states=False, - load_lr_scheduler_states=False) - - -def test_checkpoint_fp32_optimizer(tmpdir): - config_dict = { - "train_batch_size": 2, - "steps_per_print": 1, - "optimizer": { - "type": "Adam", - "params": { - "lr": 0.00015, - "betas": [0.8, - 0.999], - "eps": 1e-8, - "weight_decay": 3e-7 - } - }, - "fp16": { - "enabled": False - } - } - - args = args_from_dict(tmpdir, config_dict) - hidden_dim = 10 - - models = 
[SimpleModel(hidden_dim, empty_grad=False) for _ in range(2)] - - @distributed_test(world_size=[2]) - def _test_checkpoint_fp32_optimizer(args, models, hidden_dim): - checkpoint_correctness_verification(args, - models=models, - hidden_dim=hidden_dim, - tmpdir=tmpdir, - fp16=False) - - _test_checkpoint_fp32_optimizer(args=args, models=models, hidden_dim=hidden_dim) - - -@pytest.mark.parametrize("zero_stage", [0, 1]) -def test_checkpoint_pipe_engine(zero_stage, tmpdir, stages=2): - config_dict = { - "train_batch_size": 2, - "train_micro_batch_size_per_gpu": 1, - "steps_per_print": 1, - "optimizer": { - "type": "Adam", - "params": { - "lr": 1e-5 - } - }, - "zero_optimization": { - "stage": zero_stage - }, - "fp16": { - "enabled": zero_stage > 0 - }, - "scheduler": { - "type": "OneCycle", - "params": { - "cycle_first_step_size": 1000, - "cycle_first_stair_count": 500, - "cycle_second_step_size": 1000, - "cycle_second_stair_count": 500, - "decay_step_size": 1000, - "cycle_min_lr": 0.0001, - "cycle_max_lr": 0.0010, - "decay_lr_rate": 0.001, - "cycle_min_mom": 0.85, - "cycle_max_mom": 0.99, - "decay_mom_rate": 0.0 - } - } - } - - @distributed_test(world_size=4) - def _test(save_folder, num_stages): - args = args_from_dict(tmpdir, config_dict) - models = [LinearStackPipe(num_stages=num_stages) for _ in range(2)] - checkpoint_correctness_verification(args=args, - models=models, - hidden_dim=models[0].hidden_dim, - tmpdir=save_folder, - fp16=config_dict['fp16']['enabled'], - load_optimizer_states=True, - load_lr_scheduler_states=True, - train_batch=True) - - _test(tmpdir, num_stages=stages) - - -@pytest.mark.parametrize( - "base_topo,test_topo", - [ - #(PipeTopo(num_pp=1, - # num_dp=4), - # PipeTopo(num_pp=4, - # num_dp=1)), - #(PipeTopo(num_pp=2, - # num_dp=2), - # PipeTopo(num_pp=2, - # num_dp=2)), - #(PipeTopo(num_pp=4, - # num_dp=1), - # PipeTopo(num_pp=2, - # num_dp=2)), - ]) -def test_checkpoint_pipe_module(base_topo, test_topo, tmpdir): - @distributed_test(world_size=4) - def _test(base_topo, test_topo, save_folder): - base_model = LinearStackPipe(topology=base_topo) - base_model.save_state_dict(save_folder) - - dist.barrier() - - test_model = LinearStackPipe(topology=test_topo) - test_model.load_state_dir(save_folder) - - # Base and test can have different lengths, so make sure we map from the - # smaller to larger model - if len(base_model.forward_funcs) < len(test_model.forward_funcs): - A = base_model - B = test_model - else: - A = test_model - B = base_model - - # Compare layers individually since partitions are different - for idx, A_layer in enumerate(A.forward_funcs): - if not hasattr(A_layer, 'parameters'): - # Skip functionals, etc. 
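Editorial note: the deleted checkpoint tests above all follow one save-then-load pattern: build an engine with deepspeed.initialize, run a step or two, call save_checkpoint, then load_checkpoint and compare states. A minimal sketch of that round trip is below. It is illustrative only: it assumes a CUDA device and a process group set up by the `deepspeed` launcher, and TinyNet, the config values, and the checkpoint path are hypothetical stand-ins, not taken from the deleted files.

```python
# Sketch only: save/load round trip with a DeepSpeed engine.
# Assumes launch via `deepspeed this_script.py`; TinyNet and paths are hypothetical.
import torch
import deepspeed


class TinyNet(torch.nn.Module):
    def __init__(self, hidden_dim=10):
        super().__init__()
        self.linear = torch.nn.Linear(hidden_dim, hidden_dim)

    def forward(self, x):
        return self.linear(x)


config = {
    "train_batch_size": 2,
    "optimizer": {"type": "Adam", "params": {"lr": 1.5e-4}},
}

net = TinyNet()
engine, _, _, _ = deepspeed.initialize(model=net,
                                       model_parameters=net.parameters(),
                                       config=config)

# One dummy step so the optimizer has state worth checkpointing.
loss = engine(torch.randn(2, 10).to(engine.device)).sum()
engine.backward(loss)
engine.step()

engine.save_checkpoint("/tmp/ds_ckpt", tag="1")
engine.load_checkpoint("/tmp/ds_ckpt",
                       tag="1",
                       load_optimizer_states=True,
                       load_lr_scheduler_states=False)
```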
- continue - - # Find the corresponding layer in B - global_idx = idx + A._local_start - B_local_idx = global_idx - B._local_start - B_layer = B.forward_funcs[B_local_idx] - - # Compare layer parameters - for p0, p1 in zip(A_layer.parameters(), B_layer.parameters()): - assert torch.allclose(p0, p1, atol=1e-07), f"Model state {p0} is not equal to {p1}" - - _test(base_topo, test_topo, save_folder=tmpdir) - - -@pytest.mark.parametrize('zero_stage', [1, 2]) -def test_checkpoint_zero_hybrid_optimizer_state(tmpdir, zero_stage): - config_dict = { - "train_micro_batch_size_per_gpu": 2, - "gradient_accumulation_steps": 2, - "steps_per_print": 1, - "zero_optimization": { - "stage": zero_stage - }, - "zero_allow_untested_optimizer": True, - "fp16": { - "enabled": True, - "initial_scale_power": 8 - } - } - - args = args_from_dict(tmpdir, config_dict) - hidden_dim = 10 - models = [SimpleModel(hidden_dim=hidden_dim) for _ in range(2)] - optimizers = [HybridStateOptimizer(model.parameters()) for model in models] - - @distributed_test(world_size=[2]) - def _test_checkpoint_zero_hybrid_optimizer_state(args, - models, - optimizers, - hidden_dim): - checkpoint_correctness_verification(args, - models=models, - base_optimizers=optimizers, - hidden_dim=hidden_dim, - tmpdir=tmpdir, - load_optimizer_states=True) - - _test_checkpoint_zero_hybrid_optimizer_state(args=args, - models=models, - optimizers=optimizers, - hidden_dim=hidden_dim) - - -def test_checkpoint_latest(tmpdir): - config_dict = { - "train_batch_size": 2, - "steps_per_print": 1, - "optimizer": { - "type": "Adam", - "params": { - "lr": 0.00015 - } - } - } - hidden_dim = 10 - args = args_from_dict(tmpdir, config_dict) - models = [SimpleModel(hidden_dim=hidden_dim) for _ in range(2)] - - @distributed_test(world_size=[1]) - def _helper(args, models): - checkpoint_correctness_verification(args, - models=models, - hidden_dim=hidden_dim, - tmpdir=tmpdir, - load_optimizer_states=True, - load_lr_scheduler_states=False, - fp16=False, - empty_tag=True) - - _helper(args, models) - - -def test_checkpoint_missing_latest(tmpdir): - config_dict = { - "train_batch_size": 2, - "steps_per_print": 1, - "optimizer": { - "type": "Adam", - "params": { - "lr": 0.00015 - } - } - } - hidden_dim = 10 - args = args_from_dict(tmpdir, config_dict) - - model = SimpleModel(hidden_dim) - - @distributed_test(world_size=[1]) - def _helper(args, model, hidden_dim): - model, _, _,_ = deepspeed.initialize(args=args, - model=model, - model_parameters=model.parameters()) - # should be no-op, since latest doesn't exist - model.load_checkpoint(tmpdir) - - _helper(args=args, model=model, hidden_dim=hidden_dim) - - -@pytest.mark.parametrize('valid_mode', ["FAIL", "WARN", "IGNORE"]) -def test_checkpoint_unique_tag(tmpdir, valid_mode): - config_dict = { - "train_batch_size": 2, - "steps_per_print": 1, - "optimizer": { - "type": "Adam", - "params": { - "lr": 0.00015 - } - }, - "checkpoint": { - "tag_validation": valid_mode - } - } - hidden_dim = 10 - args = args_from_dict(tmpdir, config_dict) - - model = SimpleModel(hidden_dim) - - @distributed_test(world_size=[2]) - def _helper(args, model, hidden_dim): - model, _, _,_ = deepspeed.initialize(args=args, - model=model, - model_parameters=model.parameters()) - if valid_mode == "FAIL": - with pytest.raises(AssertionError): - model.save_checkpoint(save_dir=tmpdir, - tag=f"tag-{torch.distributed.get_rank()}") - else: - model.save_checkpoint(save_dir=tmpdir, - tag=f"tag-{torch.distributed.get_rank()}") - - _helper(args=args, model=model, 
hidden_dim=hidden_dim) - - -def test_checkpoint_unknown_tag_validation(tmpdir): - config_dict = { - "train_batch_size": 2, - "steps_per_print": 1, - "optimizer": { - "type": "Adam", - "params": { - "lr": 0.00015 - } - }, - "checkpoint": { - "tag_validation": "foo" - } - } - hidden_dim = 10 - args = args_from_dict(tmpdir, config_dict) - - model = SimpleModel(hidden_dim) - - @distributed_test(world_size=[1]) - def _helper(args, model, hidden_dim): - with pytest.raises(deepspeed.DeepSpeedConfigError): - model, _, _,_ = deepspeed.initialize(args=args, - model=model, - model_parameters=model.parameters()) - - _helper(args=args, model=model, hidden_dim=hidden_dim) - - -@pytest.mark.parametrize("ep_size", [4]) -def test_checkpoint_moe(tmpdir, ep_size): - if not required_torch_version(): - pytest.skip("DeepSpeed MoE tests need torch 1.8 or higher to run correctly") - - config_dict = { - "train_batch_size": 8, - "steps_per_print": 1, - "fp16": { - "enabled": True - } - } - hidden_dim = 16 - args = args_from_dict(tmpdir, config_dict) - - @distributed_test(world_size=[4]) - def _helper(args): - models = [ - SimpleMoEModel(hidden_dim=hidden_dim, - num_experts=ep_size, - ep_size=ep_size) for _ in range(2) - ] - optimizers = [torch.optim.AdamW(params=model.parameters()) for model in models] - checkpoint_correctness_verification(args, - models=models, - hidden_dim=hidden_dim, - tmpdir=tmpdir, - load_optimizer_states=True, - load_lr_scheduler_states=False, - fp16=config_dict["fp16"]["enabled"], - empty_tag=True, - base_optimizers=optimizers, - seq_dataloader=True) - - _helper(args) - - -@pytest.mark.parametrize("ep_size, load_optim_states", - [(4, - True), - (4, - False), - (2, - True), - (2, - False)]) -def test_checkpoint_moe_and_zero(tmpdir, ep_size, load_optim_states): - if not required_torch_version(): - pytest.skip("DeepSpeed MoE tests need torch 1.8 or higher to run correctly") - - config_dict = { - "train_batch_size": 8, - "steps_per_print": 1, - "optimizer": { - "type": 'Adam', - "params": { - "lr": 0.00015, - "betas": [0.8, - 0.999], - "eps": 1e-8, - "weight_decay": 3e-7 - } - }, - "fp16": { - "enabled": True, - "initial_scale_power": 8 - }, - "zero_optimization": { - "stage": 2, - } - } - hidden_dim = 16 - args = args_from_dict(tmpdir, config_dict) - - def create_param_groups(model): - # param group must have a random unique name (for now) - # TODO: clean-up this requirement, the unique name should not be required here - return {'params': [p for p in model.parameters()], 'name': 'random-unique-name'} - - @distributed_test(world_size=[4]) - def _helper(args): - models = [ - SimpleMoEModel(hidden_dim=hidden_dim, - num_experts=ep_size, - ep_size=ep_size) for _ in range(2) - ] - params = [ - split_params_into_different_moe_groups_for_optimizer( - create_param_groups(model)) for model in models - ] - optimizers = [torch.optim.AdamW(params=param) for param in params] - checkpoint_correctness_verification(args, - models=models, - hidden_dim=hidden_dim, - tmpdir=tmpdir, - load_optimizer_states=load_optim_states, - load_lr_scheduler_states=False, - fp16=config_dict["fp16"]["enabled"], - empty_tag=True, - base_optimizers=optimizers, - seq_dataloader=True) - - _helper(args) - - -@pytest.mark.parametrize('zero_stage', [0, 1, 2, 3]) -def test_checkpoint_load_module_only(tmpdir, zero_stage): - config_dict = { - "train_batch_size": 2, - "optimizer": { - "type": 'Adam' - }, - "fp16": { - "enabled": True, - "initial_scale_power": 8 - }, - "zero_optimization": { - "stage": zero_stage, - } - } - args = 
args_from_dict(tmpdir, config_dict) - hidden_dim = 10 - - @distributed_test(world_size=[2]) - def _go(args, zero_stage, hidden_dim): - if zero_stage == 3: - with deepspeed.zero.Init(): - models = [SimpleModel(hidden_dim, empty_grad=False) for _ in range(2)] - else: - models = [SimpleModel(hidden_dim, empty_grad=False) for _ in range(2)] - - checkpoint_correctness_verification(args, - models, - hidden_dim, - tmpdir, - load_module_only=True) - - _go(args, zero_stage, hidden_dim) - - -@pytest.mark.parametrize(["to_save_model_has_embedding", - "to_save_model_sparse"], - [ - [False, - False], - [True, - False], - [True, - True], - ]) -@pytest.mark.parametrize(["destination_has_embedding", - "destination_sparse"], - [ - [False, - False], - [True, - False], - [True, - True], - ]) -def test_non_strict_load_sparse(tmpdir, - to_save_model_has_embedding, - to_save_model_sparse, - destination_has_embedding, - destination_sparse): - config_dict = {"train_batch_size": 2} - - class ModelNoEmbedding(torch.nn.Module): - def __init__(self): - super().__init__() - self.linear = torch.nn.Linear(3, 1) - - def forward(self, x): - return self.linear(x) - - class ModelEmbedding(torch.nn.Module): - def __init__(self): - super().__init__() - self.emb = torch.nn.Embedding(10, 3) - self.linear = torch.nn.Linear(3, 1) - - def forward(self, x, offsets): - return self.linear(self.emb(x, offsets)) - - @distributed_test(world_size=[2]) - def _test(model_to_save, model_destination): - engine_to_save, _, _, _ = deepspeed.initialize( - model=model_to_save, config={"train_batch_size": 2, "sparse_gradients": to_save_model_sparse} - ) - engine_destination, _, _, _ = deepspeed.initialize( - model=model_destination, config={"train_batch_size": 2, "sparse_gradients": destination_sparse} - ) - - save_folder = os.path.join(tmpdir, 'saved_checkpoint') - save_tag = '1' - - engine_to_save.save_checkpoint(save_folder, tag=save_tag) - - is_sparse_destination = isinstance(model_destination, - ModelEmbedding) and destination_sparse - if isinstance(model_destination, - ModelEmbedding) and model_destination.emb.sparse: - assert "emb.weight" in engine_destination.sparse_tensor_module_names - engine_destination.load_checkpoint(save_folder, - tag=save_tag, - load_module_strict=False, - load_optimizer_states=False, - load_lr_scheduler_states=False, - load_module_only=False) - if isinstance(model_destination, - ModelEmbedding) and isinstance(model_to_save, - ModelEmbedding): - assert engine_destination.sparse_tensor_module_names == engine_to_save.sparse_tensor_module_names - elif isinstance(model_destination, ModelEmbedding): - assert not is_sparse_destination or "emb.weight" in engine_destination.sparse_tensor_module_names - else: - assert len(engine_destination.sparse_tensor_module_names) == 0 - - if to_save_model_has_embedding: - model_to_save = ModelEmbedding() - else: - model_to_save = ModelNoEmbedding() - if destination_has_embedding: - model_destination = ModelEmbedding() - else: - model_destination = ModelNoEmbedding() - _test(model_to_save, model_destination) - - -@pytest.mark.parametrize(["elastic_save", - "elastic_load", - "load_optim"], - itertools.product(*[[True, - False], - [True, - False], - [True, - False]])) -def test_checkpoint_zero_elastic(tmpdir, elastic_save, elastic_load, load_optim): - ds_config = { - "train_batch_size": 2, - "optimizer": { - "type": 'Adam' - }, - "fp16": { - "enabled": True, - "initial_scale_power": 8 - }, - "zero_optimization": { - "stage": 2, - "elastic_checkpoint": elastic_save - } - } - hidden_dim = 
10 - - @distributed_test(world_size=[2]) - def _go(): - models = [SimpleModel(hidden_dim) for _ in range(2)] - model, _, _, _ = deepspeed.initialize(config=ds_config, - model=models[0], - model_parameters=models[0].parameters()) - data_loader = random_dataloader(model=model, - total_samples=8, - hidden_dim=hidden_dim, - device=model.device) - for n, batch in enumerate(data_loader): - loss = model(batch[0], batch[1]) - model.backward(loss) - model.step() - model.save_checkpoint(tmpdir) - - ds_config["zero_optimization"]["elastic_checkpoint"] = elastic_load - model, _, _, _ = deepspeed.initialize(config=ds_config, - model=models[1], - model_parameters=models[1].parameters()) - model.load_checkpoint(tmpdir, load_optimizer_states=load_optim) - data_loader = random_dataloader(model=model, - total_samples=8, - hidden_dim=hidden_dim, - device=model.device) - for n, batch in enumerate(data_loader): - loss = model(batch[0], batch[1]) - model.backward(loss) - model.step() - - _go() - - -@pytest.mark.parametrize(["elastic_save", - "elastic_load", - "load_optim"], - itertools.product(*[[True, - False], - [True, - False], - [True, - False]])) -def test_checkpoint_zero_elastic_dp_change(tmpdir, - elastic_save, - elastic_load, - load_optim): - ds_config = { - "train_batch_size": 4, - "optimizer": { - "type": 'Adam' - }, - "fp16": { - "enabled": True, - "initial_scale_power": 8 - }, - "zero_optimization": { - "stage": 2, - "elastic_checkpoint": elastic_save - } - } - hidden_dim = 10 - models = [SimpleModel(hidden_dim) for _ in range(2)] - - @distributed_test(world_size=[4]) - def _go2(models): - model, _, _, _ = deepspeed.initialize(config=ds_config, - model=models[0], - model_parameters=models[0].parameters()) - data_loader = random_dataloader(model=model, - total_samples=8, - hidden_dim=hidden_dim, - device=model.device) - for n, batch in enumerate(data_loader): - loss = model(batch[0], batch[1]) - model.backward(loss) - model.step() - model.save_checkpoint(tmpdir) - - _go2(models) - - @distributed_test(world_size=[2]) - def _go1(models): - ds_config["zero_optimization"]["elastic_checkpoint"] = elastic_load - model, _, _, _ = deepspeed.initialize(config=ds_config, - model=models[1], - model_parameters=models[1].parameters()) - if load_optim: - with pytest.raises(deepspeed.runtime.zero.utils.ZeRORuntimeException): - model.load_checkpoint(tmpdir, load_optimizer_states=load_optim) - else: - model.load_checkpoint(tmpdir, load_optimizer_states=load_optim) - - _go1(models) - - -@pytest.mark.parametrize('zero_stage', [0, 1, 2, 3]) -def test_immediate_save_load(tmpdir, zero_stage): - config_dict = { - "train_batch_size": 4, - "optimizer": { - "type": 'Adam' - }, - "fp16": { - "enabled": True, - "initial_scale_power": 8 - }, - "zero_optimization": { - "stage": zero_stage, - } - } - hidden_dim = 10 - model = SimpleModel(hidden_dim) - args = args_from_dict(tmpdir, config_dict) - - @distributed_test(world_size=[1]) - def _test_immediate_save_load(args, model, tmpdir): - - ds_model = create_deepspeed_model(args=args, model=model, base_optimizer=None) - ds_model.save_checkpoint(tmpdir) - ds_model.load_checkpoint(tmpdir, - load_optimizer_states=False, - load_lr_scheduler_states=False, - load_module_only=False) - - _test_immediate_save_load(args, model, tmpdir) - - -@pytest.mark.parametrize('zero_stage', [0, 1, 2, 3]) -def test_load_immediate_save(tmpdir, zero_stage): - config_dict = { - "train_batch_size": 4, - "optimizer": { - "type": 'Adam' - }, - "fp16": { - "enabled": True, - "initial_scale_power": 8 - }, - 
"zero_optimization": { - "stage": zero_stage, - } - } - hidden_dim = 10 - model = SimpleModel(hidden_dim) - args = args_from_dict(tmpdir, config_dict) - - @distributed_test(world_size=[1]) - def _test_load_immediate_save(args, model, tmpdir): - - # 1. pretrain a model and save it - dtype = torch.half - ds_model = create_deepspeed_model(args=args, model=model, base_optimizer=None) - data_loader = random_dataloader(model=ds_model, - total_samples=1, - hidden_dim=hidden_dim, - device=ds_model.device, - dtype=dtype) - for n, batch in enumerate(data_loader): - loss = ds_model(batch[0], batch[1]) - ds_model.backward(loss) - ds_model.step() - ds_model.save_checkpoint(tmpdir) - - # 2. load and immediately save a model with a fresh ds engine - ds_model = create_deepspeed_model(args=args, model=model, base_optimizer=None) - ds_model.load_checkpoint(tmpdir, - load_optimizer_states=False, - load_lr_scheduler_states=False, - load_module_only=False) - ds_model.save_checkpoint(tmpdir) - - _test_load_immediate_save(args, model, tmpdir) - - -@pytest.mark.parametrize('zero_stage', [0, 1, 2, 3]) -def test_save_before_accum_grad_is_done(tmpdir, zero_stage): - config_dict = { - "train_batch_size": 4, - "optimizer": { - "type": 'Adam' - }, - "fp16": { - "enabled": True, - "initial_scale_power": 8 - }, - "zero_optimization": { - "stage": zero_stage, - "stage3_gather_fp16_weights_on_model_save": True, - }, - "gradient_accumulation_steps": 2, - "train_micro_batch_size_per_gpu": 1, - "train_batch_size": 2, - } - hidden_dim = 10 - model = SimpleModel(hidden_dim) - args = args_from_dict(tmpdir, config_dict) - - @distributed_test(world_size=[1]) - def _test_save_before_accum_grad_is_done(args, model, tmpdir): - - # This test reproduces a bug where one tries to retrieve a 16bit model before grad_accum - # cycle was completed. 
- # So we config grad_accum=2 and step only once and save_16bit_model - ds_model = create_deepspeed_model(args=args, model=model, base_optimizer=None) - - data_loader = random_dataloader(model=ds_model, - total_samples=2, - hidden_dim=hidden_dim, - device=ds_model.device, - dtype=torch.half) - - batch = next(iter(data_loader)) - loss = ds_model(batch[0], batch[1]) - ds_model.backward(loss) - ds_model.step() - - # we stepped only once, and now save 16bit model before gradient_accumulation_steps=2 is complete - ds_model.save_16bit_model(tmpdir, "model.pt") - - # let's test just as well that we can save the checkpoint too - ds_model.save_checkpoint(tmpdir) - - _test_save_before_accum_grad_is_done(args, model, tmpdir) diff --git a/tests/unit/test_coalesced_collectives.py b/tests/unit/test_coalesced_collectives.py deleted file mode 100644 index fb6b5354a1589fc156aacdaf99fb192e3dc95324..0000000000000000000000000000000000000000 --- a/tests/unit/test_coalesced_collectives.py +++ /dev/null @@ -1,62 +0,0 @@ -"""unit tests for coalesced collectives""" - -import pytest - -import torch -import torch.distributed as dist -from deepspeed.runtime.comm.coalesced_collectives import reduce_scatter_coalesced - -from .common import distributed_test - - -@distributed_test(world_size=2) -def test_reduce_scatter_coalesced_single_input(): - input = torch.full((6, - ), - dist.get_rank(), - dtype=torch.half, - device=torch.cuda.current_device()) - - (output, ) = reduce_scatter_coalesced([input], dist.group.WORLD) - - assert output.shape == (3, ) - assert torch.allclose(output, torch.full_like(output, 0.5)) - - -@distributed_test(world_size=2) -def test_reduce_scatter_coalesced_two_inputs(): - tensor_kwargs = {"device": torch.cuda.current_device(), "dtype": torch.half} - inputs = [ - dist.get_rank() * torch.arange(0, - 6, - **tensor_kwargs), - dist.get_rank() * torch.arange(6, - 9, - **tensor_kwargs), - ] - - output1, output2 = reduce_scatter_coalesced(inputs, dist.group.WORLD) - - if dist.get_rank() == 0: - assert output1.shape == (3, ) - assert torch.allclose(output1, torch.arange(0, 3, **tensor_kwargs) / 2) - assert output2.shape == (2, ) - assert torch.allclose(output2, torch.arange(6, 8, **tensor_kwargs) / 2) - elif dist.get_rank() == 1: - assert output1.shape == (3, ) - assert torch.allclose(output1, torch.arange(3, 6, **tensor_kwargs) / 2) - assert output2.shape == (1, ) - assert torch.allclose(output2, torch.arange(8, 9, **tensor_kwargs) / 2) - - -@distributed_test(world_size=2) -def test_reduce_scatter_coalesced_tensor_smaller_than_world_sz(): - input = torch.zeros((1, ), dtype=torch.half, device=torch.cuda.current_device()) - - (output, ) = reduce_scatter_coalesced([input], dist.group.WORLD) - - if dist.get_rank() == 0: - assert output.shape == (1, ) - assert torch.allclose(output, torch.zeros_like(output)) - elif dist.get_rank() == 1: - assert output.shape == (0, ) diff --git a/tests/unit/test_config.py b/tests/unit/test_config.py deleted file mode 100644 index a88cb2931d95efe76a499681326293ad682198ad..0000000000000000000000000000000000000000 --- a/tests/unit/test_config.py +++ /dev/null @@ -1,337 +0,0 @@ -# A test on its own -import torch -import pytest -import json -import argparse - -from deepspeed.runtime.zero.config import DeepSpeedZeroConfig - -from .common import distributed_test, get_test_path -from .simple_model import SimpleModel, create_config_from_dict, random_dataloader -import torch.distributed as dist - -# A test on its own -import deepspeed -from deepspeed.runtime.config import 
DeepSpeedConfig, get_bfloat16_enabled - - -def test_cuda(): - assert (torch.cuda.is_available()) - - -def test_check_version(): - assert hasattr(deepspeed, "__git_hash__") - assert hasattr(deepspeed, "__git_branch__") - assert hasattr(deepspeed, "__version__") - assert hasattr(deepspeed, "__version_major__") - assert hasattr(deepspeed, "__version_minor__") - assert hasattr(deepspeed, "__version_patch__") - - -def _run_batch_config(ds_config, train_batch=None, micro_batch=None, gas=None): - ds_config.train_batch_size = train_batch - ds_config.train_micro_batch_size_per_gpu = micro_batch - ds_config.gradient_accumulation_steps = gas - success = True - try: - ds_config._configure_train_batch_size() - except AssertionError: - success = False - return success - - -def _batch_assert(status, ds_config, batch, micro_batch, gas, success): - - if not success: - assert not status - print("Failed but All is well") - return - - assert ds_config.train_batch_size == batch - assert ds_config.train_micro_batch_size_per_gpu == micro_batch - assert ds_config.gradient_accumulation_steps == gas - print("All is well") - - -#Tests different batch config provided in deepspeed json file -@pytest.mark.parametrize('num_ranks,batch,micro_batch,gas,success', - [(2,32,16,1,True), - (2,32,8,2,True), - (2,33,17,2,False), - (2,32,18,1,False)]) # yapf: disable -def test_batch_config(num_ranks, batch, micro_batch, gas, success): - @distributed_test(world_size=2) - def _test_batch_config(num_ranks, batch, micro_batch, gas, success): - assert dist.get_world_size() == num_ranks, \ - 'The test assumes a world size of f{num_ranks}' - - ds_batch_config = get_test_path('ds_batch_config.json') - ds_config = DeepSpeedConfig(ds_batch_config) - - #test cases when all parameters are provided - status = _run_batch_config(ds_config, - train_batch=batch, - micro_batch=micro_batch, - gas=gas) - _batch_assert(status, ds_config, batch, micro_batch, gas, success) - - #test cases when two out of three parameters are provided - status = _run_batch_config(ds_config, train_batch=batch, micro_batch=micro_batch) - _batch_assert(status, ds_config, batch, micro_batch, gas, success) - - if success: - #when gas is provided with one more parameter - status = _run_batch_config(ds_config, train_batch=batch, gas=gas) - _batch_assert(status, ds_config, batch, micro_batch, gas, success) - - status = _run_batch_config(ds_config, micro_batch=micro_batch, gas=gas) - _batch_assert(status, ds_config, batch, micro_batch, gas, success) - - #test the case when only micro_batch or train_batch is provided - if gas == 1: - status = _run_batch_config(ds_config, micro_batch=micro_batch) - _batch_assert(status, ds_config, batch, micro_batch, gas, success) - - status = _run_batch_config(ds_config, train_batch=batch) - _batch_assert(status, ds_config, batch, micro_batch, gas, success) - else: - #when only gas is provided - status = _run_batch_config(ds_config, gas=gas) - _batch_assert(status, ds_config, batch, micro_batch, gas, success) - - #when gas is provided with something else and gas does not divide batch - if gas != 1: - status = _run_batch_config(ds_config, train_batch=batch, gas=gas) - _batch_assert(status, ds_config, batch, micro_batch, gas, success) - - """Run batch config test """ - _test_batch_config(num_ranks, batch, micro_batch, gas, success) - - -def test_temp_config_json(tmpdir): - config_dict = { - "train_batch_size": 1, - } - config_path = create_config_from_dict(tmpdir, config_dict) - config_json = json.load(open(config_path, 'r')) - assert 
'train_batch_size' in config_json - - -@pytest.mark.parametrize("gather_weights_key", - [ - "stage3_gather_16bit_weights_on_model_save", - "stage3_gather_fp16_weights_on_model_save" - ]) -def test_gather_16bit_params_on_model_save(gather_weights_key): - config_dict = { - "zero_optimization": { - gather_weights_key: True, - }, - } - config = DeepSpeedZeroConfig(config_dict) - - assert config.gather_16bit_weights_on_model_save == True - - -@pytest.mark.parametrize("bf16_key", ["bf16", "bfloat16"]) -def test_get_bfloat16_enabled(bf16_key): - cfg = { - bf16_key: { - "enabled": True, - }, - } - assert get_bfloat16_enabled(cfg) == True - - -def test_deprecated_deepscale_config(tmpdir): - config_dict = { - "train_batch_size": 1, - "optimizer": { - "type": "Adam", - "params": { - "lr": 0.00015 - } - }, - "fp16": { - "enabled": True - } - } - - config_path = create_config_from_dict(tmpdir, config_dict) - parser = argparse.ArgumentParser() - args = parser.parse_args(args='') - args.deepscale_config = config_path - args.local_rank = 0 - - hidden_dim = 10 - - model = SimpleModel(hidden_dim) - - @distributed_test(world_size=[1]) - def _test_deprecated_deepscale_config(args, model, hidden_dim): - model, _, _,_ = deepspeed.initialize(args=args, - model=model, - model_parameters=model.parameters()) - data_loader = random_dataloader(model=model, - total_samples=5, - hidden_dim=hidden_dim, - device=model.device) - for n, batch in enumerate(data_loader): - loss = model(batch[0], batch[1]) - model.backward(loss) - model.step() - - _test_deprecated_deepscale_config(args=args, model=model, hidden_dim=hidden_dim) - - -def test_dist_init_true(tmpdir): - config_dict = { - "train_batch_size": 1, - "optimizer": { - "type": "Adam", - "params": { - "lr": 0.00015 - } - }, - "fp16": { - "enabled": True - } - } - - config_path = create_config_from_dict(tmpdir, config_dict) - parser = argparse.ArgumentParser() - args = parser.parse_args(args='') - args.deepscale_config = config_path - args.local_rank = 0 - - hidden_dim = 10 - - model = SimpleModel(hidden_dim) - - @distributed_test(world_size=[1]) - def _test_dist_init_true(args, model, hidden_dim): - model, _, _,_ = deepspeed.initialize(args=args, - model=model, - model_parameters=model.parameters(), - dist_init_required=True) - data_loader = random_dataloader(model=model, - total_samples=5, - hidden_dim=hidden_dim, - device=model.device) - for n, batch in enumerate(data_loader): - loss = model(batch[0], batch[1]) - model.backward(loss) - model.step() - - _test_dist_init_true(args=args, model=model, hidden_dim=hidden_dim) - - -def test_init_no_optimizer(tmpdir): - - config_dict = {"train_batch_size": 1, "fp16": {"enabled": True}} - config_path = create_config_from_dict(tmpdir, config_dict) - - @distributed_test(world_size=1) - def _helper(): - parser = argparse.ArgumentParser() - args = parser.parse_args(args='') - args.deepscale_config = config_path - args.local_rank = 0 - - hidden_dim = 10 - - model = SimpleModel(hidden_dim=hidden_dim) - - model, _, _, _ = deepspeed.initialize(args=args, model=model) - data_loader = random_dataloader(model=model, - total_samples=5, - hidden_dim=hidden_dim, - device=model.device) - for n, batch in enumerate(data_loader): - loss = model(batch[0], batch[1]) - with pytest.raises(AssertionError): - model.backward(loss) - with pytest.raises(AssertionError): - model.step() - - _helper() - - -def test_none_args(tmpdir): - config = { - "train_batch_size": 1, - "optimizer": { - "type": "Adam", - "params": { - "lr": 0.00015 - } - }, - "fp16": { - 
"enabled": True - } - } - - @distributed_test(world_size=1) - def _helper(): - model = SimpleModel(hidden_dim=10) - model, _, _, _ = deepspeed.initialize(args=None, model=model, config=config) - data_loader = random_dataloader(model=model, - total_samples=5, - hidden_dim=10, - device=model.device) - for n, batch in enumerate(data_loader): - loss = model(batch[0], batch[1]) - - _helper() - - -def test_no_args(tmpdir): - config = { - "train_batch_size": 1, - "optimizer": { - "type": "Adam", - "params": { - "lr": 0.00015 - } - }, - "fp16": { - "enabled": True - } - } - - @distributed_test(world_size=1) - def _helper(): - model = SimpleModel(hidden_dim=10) - model, _, _, _ = deepspeed.initialize(model=model, config=config) - data_loader = random_dataloader(model=model, - total_samples=5, - hidden_dim=10, - device=model.device) - for n, batch in enumerate(data_loader): - loss = model(batch[0], batch[1]) - - _helper() - - -def test_no_model(tmpdir): - config = { - "train_batch_size": 1, - "optimizer": { - "type": "Adam", - "params": { - "lr": 0.00015 - } - }, - "fp16": { - "enabled": True - } - } - - @distributed_test(world_size=1) - def _helper(): - model = SimpleModel(hidden_dim=10) - with pytest.raises(AssertionError): - model, _, _, _ = deepspeed.initialize(model=None, config=config) - - with pytest.raises(AssertionError): - model, _, _, _ = deepspeed.initialize(model, config=config) diff --git a/tests/unit/test_configurable_parallel.py b/tests/unit/test_configurable_parallel.py deleted file mode 100644 index 35486181072b8ae1cfbb5a0657d22764b217532d..0000000000000000000000000000000000000000 --- a/tests/unit/test_configurable_parallel.py +++ /dev/null @@ -1,453 +0,0 @@ -import torch -import deepspeed -import pytest -import os -import time -import random -import numpy as np -import torch.multiprocessing as mp -import torch.distributed as dist -from .common import distributed_test -from .simple_model import args_from_dict, create_deepspeed_args -from .megatron_model import get_gpt2_model, get_megatron_version -from .megatron_model import MockGPT2ModelPipe as GPT2ModelPipe -from deepspeed.utils import RepeatingLoader - -TORCH_MAJOR = int(torch.__version__.split('.')[0]) -TORCH_MINOR = int(torch.__version__.split('.')[1]) -pytestmark = pytest.mark.skipif( - TORCH_MAJOR < 1 or (TORCH_MAJOR == 1 and TORCH_MINOR < 5), - reason='Megatron-LM package requires Pytorch version 1.5 or above') - - -def reset_random(seed=1234): - random.seed(seed) - np.random.seed(seed) - torch.manual_seed(seed) - torch.cuda.manual_seed_all(seed) - - -class TestConfigurableMP: - def setup_method(self, method): - reset_random() - - def get_inputs(self, bs=1, seq_len=20): - input_ids = torch.randint(low=0, high=1000, size=(bs, seq_len)) - position_ids = torch.randint(low=0, high=2, size=(bs, seq_len)) - attention_mask = torch.randint(low=0, - high=2, - size=(bs, - seq_len), - dtype=torch.bool) - return [input_ids, position_ids, attention_mask] - - def get_deepspeed_model(self, model, tmpdir): - ds_config_dict = { - "train_micro_batch_size_per_gpu": 1, - "optimizer": { - "type": "Lamb", - "params": { - "lr": 0.00015 - } - }, - } - - from megatron import mpu - model, _, _,_ = deepspeed.initialize(model=model, - mpu=mpu, - model_parameters=model.parameters(), - config=ds_config_dict) - return model - - def test_gpt2_basic(self, tmpdir): - # basic test case, mp_size=1, verify ckpt saving/loading. 
- - @distributed_test(world_size=1) - def _run(): - inputs = self.get_inputs() - args_defaults = { - 'num_layers': 2, - 'hidden_size': 128, - 'num_attention_heads': 8, - 'max_position_embeddings': 128, - } - - model = get_gpt2_model(args_defaults) - model = self.get_deepspeed_model(model, tmpdir) - - model.eval() - baseline = model(inputs[0].cuda(), inputs[1].cuda(), inputs[2].cuda()) - - tag = 'mp_1' - state_dict = {} - state_dict['checkpoint_version'] = get_megatron_version() - model.save_checkpoint(tmpdir, tag=tag, client_state=state_dict) - dist.barrier() - model.load_checkpoint(tmpdir, - tag=tag, - load_optimizer_states=False, - load_lr_scheduler_states=False) - - test = model(inputs[0], inputs[1], inputs[2]) - assert torch.allclose(baseline, test, atol=1e-07), f"Baseline output {baseline} is not equal to save-then-load output {test}" - - _run() - - def test_gpt2_mp2_no_resize(self, tmpdir): - # test mp_size=2 case, verify ckpt saving/loading without resize. - - @distributed_test(world_size=2) - def _run(inputs): - args_defaults = { - 'num_layers': 2, - 'hidden_size': 128, - 'num_attention_heads': 8, - 'max_position_embeddings': 128, - } - - model = get_gpt2_model(args_defaults, mp_size=2) - model = self.get_deepspeed_model(model, tmpdir) - - model.eval() - - baseline = model(inputs[0].cuda(), inputs[1].cuda(), inputs[2].cuda()) - - tag = 'mp_2' - state_dict = {} - state_dict['checkpoint_version'] = get_megatron_version() - model.save_checkpoint(tmpdir, tag=tag, client_state=state_dict) - dist.barrier() - model.load_checkpoint(tmpdir, - tag=tag, - load_optimizer_states=False, - load_lr_scheduler_states=False) - - test = model(inputs[0].cuda(), inputs[1].cuda(), inputs[2].cuda()) - assert torch.allclose(baseline, test, rtol=1.0, atol=1e-07), f"Baseline output {baseline} is not equal to save-then-load output {test}" - - inputs = self.get_inputs() - _run(inputs) - - def _test_gpt2_config_mp(self, tmpdir, mp_size, resize): - # test mp_size=2 case, verify resize=1 case for ckpt merging. 
- - @distributed_test(world_size=mp_size) - def _run_baseline(inputs, tag, output, quit_event): - reset_random() - args_defaults = { - 'num_layers': 2, - 'hidden_size': 128, - 'num_attention_heads': 8, - 'max_position_embeddings': 128, - } - - model = get_gpt2_model(args_defaults, mp_size=mp_size) - model = self.get_deepspeed_model(model, tmpdir) - - model.eval() - - with torch.no_grad(): - baseline = model(inputs[0].cuda(), inputs[1].cuda(), inputs[2].cuda()) - if dist.get_rank() == 0: - output.put(baseline.cpu()) - - state_dict = {} - state_dict['checkpoint_version'] = get_megatron_version() - model.save_checkpoint(tmpdir, tag=tag, client_state=state_dict) - quit_event.wait() - - @distributed_test(world_size=resize) - def _run_resize(inputs, tag, output, quit_event): - reset_random() - args_defaults = { - 'num_layers': 2, - 'hidden_size': 128, - 'num_attention_heads': 8, - 'max_position_embeddings': 128, - } - - model = get_gpt2_model(args_defaults, mp_size=resize) - model = self.get_deepspeed_model(model, tmpdir) - - model.eval() - - with torch.no_grad(): - model.load_checkpoint(tmpdir, - tag=tag, - load_optimizer_states=False, - load_lr_scheduler_states=False) - test = model(inputs[0].cuda(), inputs[1].cuda(), inputs[2].cuda()) - if dist.get_rank() == 0: - output.put(test.cpu()) - quit_event.wait() - - def _verify(b_queue, t_queue, baseline_event, test_event): - baseline = b_queue.get() - baseline_event.set() - - test = t_queue.get() - test_event.set() - - assert torch.allclose(baseline, test, atol=1e-03), f"Baseline output {baseline} is not equal to save-then-load output {test}" - - tag = f'mp_{mp_size}_resize_{resize}' - inputs = self.get_inputs() - - baseline = mp.Queue() - test = mp.Queue() - baseline_event = mp.Event() - test_event = mp.Event() - - verify_process = mp.Process(target=_verify, - args=(baseline, - test, - baseline_event, - test_event)) - verify_process.start() - - _run_baseline(inputs, tag, baseline, baseline_event) - _run_resize(inputs, tag, test, test_event) - - verify_process.join() - - def test_gpt2_mp_2to1(self, tmpdir): - # test mp_size=2 case, verify resize=1 case for ckpt merging. - self._test_gpt2_config_mp(tmpdir, mp_size=2, resize=1) - - def test_gpt2_mp_2to4(self, tmpdir): - # test mp_size=2 case, verify resize=4 case for ckpt splitting. - self._test_gpt2_config_mp(tmpdir, mp_size=2, resize=4) - - -class TestConfigurablePP: - def setup_method(self, method): - reset_random() - - def get_inputs(self, bs=1, seq_len=1, hidden_size=128): - hidden_states = torch.randn(bs, seq_len, hidden_size) - attention_mask = torch.randint(low=0, - high=2, - size=(bs, - seq_len), - dtype=torch.bool) - return (hidden_states, attention_mask) - - def get_deepspeed_model(self, model, tmpdir): - ds_config_dict = { - "train_micro_batch_size_per_gpu": 1, - "optimizer": { - "type": "Lamb", - "params": { - "lr": 0.00015 - } - }, - } - dist.barrier() - - model, _, _,_ = deepspeed.initialize(model=model, - model_parameters=model.parameters(), - config=ds_config_dict) - return model.cuda() - - def get_topology(self, mp, pp, world_size): - assert world_size % (pp * mp) == 0 - dp = world_size // (pp * mp) - - from deepspeed.runtime.pipe.topology import PipeModelDataParallelTopology - topo = PipeModelDataParallelTopology(num_pp=pp, num_mp=mp, num_dp=dp) - - return topo - - def test_pp_basic(self, tmpdir): - # basic test case, mp_size=2, pp_size=2, verify ckpt saving/loading. 
- - mp_size = 2 - pp_size = 2 - world_size = mp_size * pp_size - - @distributed_test(world_size=world_size) - def _run(): - args_defaults = { - 'num_layers': 8, - 'hidden_size': 128, - 'num_attention_heads': 8, - 'max_position_embeddings': 128, - } - - topo = self.get_topology(mp_size, pp_size, world_size) - gpt2_pipe_model = GPT2ModelPipe(num_layers=8, - num_stages=pp_size, - mp_size=mp_size, - args_others=args_defaults, - topo=topo) - model = self.get_deepspeed_model(gpt2_pipe_model, tmpdir) - - tag = 'pp_basic' - state_dict = {} - state_dict['checkpoint_version'] = get_megatron_version() - model.save_checkpoint(tmpdir, tag=tag, client_state=state_dict) - - if model.is_first_stage() or model.is_last_stage(): - inputs = self.get_inputs() - loader = RepeatingLoader([(inputs[0], 0)]) - data_iter = iter(loader) - else: - data_iter = None - - baseline = model.eval_batch(data_iter=data_iter, - compute_loss=False, - reduce_output=None) - - dist.barrier() - model.load_checkpoint(tmpdir, - tag=tag, - load_optimizer_states=False, - load_lr_scheduler_states=False) - dist.barrier() - - test = model.eval_batch(data_iter=data_iter, - compute_loss=False, - reduce_output=None) - - if test is not None: - assert len(baseline) == len(test) - # Compare outputs of each microbatch - for mb in range(len(baseline)): - for b, t in zip(baseline[mb], test[mb]): - if b.is_floating_point(): # don't compare masks - assert torch.allclose(b, t, atol=1e-07), f"Baseline output {baseline} is not equal to save-then-load output {test}" - - _run() - - def _test_gpt2_config_pp(self, tmpdir, mp_size, pp_size, mp_resize, pp_resize): - @distributed_test(world_size=pp_size * mp_size) - def _run_baseline(inputs, tag, output, quit_event): - reset_random() - args_defaults = { - 'num_layers': 8, - 'hidden_size': 128, - 'num_attention_heads': 8, - 'max_position_embeddings': 128, - } - - topo = self.get_topology(mp_size, pp_size, mp_size * pp_size) - gpt2_pipe_model = GPT2ModelPipe(num_layers=8, - num_stages=pp_size, - mp_size=mp_size, - args_others=args_defaults, - topo=topo) - model = self.get_deepspeed_model(gpt2_pipe_model, tmpdir) - - with torch.no_grad(): - inputs = [x.cuda() for x in inputs] - if model.is_first_stage() or model.is_last_stage(): - loader = RepeatingLoader([(inputs[0], 0)]) - data_iter = iter(loader) - else: - data_iter = None - - baseline = model.eval_batch(data_iter=data_iter, - compute_loss=False, - reduce_output=None) - - if baseline is not None: - # baseline should be [[hidden, True]]] - assert len(baseline) == 1 - assert len(baseline[0]) == 1 - assert torch.is_tensor(baseline[0][0]) - output.put(baseline[0][0].cpu()) - - state_dict = {} - state_dict['checkpoint_version'] = get_megatron_version() - model.save_checkpoint(tmpdir, tag=tag, client_state=state_dict) - quit_event.wait() - - @distributed_test(world_size=mp_resize * pp_resize) - def _run_resize(inputs, tag, output, quit_event): - reset_random() - args_defaults = { - 'num_layers': 8, - 'hidden_size': 128, - 'num_attention_heads': 8, - 'max_position_embeddings': 128, - } - - topo = self.get_topology(mp_resize, pp_resize, mp_resize * pp_resize) - gpt2_pipe_model = GPT2ModelPipe(num_layers=8, - num_stages=pp_resize, - mp_size=mp_resize, - args_others=args_defaults, - topo=topo) - model = self.get_deepspeed_model(gpt2_pipe_model, tmpdir) - - with torch.no_grad(): - model.load_checkpoint(tmpdir, - tag=tag, - load_optimizer_states=False, - load_lr_scheduler_states=False) - inputs = [x.cuda() for x in inputs] - if model.is_first_stage() or 
model.is_last_stage(): - loader = RepeatingLoader([(inputs[0], 0)]) - data_iter = iter(loader) - else: - data_iter = None - - test = model.eval_batch(data_iter=data_iter, - compute_loss=False, - reduce_output=None) - - if test is not None: - # test should be [[hidden, True]]] - assert len(test) == 1 - assert len(test[0]) == 1 - assert torch.is_tensor(test[0][0]) - output.put(test[0][0].cpu()) - - quit_event.wait() - - def _verify(b_queue, t_queue, baseline_event, test_event): - baseline = b_queue.get() - baseline_event.set() - - test = t_queue.get() - test_event.set() - - assert torch.allclose(baseline, test, atol=1e-03), f"Baseline output {baseline} is not equal to save-then-load output {test}" - - tag = f'mp_{mp_size}to{mp_resize}_pp_{pp_size}to{pp_resize}' - - baseline = mp.Queue() - test = mp.Queue() - baseline_event = mp.Event() - test_event = mp.Event() - - verify_process = mp.Process(target=_verify, - args=(baseline, - test, - baseline_event, - test_event)) - verify_process.start() - - inputs = self.get_inputs() - _run_baseline(inputs, tag, baseline, baseline_event) - _run_resize(inputs, tag, test, test_event) - - verify_process.join() - - def test_gpt2_mp1_pp_2to1(self, tmpdir): - self._test_gpt2_config_pp(tmpdir, mp_size=1, pp_size=2, mp_resize=1, pp_resize=1) - - def test_gpt2_mp1_pp_2to4(self, tmpdir): - self._test_gpt2_config_pp(tmpdir, mp_size=1, pp_size=2, mp_resize=1, pp_resize=4) - - def test_gpt2_mp2_pp_2to1(self, tmpdir): - self._test_gpt2_config_pp(tmpdir, mp_size=2, pp_size=2, mp_resize=2, pp_resize=1) - - def test_gpt2_mp2_pp_1to2(self, tmpdir): - self._test_gpt2_config_pp(tmpdir, mp_size=2, pp_size=1, mp_resize=2, pp_resize=2) - - def test_gpt2_pp_2to1_mp_2to1(self, tmpdir): - self._test_gpt2_config_pp(tmpdir, mp_size=2, pp_size=2, mp_resize=1, pp_resize=1) - - def test_gpt2_pp_1to2_mp_1to2(self, tmpdir): - self._test_gpt2_config_pp(tmpdir, mp_size=1, pp_size=1, mp_resize=2, pp_resize=2) diff --git a/tests/unit/test_cpu_adagrad.py b/tests/unit/test_cpu_adagrad.py deleted file mode 100644 index 66e246ed23fc1e1e2d5adc8302c087c270a19e2b..0000000000000000000000000000000000000000 --- a/tests/unit/test_cpu_adagrad.py +++ /dev/null @@ -1,136 +0,0 @@ -import torch -import numpy as np -import pytest - -import deepspeed -from deepspeed.ops.adagrad import DeepSpeedCPUAdagrad -from deepspeed.ops.op_builder import CPUAdagradBuilder - -if not deepspeed.ops.__compatible_ops__[CPUAdagradBuilder.NAME]: - pytest.skip("cpu-adagrad is not compatible") - - -def check_equal(first, second, atol=1e-2, verbose=False): - x = first.detach().numpy() - y = second.detach().numpy() - if verbose: - print("x = {}".format(x.flatten())) - print("y = {}".format(y.flatten())) - print('-' * 80) - np.testing.assert_allclose(x, y, err_msg="param-update mismatch!", atol=atol) - - -@pytest.mark.parametrize('model_size', - [ - (64), - (22), - (55), - (127), - (1024), - (1048576), - (30000000), - ]) # yapf: disable -def test_cpu_adagrad_opt(model_size): - device = 'cpu' - rng_state = torch.get_rng_state() - param = torch.nn.Parameter(torch.randn(model_size, device=device)) - torch.set_rng_state(rng_state) - param1 = torch.nn.Parameter(torch.randn(model_size, device=device)) - torch.set_rng_state(rng_state) - - optimizer = DeepSpeedCPUAdagrad([param]) - optimizer1 = torch.optim.Adagrad([param1]) - - for i in range(10): - rng_state = torch.get_rng_state() - param.grad = torch.randn(model_size, device=device) - torch.set_rng_state(rng_state) - param1.grad = torch.randn(model_size, device=device) - optimizer.step() 
- optimizer1.step() - - check_equal(param, param1, atol=1e-2, verbose=True) - - -@pytest.mark.parametrize('model_size,vocabulary_size,dim', - [ - (16 * 2, 16 * 4, 16), - (16 * 32, 16 * 256, 16), - (16 * 256, 16 * 16384, 16), - ]) # yapf: disable -def test_cpu_adagrad_opt_sparse_embedding(model_size, vocabulary_size, dim): - device = 'cpu' - rng_state = torch.get_rng_state() - - def gen_sparse_grad(vocabulary_size, dim, num_indices, dtype, device): - i = torch.randint(vocabulary_size, - size=(1, - num_indices), - dtype=torch.int64, - device=device) - v = torch.randn(num_indices, dim, dtype=dtype, device=device) - t = torch.sparse_coo_tensor(i, v, (vocabulary_size, dim), device=device) - t = t.coalesce() - new_i = (t.indices().view(-1, - 1).repeat(1, - dim) * dim + - torch.tensor(range(dim))).flatten().unsqueeze(0) - new_v = t.values().flatten() - new_t = torch.sparse_coo_tensor(new_i, - new_v, - (vocabulary_size * dim, - ), - device=device) - new_t = new_t.coalesce() - new_t.requires_grad = False - return new_t - - voc_size = vocabulary_size - dim = dim - num_indices = int(model_size // dim) - dtype = torch.float32 - - param = torch.nn.Parameter(torch.randn((voc_size * dim, - ), - dtype=dtype, - device=device), - requires_grad=True) - torch.set_rng_state(rng_state) - param1 = torch.nn.Parameter(torch.randn((voc_size * dim, - ), - dtype=dtype, - device=device), - requires_grad=True) - torch.set_rng_state(rng_state) - - optimizer = DeepSpeedCPUAdagrad([param]) - optimizer1 = torch.optim.Adagrad([param1]) - - for i in range(10): - torch.set_rng_state(rng_state) - param.grad = gen_sparse_grad(voc_size, - dim, - num_indices, - dtype=dtype, - device=device) - torch.set_rng_state(rng_state) - param1.grad = gen_sparse_grad(voc_size, - dim, - num_indices, - dtype=dtype, - device=device) - optimizer.step() - optimizer1.step() - - check_equal(param, param1, atol=1e-2, verbose=True) - - -def test_cpu_adam_gpu_error(): - model_size = 64 - device = 'cuda:0' - param = torch.nn.Parameter(torch.randn(model_size, device=device)) - optimizer = DeepSpeedCPUAdagrad([param]) - - param.grad = torch.randn(model_size, device=device) - with pytest.raises(AssertionError): - optimizer.step() diff --git a/tests/unit/test_cpu_adam.py b/tests/unit/test_cpu_adam.py deleted file mode 100644 index bec8faf89aea2ff8529f7f6b88424442dd3dd65f..0000000000000000000000000000000000000000 --- a/tests/unit/test_cpu_adam.py +++ /dev/null @@ -1,74 +0,0 @@ -import argparse -import torch -import time -import numpy as np -import pytest -import copy - -import deepspeed -from deepspeed.ops.adam import FusedAdam -from deepspeed.ops.op_builder import CPUAdamBuilder - -if not deepspeed.ops.__compatible_ops__[CPUAdamBuilder.NAME]: - pytest.skip("cpu-adam is not compatible") - - -def check_equal(first, second, atol=1e-2, verbose=False): - x = first.detach().numpy() - y = second.detach().numpy() - if verbose: - print("x = {}".format(x.flatten())) - print("y = {}".format(y.flatten())) - print('-' * 80) - np.testing.assert_allclose(x, y, err_msg="param-update mismatch!", atol=atol) - -@pytest.mark.parametrize('model_size', - [ - (64), - (22), - (55), - (127), - (1024), - (1048576), - ]) # yapf: disable -def test_cpu_adam_opt(model_size): - from deepspeed.ops.adam import DeepSpeedCPUAdam - device = 'cpu' - rng_state = torch.get_rng_state() - param = torch.nn.Parameter(torch.randn(model_size, device=device)) - torch.set_rng_state(rng_state) - param1 = torch.nn.Parameter(torch.randn(model_size, device=device)) - torch.set_rng_state(rng_state) - 
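Editorial note: the deleted test_cpu_adagrad.py and test_cpu_adam.py files in this part of the diff both use the same parity pattern: step a DeepSpeed CPU optimizer and a reference torch optimizer on identical parameters and gradients, then compare the results within a loose tolerance. A condensed sketch of that pattern for DeepSpeedCPUAdam versus torch.optim.AdamW is below; it assumes the cpu-adam extension can be built on the machine, and the sizes, step count, and tolerance are illustrative.

```python
# Parity sketch (assumes deepspeed's cpu-adam op is buildable here).
import torch
from deepspeed.ops.adam import DeepSpeedCPUAdam

torch.manual_seed(0)
ref_param = torch.nn.Parameter(torch.randn(64))
ds_param = torch.nn.Parameter(ref_param.detach().clone())

ref_opt = torch.optim.AdamW([ref_param])
ds_opt = DeepSpeedCPUAdam([ds_param])   # adamw_mode defaults to True

for _ in range(10):
    grad = torch.randn(64)
    ref_param.grad = grad.clone()
    ds_param.grad = grad.clone()
    ref_opt.step()
    ds_opt.step()

# Loose tolerance, matching the spirit of the deleted check_equal helper.
assert torch.allclose(ref_param, ds_param, atol=1e-2)
```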
param2_data = torch.randn(model_size, device=device).cuda() - param2 = torch.nn.Parameter(param2_data) - - optimizer1 = torch.optim.AdamW([param1]) - optimizer2 = FusedAdam([param2]) - optimizer = DeepSpeedCPUAdam([param]) - - for i in range(10): - rng_state = torch.get_rng_state() - param.grad = torch.randn(model_size, device=device) - torch.set_rng_state(rng_state) - param1.grad = torch.randn(model_size, device=device) - torch.set_rng_state(rng_state) - param2.grad = torch.randn(model_size, device=device).cuda() - - optimizer.step() - optimizer2.step() - optimizer1.step() - - check_equal(param, param1, atol=1e-2, verbose=True) - check_equal(param, param2.cpu(), atol=1e-2, verbose=True) - - -def test_cpu_adam_gpu_error(): - model_size = 64 - from deepspeed.ops.adam import DeepSpeedCPUAdam - device = 'cuda:0' - param = torch.nn.Parameter(torch.randn(model_size, device=device)) - optimizer = DeepSpeedCPUAdam([param]) - - param.grad = torch.randn(model_size, device=device) - with pytest.raises(AssertionError): - optimizer.step() diff --git a/tests/unit/test_csr.py b/tests/unit/test_csr.py deleted file mode 100644 index bd5f9933d2a20949bb95d297acfc0ab3a71a5ade..0000000000000000000000000000000000000000 --- a/tests/unit/test_csr.py +++ /dev/null @@ -1,50 +0,0 @@ -import torch -import random -from deepspeed.runtime.sparse_tensor import SparseTensor - - -def test_csr_addition_self(): - row_count = 10 - random.seed(1234) - - x = torch.ones(1, 5) - for i in range(row_count - 1): - if random.random() > 0.75: - x = torch.cat([x, torch.ones(1, 5)]) - else: - x = torch.cat([x, torch.zeros(1, 5)]) - dense_x = x.clone() - cx = SparseTensor(x) - - assert torch.all(dense_x == cx.to_dense()) - - cx.add(cx) - assert torch.all(dense_x + dense_x == cx.to_dense()) - - -def test_csr_addition_different(): - row_count = 10 - random.seed(1234) - - x = torch.ones(1, 5) - for i in range(row_count - 1): - if random.random() > 0.75: - x = torch.cat([x, torch.ones(1, 5)]) - else: - x = torch.cat([x, torch.zeros(1, 5)]) - dense_x = x.clone() - cx = SparseTensor(x) - - y = torch.ones(1, 5) - for i in range(row_count - 1): - if random.random() > 0.75: - y = torch.cat([y, torch.ones(1, 5)]) - else: - y = torch.cat([y, torch.zeros(1, 5)]) - dense_y = y.clone() - cy = SparseTensor(y) - - dense_sum = dense_x + dense_y - cx.add(cy) - - assert torch.all(dense_sum == cx.to_dense()) diff --git a/tests/unit/test_cuda_backward.py b/tests/unit/test_cuda_backward.py deleted file mode 100644 index d7faee7c05029df6c0970dde1604ab6b91ee9c00..0000000000000000000000000000000000000000 --- a/tests/unit/test_cuda_backward.py +++ /dev/null @@ -1,340 +0,0 @@ -import numpy as np -import torch -import torch.nn.functional as F -import pytest -import random -import copy -from torch import nn -from deepspeed import DeepSpeedTransformerLayer, DeepSpeedTransformerConfig -from .modeling import BertConfig, BertLayerNorm, BertEncoder as BertEncoderPostln -from .modelingpreln import BertEncoder as BertEncoderPreln - -#if not deepspeed.ops.__installed_ops__['transformer']: -#pytest.skip( -# "transformer kernels are temporarily disabled because of unexplained failures", -# allow_module_level=True) - - -def check_equal(first, second, atol=1e-2, verbose=False): - diction_x = {} - diction_y = {} - - if verbose: - for i, (x, y) in enumerate(zip(first, second)): - print(x[1], y[1]) - - for i, (x, y) in enumerate(zip(first, second)): - k = 0 - while (diction_x.get((k, x[1])) is not None): - k = k + 1 - diction_x[k, x[1]] = x[0] - k = 0 - while (diction_y.get((k, 
y[1])) is not None): - k = k + 1 - diction_y[k, y[1]] = y[0] - if verbose: - print() - for i, (x, y) in enumerate(zip(diction_x, diction_y)): - print(x, y) - - for i, (x, y) in enumerate(zip(diction_x, diction_y)): - if (x[0] == 1): continue - if verbose: - print("checking ", x[1], ":") - y = diction_y[x[0], x[1]] - x = diction_x[x[0], x[1]] - - if verbose: - print(((x == float('inf')).nonzero(as_tuple=True)[0])) - print(((y == float('inf')).nonzero(as_tuple=True)[0])) - x = x.cpu().detach().numpy() - y = y.cpu().detach().numpy() - - avgx = np.sum(abs(x), dtype=float) - countx = x.shape[0] - for i in range(len(x.shape) - 1): - countx *= x.shape[i + 1] - avgx = np.sum(avgx) - tolerance = 1 - if avgx != float('inf') and avgx != -float('inf'): - avgx = avgx / countx - tolerance = avgx * atol - if verbose: - print("tolerance is ", tolerance) - x = x.flatten() - y = y.flatten() - print("x = {}".format(x)) - print("y = {}".format(y)) - if any(x == float('inf')) or any(x == -float('inf')): - print("found infinity in x") - if any(y == float('inf')) or any(y == -float('inf')): - print("found infinity in y") - print(np.linalg.norm(x.astype('float64'))) - print(np.linalg.norm(y.astype('float64'))) - print('-' * 80) - #toler = np.linalg.norm(x.astype('float64')) * 0.0005 - np.testing.assert_allclose(x, y, err_msg="Index: {}".format(i), atol=tolerance) - - -def zero_grad(variables): - for variable in variables: - variable.grad.zero_() - - -device = torch.device("cuda") -kwargs_fp32 = {'dtype': torch.float, 'device': device, 'requires_grad': True} -kwargs_fp16 = {'dtype': torch.half, 'device': device, 'requires_grad': True} - - -class DSEncoder(nn.Module): - def __init__(self, config, weights, biases): - super(DSEncoder, self).__init__() - self.FinalLayerNorm = BertLayerNorm(config.hidden_size, eps=1e-12) - self.layer = nn.ModuleList([ - copy.deepcopy(DeepSpeedTransformerLayer(config, - weights, - biases)) - for _ in range(config.num_hidden_layers) - ]) - self.grads = [] - self.pre_or_post = config.pre_layer_norm - - def forward(self, - hidden_states, - attention_mask, - output_all_encoded_layers=True, - checkpoint_activations=False): - all_encoder_layers = [] - - def custom(start, end): - def custom_forward(*inputs): - layers = self.layer[start:end] - x_ = inputs[0] - for layer in layers: - x_ = layer(x_, inputs[1]) - return x_ - - return custom_forward - - if checkpoint_activations: - l = 0 - num_layers = len(self.layer) - chunk_length = math.ceil(math.sqrt(num_layers)) - while l < num_layers: - hidden_states = checkpoint.checkpoint(custom(l, - l + chunk_length), - hidden_states, - attention_mask * 1) - l += chunk_length - # decoder layers - else: - for i, layer_module in enumerate(self.layer): - hidden_states = layer_module(hidden_states, - attention_mask, - grads=self.grads) - hidden_states.register_hook( - lambda x, - self=self: self.grads.append([x, - "hidden_state"])) - - if output_all_encoded_layers: - all_encoder_layers.append(hidden_states) - - if not output_all_encoded_layers or checkpoint_activations: - if (self.pre_or_post): - hidden_states = self.FinalLayerNorm(hidden_states) - all_encoder_layers.append(hidden_states) - return all_encoder_layers - - def get_grads(self): - return self.grads - - -def create_models(ds_config): - bert_config = BertConfig(vocab_size_or_config_json_file=119547, - hidden_size=ds_config.hidden_size, - num_hidden_layers=ds_config.num_hidden_layers, - num_attention_heads=ds_config.heads, - intermediate_size=ds_config.intermediate_size, - hidden_act="gelu", - 
hidden_dropout_prob=ds_config.hidden_dropout_ratio, - attention_probs_dropout_prob=ds_config.attn_dropout_ratio, - max_position_embeddings=512, - type_vocab_size=2, - initializer_range=ds_config.initializer_range) - - weights = [] - biases = [] - - for i in range(4): - weights.append( - nn.Parameter(torch.Tensor(ds_config.hidden_size, - ds_config.hidden_size))) - weights[i].data.normal_(mean=0.0, std=ds_config.initializer_range) - - weights.append(nn.Parameter(torch.Tensor(ds_config.hidden_size))) - weights[4].data.fill_(1.0) - weights.append( - nn.Parameter(torch.Tensor(ds_config.intermediate_size, - ds_config.hidden_size))) - weights[5].data.normal_(mean=0.0, std=ds_config.initializer_range) - weights.append( - nn.Parameter(torch.Tensor(ds_config.hidden_size, - ds_config.intermediate_size))) - weights[6].data.normal_(mean=0.0, std=ds_config.initializer_range) - weights.append(nn.Parameter(torch.Tensor(ds_config.hidden_size))) - weights[7].data.fill_(1.0) - - biases.append(nn.Parameter(torch.Tensor(ds_config.hidden_size))) - biases[0].data.zero_() - for i in range(4): - biases.append(nn.Parameter(torch.Tensor(ds_config.hidden_size))) - biases[i + 1].data.zero_() - biases.append(nn.Parameter(torch.Tensor(ds_config.intermediate_size))) - biases[5].data.zero_() - biases.append(nn.Parameter(torch.Tensor(ds_config.hidden_size))) - biases[6].data.zero_() - biases.append(nn.Parameter(torch.Tensor(ds_config.hidden_size))) - biases[7].data.zero_() - - if (ds_config.pre_layer_norm): - bert_encoder = BertEncoderPreln(bert_config, weights, biases) - else: - bert_encoder = BertEncoderPostln(bert_config, weights, biases) - ds_encoder = DSEncoder(ds_config, weights, biases) - - if ds_config.fp16: - bert_encoder.half() - ds_encoder.half() - - bert_encoder.cuda() - ds_encoder.cuda() - - return bert_encoder, ds_encoder - - -def set_seed(seed): - random.seed(seed) - np.random.seed(seed) - torch.manual_seed(seed) - - -def run_backward(ds_config, seq_len, atol=1e-2, verbose=False): - set_seed(123) - bert_encoder, ds_encoder = create_models(ds_config) - - # prepare test data - kwargs = kwargs_fp16 if ds_config.fp16 else kwargs_fp32 - hidden_states = torch.randn(ds_config.batch_size, - seq_len, - ds_config.hidden_size, - **kwargs) - input_mask = torch.randn(ds_config.batch_size, 1, 1, seq_len, **kwargs) - Y = torch.randn(ds_config.batch_size, seq_len, ds_config.hidden_size, **kwargs) - - # run baseline - base_results = bert_encoder(hidden_states, - input_mask, - output_all_encoded_layers=False, - checkpoint_activations=False) - - loss = (Y - base_results[0]).pow(2).sum() / 64 - loss.backward() - base_grads = bert_encoder.get_grads() - - # run ds - ds_results = ds_encoder(hidden_states, - input_mask, - output_all_encoded_layers=False, - checkpoint_activations=False) - - loss = (Y - ds_results[0]).pow(2).sum() / 64 - loss.backward() - ds_grads = ds_encoder.get_grads() - - # check grads - check_equal(base_grads, ds_grads, atol=atol, verbose=verbose) - - -#test_backward[3-1024-120-16-24-True-True-0.05] -#test_backward[3-1024-52-16-24-False-True-0.2] -# 3-128-54-2-24-False-True-0.2 -@pytest.mark.parametrize('batch_size, hidden_size, seq_len, heads, num_layers, is_preln, use_fp16, atol', - [ - (64,160,128,2,24,False,True, 0.2), - (64,1600,128,2,4,False,True, 0.2), - (8,1600,128,25,3,True,True, 0.05), - (8,160,128,2,3,True,True, 0.1), - (8,1600,128,2,3,True,True, 0.05), - #(3,1024,119,16,24,True,False, 0.05), - #(3,1024,115,16,24,True,True, 0.05), - #(1024,128,10,2,2,False,False, 0.1), - #(3,1024,52,16,24,False,True, 
0.2), - #(3,128,51,2,24,False,False, 0.1), - #(3,128,54,2,24,False,True, 0.2), - ]) # yapf: disable -def test_backward(batch_size, - hidden_size, - seq_len, - heads, - num_layers, - is_preln, - use_fp16, - atol): - # Only run fp16 test cases on devices with 7+ capability. - major, _ = torch.cuda.get_device_capability() - if major < 7 and (use_fp16 is True or is_preln is False): - return - - ds_config = DeepSpeedTransformerConfig() - ds_config.layer_id = None - ds_config.batch_size = batch_size - ds_config.hidden_size = hidden_size - ds_config.intermediate_size = hidden_size - ds_config.heads = heads - ds_config.attn_dropout_ratio = 0.0 - ds_config.hidden_dropout_ratio = 0.0 - ds_config.num_hidden_layers = num_layers - ds_config.pre_layer_norm = is_preln - ds_config.initializer_range = 0.02 - ds_config.fp16 = use_fp16 - - run_backward(ds_config, seq_len, atol=atol, verbose=True) - - -#@pytest.mark.parametrize('batch_size, hidden_size, seq_len, heads, num_layers, is_preln, use_fp16, atol', -# [ -# (3,1024,128,16,24,True,False, 0.07), -# (3,1024,128,16,24,True,True, 0.05), -# (3,1024,128,16,24,False,False, 0.1), -# (3,1024,128,16,24,False,True, 0.2), -# ]) # yapf: disable -#def test_backward_stochastic(batch_size, -# hidden_size, -# seq_len, -# heads, -# num_layers, -# is_preln, -# use_fp16, -# atol): -# # Only run fp16 test cases on devices with 7+ capability. -# major, _ = torch.cuda.get_device_capability() -# if major < 7 and (use_fp16 is True or is_preln is False): -# return -# -# ds_config = DeepSpeedTransformerConfig() -# ds_config.layer_id = None -# ds_config.batch_size = batch_size -# ds_config.hidden_size = hidden_size -# ds_config.intermediate_size = 4 * hidden_size -# ds_config.max_seq_length = seq_len -# ds_config.heads = heads -# ds_config.attn_dropout_ratio = 0.0 -# ds_config.hidden_dropout_ratio = 0.0 -# ds_config.num_hidden_layers = num_layers -# ds_config.pre_layer_norm = is_preln -# ds_config.initializer_range = 0.02 -# ds_config.fp16 = use_fp16 -# ds_config.stochastic_mode = True -# -# run_backward(ds_config, atol=atol) diff --git a/tests/unit/test_cuda_forward.py b/tests/unit/test_cuda_forward.py deleted file mode 100644 index 2a5d2d13858e01af6bc05886baa67f4ba778dbce..0000000000000000000000000000000000000000 --- a/tests/unit/test_cuda_forward.py +++ /dev/null @@ -1,332 +0,0 @@ -import argparse -import numpy as np -import torch -import torch.nn.functional as F -import pytest -import json -import random -import time -import copy -from torch import nn -from .modelingpreln import BertEncoder as BertEncoderPreln -from .modeling import BertLayerNorm, BertConfig, BertEncoder as BertEncoderPostln -from deepspeed import DeepSpeedTransformerLayer, DeepSpeedTransformerConfig -import deepspeed - -import sys - - -def check_equal(first, second, atol=1e-2, verbose=False): - if verbose: - print() - for i, (x, y) in enumerate(zip(first, second)): - x = x[0].cpu().detach().numpy() - y = y[0].cpu().detach().numpy() - if verbose: - print("x = {}".format(x.flatten())) - print("y = {}".format(y.flatten())) - print('-' * 80) - np.testing.assert_allclose(x, y, err_msg="Index: {}".format(i), atol=atol) - - -def zero_grad(variables): - for variable in variables: - variable.grad.zero_() - - -device = torch.device("cuda") -kwargs_fp32 = {'dtype': torch.float, 'device': device, 'requires_grad': True} -kwargs_fp16 = {'dtype': torch.half, 'device': device, 'requires_grad': True} - - -class DSEncoder(nn.Module): - def __init__(self, config, weights, biases): - super(DSEncoder, self).__init__() - 
self.FinalLayerNorm = BertLayerNorm(config.hidden_size, eps=1e-12) - self.layer = nn.ModuleList([ - copy.deepcopy(DeepSpeedTransformerLayer(config, - weights, - biases)) - for _ in range(config.num_hidden_layers) - ]) - self.grads = [] - self.pre_or_post = config.pre_layer_norm - - def forward(self, - hidden_states, - attention_mask, - output_all_encoded_layers=True, - checkpoint_activations=False): - all_encoder_layers = [] - - def custom(start, end): - def custom_forward(*inputs): - layers = self.layer[start:end] - x_ = inputs[0] - for layer in layers: - x_ = layer(x_, inputs[1]) - return x_ - - return custom_forward - - if checkpoint_activations: - l = 0 - num_layers = len(self.layer) - chunk_length = math.ceil(math.sqrt(num_layers)) - while l < num_layers: - hidden_states = checkpoint.checkpoint(custom(l, - l + chunk_length), - hidden_states, - attention_mask * 1) - l += chunk_length - # decoder layers - else: - for i, layer_module in enumerate(self.layer): - hidden_states = layer_module(hidden_states, attention_mask) - - if output_all_encoded_layers: - all_encoder_layers.append(hidden_states) - - if not output_all_encoded_layers or checkpoint_activations: - if (self.pre_or_post): - hidden_states = self.FinalLayerNorm(hidden_states) - all_encoder_layers.append(hidden_states) - return all_encoder_layers - - -def create_models(ds_config): - bert_config = BertConfig(vocab_size_or_config_json_file=119547, - hidden_size=ds_config.hidden_size, - num_hidden_layers=ds_config.num_hidden_layers, - num_attention_heads=ds_config.heads, - batch_size=ds_config.batch_size, - intermediate_size=ds_config.intermediate_size, - hidden_act="gelu", - hidden_dropout_prob=ds_config.hidden_dropout_ratio, - attention_probs_dropout_prob=ds_config.attn_dropout_ratio, - max_position_embeddings=512, - type_vocab_size=2, - initializer_range=ds_config.initializer_range, - fp16=ds_config.fp16) - - weights = [] - biases = [] - - for i in range(4): - weights.append( - nn.Parameter(torch.Tensor(ds_config.hidden_size, - ds_config.hidden_size))) - weights[i].data.normal_(mean=0.0, std=ds_config.initializer_range) - - weights.append(nn.Parameter(torch.Tensor(ds_config.hidden_size))) - weights[4].data.fill_(1.0) - weights.append( - nn.Parameter(torch.Tensor(ds_config.intermediate_size, - ds_config.hidden_size))) - weights[5].data.normal_(mean=0.0, std=ds_config.initializer_range) - weights.append( - nn.Parameter(torch.Tensor(ds_config.hidden_size, - ds_config.intermediate_size))) - weights[6].data.normal_(mean=0.0, std=ds_config.initializer_range) - weights.append(nn.Parameter(torch.Tensor(ds_config.hidden_size))) - weights[7].data.fill_(1.0) - - biases.append(nn.Parameter(torch.Tensor(ds_config.hidden_size))) - biases[0].data.zero_() - for i in range(4): - biases.append(nn.Parameter(torch.Tensor(ds_config.hidden_size))) - biases[i + 1].data.zero_() - biases.append(nn.Parameter(torch.Tensor(ds_config.intermediate_size))) - biases[5].data.zero_() - biases.append(nn.Parameter(torch.Tensor(ds_config.hidden_size))) - biases[6].data.zero_() - biases.append(nn.Parameter(torch.Tensor(ds_config.hidden_size))) - biases[7].data.zero_() - - if (ds_config.pre_layer_norm): - bert_encoder = BertEncoderPreln(bert_config, weights, biases) - else: - bert_encoder = BertEncoderPostln(bert_config, weights, biases) - ds_encoder = DSEncoder(ds_config, weights, biases) - - if ds_config.fp16: - bert_encoder.half() - ds_encoder.half() - - bert_encoder.cuda() - ds_encoder.cuda() - - return bert_encoder, ds_encoder - - -def set_seed(seed): - 
random.seed(seed) - np.random.seed(seed) - torch.manual_seed(seed) - - -def run_forward(ds_config, seq_len, atol=1e-2, verbose=False, test_bsz=None): - set_seed(123) - bert_encoder, ds_encoder = create_models(ds_config) - - bsz = ds_config.batch_size if test_bsz is None else test_bsz - - # prepare test data - kwargs = kwargs_fp16 if ds_config.fp16 else kwargs_fp32 - hidden_states = torch.randn(bsz, seq_len, ds_config.hidden_size, **kwargs) - input_mask = torch.randn(bsz, 1, 1, seq_len, **kwargs) - - # run baseline - base_results = bert_encoder(hidden_states, - input_mask, - output_all_encoded_layers=False, - checkpoint_activations=False) - - # run ds - ds_results = ds_encoder(hidden_states, - input_mask, - output_all_encoded_layers=False, - checkpoint_activations=False) - - # check forward evaluation - check_equal(base_results, ds_results, atol=atol, verbose=verbose) - - -# FP16 test cases can only run on the devices support FP16. -@pytest.mark.sequential -@pytest.mark.parametrize('batch_size, hidden_size, seq_len, heads, num_layers, is_preln, use_fp16', - [ - (64,160,128,2,24,False,True), - #(8,2048,2048,32,1,True,True), - (8,160,128,2,3,True,True), - (8,160,128,2,3,False,True), - (8,1600,128,2,3,True,True), - (8,1600,128,25,3,True,True), - (8,1600,128,25,3,False,True), - (8,256,52,4,3,True,True), - (3,1024,51,16,3,True,False), - (3,1024,54,16,3,True,True), - (8,1024,381,16,3,True,False), - (8,1024,384,16,3,True,True), - (8,1024,384,16,3,True,True), - (8,1024,119,16,3,True,False), - (8,1024,120,16,3,True,True), - (8,1024,509,16,3,True,False), - (8,1024,512,16,3,True,True), - (64,1024,56,16,3,False,False), - (64,1024,53,16,3,False,True), - (64,1024,24,16,3,False,False), - (64,1024,21,16,3,False,True), - (8,1024,384,16,3,False,False), - (8,1024,384,16,3,False,True), - (8,1024,512,16,3,False,False), - (8,1024,511,16,3,False,True), - (8,1536,128,24,3,False,False), - (8,1536,128,24,3,False,True), - (8,2048,128,32,3,False,False), - (8,2048,128,32,3,False,True), - (8,2560,128,40,3,False,False), - (8,2560,128,40,3,False,True), - (8,128,128,2,3,True,False), - (8,128,128,2,3,True,True), - (8,4096,128,64,3,True,True), - (8,8192,128,64,3,False,True), - (1,256,2048,32,3,True,True), - ]) # yapf: disable -def test_forward(batch_size, - hidden_size, - seq_len, - heads, - num_layers, - is_preln, - use_fp16): - # Only run fp16 test cases on devices with 7+ capability. - major, _ = torch.cuda.get_device_capability() - if major < 7 and use_fp16 is True: - return - - ds_config = DeepSpeedTransformerConfig() - ds_config.layer_id = None - ds_config.batch_size = batch_size - ds_config.hidden_size = hidden_size - ds_config.intermediate_size = 4 * hidden_size - ds_config.heads = heads - ds_config.attn_dropout_ratio = 0.0 - ds_config.hidden_dropout_ratio = 0.0 - ds_config.num_hidden_layers = num_layers - ds_config.pre_layer_norm = is_preln - ds_config.initializer_range = 0.02 - ds_config.fp16 = use_fp16 - - run_forward(ds_config, seq_len, atol=3e-2) - - -@pytest.mark.parametrize('batch_size, small_bsz, hidden_size, seq_len, heads, num_layers, is_preln, use_fp16', - [ - (8,3,1024,512,16,3,True,False), - (8,7,1024,512,16,3,True,True), - (8,3,1024,512,16,3,False,False), - (8,7,1024,512,16,3,False,True), - ]) # yapf: disable -def test_forward_with_small_bsz(batch_size, - small_bsz, - hidden_size, - seq_len, - heads, - num_layers, - is_preln, - use_fp16): - # Only run fp16 test cases on devices with 7+ capability. 
- major, _ = torch.cuda.get_device_capability() - if major < 7 and use_fp16 is True: - return - - ds_config = DeepSpeedTransformerConfig() - ds_config.layer_id = None - ds_config.batch_size = batch_size - ds_config.hidden_size = hidden_size - ds_config.intermediate_size = 4 * hidden_size - ds_config.heads = heads - ds_config.attn_dropout_ratio = 0.0 - ds_config.hidden_dropout_ratio = 0.0 - ds_config.num_hidden_layers = num_layers - ds_config.pre_layer_norm = is_preln - ds_config.initializer_range = 0.02 - ds_config.fp16 = use_fp16 - - run_forward(ds_config, seq_len, atol=3e-2, test_bsz=small_bsz) - -@pytest.mark.parametrize('batch_size, hidden_size, seq_len, heads, num_layers, is_preln, use_fp16', - [ - #(64,1024,128,16,3,True,False), - #(64,1024,128,16,3,True,True), - #(64,1024,128,16,3,False,False), - #(64,1024,128,16,3,False,True), - ]) # yapf: disable -def test_forward_stochastic(batch_size, - hidden_size, - seq_len, - heads, - num_layers, - is_preln, - use_fp16): - # Only run fp16 test cases on devices with 7+ capability. - major, _ = torch.cuda.get_device_capability() - if major < 7 and use_fp16 is True: - return - - ds_config = DeepSpeedTransformerConfig() - ds_config.layer_id = None - ds_config.batch_size = batch_size - ds_config.hidden_size = hidden_size - ds_config.intermediate_size = 4 * hidden_size - ds_config.heads = heads - ds_config.attn_dropout_ratio = 0.0 - ds_config.hidden_dropout_ratio = 0.0 - ds_config.num_hidden_layers = num_layers - ds_config.pre_layer_norm = is_preln - ds_config.initializer_range = 0.02 - ds_config.fp16 = use_fp16 - ds_config.stochastic_mode = True - - run_forward(ds_config, seq_len, atol=7e-2) diff --git a/tests/unit/test_curriculum_learning.py b/tests/unit/test_curriculum_learning.py deleted file mode 100644 index 3677b5966781a78b2f7fc86451299574fe519a5f..0000000000000000000000000000000000000000 --- a/tests/unit/test_curriculum_learning.py +++ /dev/null @@ -1,133 +0,0 @@ -import torch -import torch.distributed as dist -import deepspeed -import argparse -import pytest -import json -import os -import numpy as np -import time -from .common import distributed_test -from .simple_model import Curriculum_SimpleModel, random_dataloader, args_from_dict - - -def test_curriculum_scheduler_fixed_discrete(tmpdir): - config_dict = { - "train_batch_size": 2, - "steps_per_print": 1, - "optimizer": { - "type": "Adam", - "params": { - "lr": 0.00015, - "weight_decay": 0.01 - } - }, - "gradient_clipping": 1.0, - "fp16": { - "enabled": True, - "loss_scale": 0, - "initial_scale_power": 16 - }, - "curriculum_learning": { - "enabled": True, - "curriculum_type": "seqlen", - "min_difficulty": 1, - "max_difficulty": 5, - "schedule_type": "fixed_discrete", - "schedule_config": { - "difficulty": [1, - 2, - 3, - 4, - 5], - "max_step": [2, - 4, - 6, - 8] - } - } - } - args = args_from_dict(tmpdir, config_dict) - hidden_dim = 10 - ground_truths = {1: 1, 2: 1, 3: 2, 4: 2, 5: 3, 6: 3, 7: 4, 8: 4} - model = Curriculum_SimpleModel(hidden_dim) - - @distributed_test(world_size=[1, 2]) - def _test_curriculum_scheduler_fixed_discrete(args, model, hidden_dim): - model, _, _, _ = deepspeed.initialize(args=args, - model=model, - model_parameters=model.parameters()) - data_loader = random_dataloader(model=model, - total_samples=20, - hidden_dim=hidden_dim, - device=model.device) - for n, batch in enumerate(data_loader): - loss, seqlen = model(batch[0], batch[1]) - model.backward(loss) - model.step() - true_seqlen = 5 - if n + 1 in ground_truths: - true_seqlen = ground_truths[n + 1] - 
print('at step {} the seqlen is {}'.format(n + 1, seqlen)) - assert seqlen == true_seqlen, f"Incorrect curriculum schedule" - - _test_curriculum_scheduler_fixed_discrete(args=args, - model=model, - hidden_dim=hidden_dim) - - -def test_curriculum_scheduler_fixed_linear(tmpdir): - config_dict = { - "train_batch_size": 2, - "steps_per_print": 1, - "optimizer": { - "type": "Adam", - "params": { - "lr": 0.00015, - "weight_decay": 0.01 - } - }, - "gradient_clipping": 1.0, - "fp16": { - "enabled": True, - "loss_scale": 0, - "initial_scale_power": 16 - }, - "curriculum_learning": { - "enabled": True, - "curriculum_type": "seqlen", - "min_difficulty": 2, - "max_difficulty": 10, - "schedule_type": "fixed_linear", - "schedule_config": { - "total_curriculum_step": 8, - "difficulty_step": 2 - } - } - } - args = args_from_dict(tmpdir, config_dict) - hidden_dim = 10 - ground_truths = {1: 2, 2: 4, 3: 4, 4: 6, 5: 6, 6: 8, 7: 8, 8: 10, 9: 10, 10: 10} - model = Curriculum_SimpleModel(hidden_dim) - - @distributed_test(world_size=[1, 2]) - def _test_curriculum_scheduler_fixed_linear(args, model, hidden_dim): - model, _, _, _ = deepspeed.initialize(args=args, - model=model, - model_parameters=model.parameters()) - data_loader = random_dataloader(model=model, - total_samples=20, - hidden_dim=hidden_dim, - device=model.device) - for n, batch in enumerate(data_loader): - loss, seqlen = model(batch[0], batch[1]) - model.backward(loss) - model.step() - if n + 1 in ground_truths: - true_seqlen = ground_truths[n + 1] - print('at step {} the seqlen is {}'.format(n + 1, seqlen)) - assert seqlen == true_seqlen, f"Incorrect curriculum schedule" - - _test_curriculum_scheduler_fixed_linear(args=args, - model=model, - hidden_dim=hidden_dim) diff --git a/tests/unit/test_data.py b/tests/unit/test_data.py deleted file mode 100644 index 93510e5574508c5ce6d191dc3ce8710d4c963bc8..0000000000000000000000000000000000000000 --- a/tests/unit/test_data.py +++ /dev/null @@ -1,58 +0,0 @@ -from deepspeed.utils import RepeatingLoader -import torch -import pytest -import deepspeed -from .common import distributed_test -from .simple_model import SimpleModel, args_from_dict, random_dataset - - -def test_repeating_loader(): - loader = [1, 2, 3] - loader = RepeatingLoader(loader) - - for idx in range(50): - assert next(loader) == 1 - assert next(loader) == 2 - assert next(loader) == 3 - - -@pytest.mark.parametrize('train_batch_size, drop_last', - [(1, - True), - (4, - True), - (1, - False), - (4, - False)]) -def test_dataloader_drop_last(tmpdir, train_batch_size, drop_last): - config_dict = { - "train_batch_size": train_batch_size, - "dataloader_drop_last": drop_last, - "steps_per_print": 1 - } - args = args_from_dict(tmpdir, config_dict) - hidden_dim = 10 - - model = SimpleModel(hidden_dim) - - @distributed_test(world_size=[1]) - def _test_dataloader_drop_last(args, model, hidden_dim): - optimizer = torch.optim.AdamW(params=model.parameters()) - #TODO: Figure out why this breaks with cuda device - train_dataset = random_dataset(total_samples=50, - hidden_dim=hidden_dim, - device=torch.device('cpu'), - dtype=torch.float32) - model, _, training_dataloader, _ = deepspeed.initialize(args=args, - model=model, - training_data=train_dataset, - optimizer=optimizer) - for n, batch in enumerate(training_dataloader): - x = batch[0].to(torch.cuda.current_device()) - y = batch[1].to(torch.cuda.current_device()) - loss = model(x, y) - model.backward(loss) - model.step() - - _test_dataloader_drop_last(args=args, model=model, hidden_dim=hidden_dim) diff --git 
a/tests/unit/test_dist.py b/tests/unit/test_dist.py deleted file mode 100644 index d37133603ce46a6977e73dd8be2b215c45dd7763..0000000000000000000000000000000000000000 --- a/tests/unit/test_dist.py +++ /dev/null @@ -1,38 +0,0 @@ -import torch -import torch.distributed as dist - -from .common import distributed_test - -import pytest - - -@distributed_test(world_size=3) -def test_init(): - assert dist.is_initialized() - assert dist.get_world_size() == 3 - assert dist.get_rank() < 3 - - -# Demonstration of pytest's parameterization -@pytest.mark.parametrize('number,color', [(1138, 'purple')]) -def test_dist_args(number, color): - """Outer test function with inputs from pytest.mark.parametrize(). Uses a distributed - helper function. - """ - @distributed_test(world_size=2) - def _test_dist_args_helper(x, color='red'): - assert dist.get_world_size() == 2 - assert x == 1138 - assert color == 'purple' - - """Ensure that we can parse args to distributed_test decorated functions. """ - _test_dist_args_helper(number, color=color) - - -@distributed_test(world_size=[1, 2, 4]) -def test_dist_allreduce(): - x = torch.ones(1, 3).cuda() * (dist.get_rank() + 1) - sum_of_ranks = (dist.get_world_size() * (dist.get_world_size() + 1)) // 2 - result = torch.ones(1, 3).cuda() * sum_of_ranks - dist.all_reduce(x) - assert torch.all(x == result) diff --git a/tests/unit/test_ds_arguments.py b/tests/unit/test_ds_arguments.py deleted file mode 100644 index a88a45d1bf46e3a96bd707a72bcc57549948ee86..0000000000000000000000000000000000000000 --- a/tests/unit/test_ds_arguments.py +++ /dev/null @@ -1,100 +0,0 @@ -import argparse -import pytest -import deepspeed - - -def basic_parser(): - parser = argparse.ArgumentParser() - parser.add_argument('--num_epochs', type=int) - return parser - - -def test_no_ds_arguments_no_ds_parser(): - parser = basic_parser() - args = parser.parse_args(['--num_epochs', '2']) - assert args - - assert hasattr(args, 'num_epochs') - assert args.num_epochs == 2 - - assert not hasattr(args, 'deepspeed') - assert not hasattr(args, 'deepspeed_config') - - -def test_no_ds_arguments(): - parser = basic_parser() - parser = deepspeed.add_config_arguments(parser) - args = parser.parse_args(['--num_epochs', '2']) - assert args - - assert hasattr(args, 'num_epochs') - assert args.num_epochs == 2 - - assert hasattr(args, 'deepspeed') - assert args.deepspeed == False - - assert hasattr(args, 'deepspeed_config') - assert args.deepspeed_config == None - - -def test_no_ds_enable_argument(): - parser = basic_parser() - parser = deepspeed.add_config_arguments(parser) - args = parser.parse_args(['--num_epochs', '2', '--deepspeed_config', 'foo.json']) - assert args - - assert hasattr(args, 'num_epochs') - assert args.num_epochs == 2 - - assert hasattr(args, 'deepspeed') - assert args.deepspeed == False - - assert hasattr(args, 'deepspeed_config') - assert type(args.deepspeed_config) == str - assert args.deepspeed_config == 'foo.json' - - -def test_no_ds_config_argument(): - parser = basic_parser() - parser = deepspeed.add_config_arguments(parser) - args = parser.parse_args(['--num_epochs', '2', '--deepspeed']) - assert args - - assert hasattr(args, 'num_epochs') - assert args.num_epochs == 2 - - assert hasattr(args, 'deepspeed') - assert type(args.deepspeed) == bool - assert args.deepspeed == True - - assert hasattr(args, 'deepspeed_config') - assert args.deepspeed_config == None - - -def test_no_ds_parser(): - parser = basic_parser() - with pytest.raises(SystemExit): - args = parser.parse_args(['--num_epochs', '2', 
'--deepspeed']) - - -def test_core_deepscale_arguments(): - parser = basic_parser() - parser = deepspeed.add_config_arguments(parser) - args = parser.parse_args( - ['--num_epochs', - '2', - '--deepspeed', - '--deepspeed_config', - 'foo.json']) - assert args - - assert hasattr(args, 'num_epochs') - assert args.num_epochs == 2 - - assert hasattr(args, 'deepspeed') - assert type(args.deepspeed) == bool - assert args.deepspeed == True - - assert hasattr(args, 'deepspeed_config') - assert type(args.deepspeed_config) == str - assert args.deepspeed_config == 'foo.json' diff --git a/tests/unit/test_ds_config.py b/tests/unit/test_ds_config.py deleted file mode 100644 index 728a46bbbb1b266eabcdb712642895dc5fb3d471..0000000000000000000000000000000000000000 --- a/tests/unit/test_ds_config.py +++ /dev/null @@ -1,35 +0,0 @@ -import pytest -import os -import json -from deepspeed.runtime import config as ds_config - - -def test_only_required_fields(tmpdir): - '''Ensure that config containing only the required fields is accepted. ''' - cfg_json = tmpdir.mkdir('ds_config_unit_test').join('minimal.json') - - with open(cfg_json, 'w') as f: - required_fields = {'train_batch_size': 64} - json.dump(required_fields, f) - - run_cfg = ds_config.DeepSpeedConfig(cfg_json) - assert run_cfg is not None - assert run_cfg.train_batch_size == 64 - assert run_cfg.train_micro_batch_size_per_gpu == 64 - assert run_cfg.gradient_accumulation_steps == 1 - - -def test_config_duplicate_key(tmpdir): - config_dict = ''' - { - "train_batch_size": 24, - "train_batch_size": 24, - } - ''' - config_path = os.path.join(tmpdir, 'temp_config.json') - - with open(config_path, 'w') as jf: - jf.write("%s" % config_dict) - - with pytest.raises(ValueError): - run_cfg = ds_config.DeepSpeedConfig(config_path) diff --git a/tests/unit/test_ds_initialize.py b/tests/unit/test_ds_initialize.py deleted file mode 100644 index a9756af6220007b16e845b479a7dc9a8ddab96f6..0000000000000000000000000000000000000000 --- a/tests/unit/test_ds_initialize.py +++ /dev/null @@ -1,180 +0,0 @@ -import pytest -from typing import Callable -import torch -from torch.optim import Optimizer, Adam, AdamW -from torch.optim.lr_scheduler import _LRScheduler, LambdaLR - -from .simple_model import args_from_dict, SimpleModel, random_dataloader -from .common import distributed_test -from .util import required_torch_version - -import deepspeed -from deepspeed.ops.adam import FusedAdam -from deepspeed.runtime.lr_schedules import WARMUP_LR, WarmupLR -from deepspeed.runtime.config import ADAM_OPTIMIZER -from deepspeed.runtime.utils import see_memory_usage - - -@pytest.mark.parametrize('zero_stage,world_size', [(0, 1), (3, 1)]) -def test_no_optim(zero_stage, world_size): - if zero_stage == 3 and not required_torch_version(): - pytest.skip("zero-3 param offload requires at least torch 1.8") - - ds_config = { - 'train_batch_size': world_size, - 'fp16': { - 'enabled': True - }, - 'zero_optimization': { - "stage": zero_stage, - "offload_param": { - "device": "cpu" - } - } - } - # 20B test - #hidden_dim = 16 * 1024 - hidden_dim = 4 - - @distributed_test(world_size=[world_size]) - def _go(hidden_dim): - with deepspeed.zero.Init(enabled=zero_stage == 3, config_dict_or_path=ds_config): - model = SimpleModel(hidden_dim, nlayers=78) - print('total number of parameters:', - sum([p.numel() for p in model.parameters()])) - see_memory_usage('pre-init', force=True) - model, _, _, _ = deepspeed.initialize(model=model, config=ds_config) - see_memory_usage('post-init', force=True) - data_loader = 
random_dataloader(model=model, - total_samples=50, - hidden_dim=hidden_dim, - device=model.device, - dtype=torch.half) - print(f"optimizer={model.optimizer}") - for batch in data_loader: - model(batch[0], batch[1]) - see_memory_usage('post-fwds', force=True) - - _go(hidden_dim) - - -@pytest.mark.parametrize('optimizer_type', [None, Optimizer, Callable]) -def test_client_optimizer(tmpdir, optimizer_type): - def _optimizer_callable(params) -> Optimizer: - return AdamW(params=params) - - hidden_dim = 10 - model = SimpleModel(hidden_dim) - - config_dict = {'train_batch_size': 1} - if optimizer_type is None: - client_optimizer = None - config_dict['optimizer'] = {'type': ADAM_OPTIMIZER} - elif optimizer_type is Optimizer: - client_optimizer = Adam(model.parameters()) - else: - client_optimizer = _optimizer_callable - - args = args_from_dict(tmpdir, config_dict) - - @distributed_test(world_size=[1]) - def _test_client_optimizer(args, model, client_optimizer): - _, ds_optimizer, _, _ = deepspeed.initialize(args=args, - model=model, - model_parameters=list(model.parameters()), - optimizer=client_optimizer) - if client_optimizer is None: - assert isinstance(ds_optimizer, FusedAdam) - elif isinstance(client_optimizer, Optimizer): - assert ds_optimizer == client_optimizer - else: - assert isinstance(ds_optimizer, AdamW) - - _test_client_optimizer(args=args, model=model, client_optimizer=client_optimizer) - - -@pytest.mark.parametrize('scheduler_type, optimizer_type', - [(None, - None), - (None, - Optimizer), - (None, - Callable), - (_LRScheduler, - None), - (_LRScheduler, - Optimizer), - (_LRScheduler, - Callable), - (Callable, - None), - (Callable, - Optimizer), - (Callable, - Callable)]) -def test_client_lr_scheduler(tmpdir, scheduler_type, optimizer_type): - def _my_lambda(epoch): - return epoch // 10 - - def _optimizer_callable(params) -> Optimizer: - return torch.optim.AdamW(params=params) - - def _lr_scheduler_callable(optimizer) -> _LRScheduler: - return LambdaLR(optimizer, _my_lambda) - - hidden_dim = 10 - model = SimpleModel(hidden_dim) - - config_dict = {'train_batch_size': 1} - - client_optimizer = None - client_scheduler = None - - if optimizer_type is None: - config_dict['optimizer'] = {'type': ADAM_OPTIMIZER} - elif optimizer_type is Optimizer: - client_optimizer = torch.optim.Adam(model.parameters()) - else: - client_optimizer = _optimizer_callable - - if scheduler_type is None: - config_dict['scheduler'] = {'type': WARMUP_LR, 'params': {}} - elif scheduler_type == _LRScheduler: - if isinstance(client_optimizer, Optimizer): - client_scheduler = LambdaLR(client_optimizer, _my_lambda) - else: - # Verify invalid combination is correctly handled - client_scheduler = LambdaLR(torch.optim.Adam(model.parameters()), _my_lambda) - else: - client_scheduler = _lr_scheduler_callable - - args = args_from_dict(tmpdir, config_dict) - - @distributed_test(world_size=[1]) - def _test_client_lr_scheduler(args, model, optimizer, lr_scheduler): - if isinstance(lr_scheduler, - _LRScheduler) and not isinstance(optimizer, - Optimizer): - with pytest.raises(AssertionError): - _, _, _, _ = deepspeed.initialize(args=args, - model=model, - model_parameters=list(model.parameters()), - optimizer=optimizer, - lr_scheduler=lr_scheduler) - else: - _, _, _, ds_lr_scheduler = deepspeed.initialize(args=args, - model=model, - model_parameters=list(model.parameters()), - optimizer=optimizer, - lr_scheduler=lr_scheduler) - if lr_scheduler is None: - assert isinstance(ds_lr_scheduler, WarmupLR) - elif 
isinstance(lr_scheduler, _LRScheduler): - assert ds_lr_scheduler == lr_scheduler - else: - assert isinstance(ds_lr_scheduler, LambdaLR) - - _test_client_lr_scheduler(args=args, - model=model, - optimizer=client_optimizer, - lr_scheduler=client_scheduler) diff --git a/tests/unit/test_dynamic_loss_scale.py b/tests/unit/test_dynamic_loss_scale.py deleted file mode 100644 index 65a679d94de740adda2fb7972cac4ac3de93d8c0..0000000000000000000000000000000000000000 --- a/tests/unit/test_dynamic_loss_scale.py +++ /dev/null @@ -1,315 +0,0 @@ -import torch -import deepspeed -import argparse -import pytest -import json -import os -import numpy as np -from .common import distributed_test -from .simple_model import SimpleModel, args_from_dict - - -def run_model_step(model, gradient_list): - for value in gradient_list: - for p in model.parameters(): - p.grad = torch.empty_like(p, dtype=p.dtype) - p.grad.fill_(value) - model.step() - - -def test_fused_no_overflow(tmpdir): - config_dict = { - "train_batch_size": 1, - "steps_per_print": 1, - "optimizer": { - "type": "Adam", - "params": { - "lr": 0.00015 - } - }, - "fp16": { - "enabled": True, - "loss_scale": 0, - "initial_scale_power": 8, - "loss_scale_window": 2 - } - } - args = args_from_dict(tmpdir, config_dict) - - @distributed_test(world_size=1) - def _test_fused_no_overflow(args): - hidden_dim = 1 - model = SimpleModel(hidden_dim) - model, optim, _, _ = deepspeed.initialize(args=args, - model=model, - model_parameters=model.parameters()) - - expected_loss_scale = 2**8 - expected_scale_window = 2 - # Ensure the dynamic loss scaler is correctly configured. - assert optim.dynamic_loss_scale == True - assert optim.cur_scale == expected_loss_scale - assert optim.scale_window == expected_scale_window - - for i, value in enumerate(np.random.uniform(-0.1, 0.1, 10)): - run_model_step(model, [value]) - assert optim.cur_scale == expected_loss_scale - assert optim.cur_iter == (i + 1) - if optim.cur_iter % expected_scale_window == 0: - expected_loss_scale *= 2 - - _test_fused_no_overflow(args) - - -def test_fused_all_overflow(tmpdir): - config_dict = { - "train_batch_size": 1, - "steps_per_print": 1, - "optimizer": { - "type": "Adam", - "params": { - "lr": 0.00015 - } - }, - "fp16": { - "enabled": True, - "loss_scale": 0, - "initial_scale_power": 4, - "loss_scale_window": 2 - } - } - args = args_from_dict(tmpdir, config_dict) - - @distributed_test(world_size=1) - def _test_fused_all_overflow(args): - hidden_dim = 1 - model = SimpleModel(hidden_dim) - model, optim, _, _ = deepspeed.initialize(args=args, - model=model, - model_parameters=model.parameters()) - - expected_loss_scale = 2**4 - # Ensure the dynamic loss scaler is correctly configured. 
- assert optim.dynamic_loss_scale == True - assert optim.cur_scale == expected_loss_scale - - overflow_gradients = [float('inf'), float('-inf')] + [float('nan')] * 6 - for i, value in enumerate(overflow_gradients): - run_model_step(model, [value]) - expected_loss_scale = max(expected_loss_scale / 2, 1) - assert optim.cur_scale == expected_loss_scale - assert optim.cur_iter == (i + 1) - - _test_fused_all_overflow(args) - - -def test_fused_some_overflow(tmpdir): - config_dict = { - "train_batch_size": 1, - "steps_per_print": 1, - "optimizer": { - "type": "Adam", - "params": { - "lr": 0.00015 - } - }, - "fp16": { - "enabled": True, - "loss_scale": 0, - "initial_scale_power": 8, - "loss_scale_window": 2 - } - } - args = args_from_dict(tmpdir, config_dict) - - @distributed_test(world_size=1) - def _test_fused_some_overflow(args): - hidden_dim = 1 - model = SimpleModel(hidden_dim) - model, optim, _, _ = deepspeed.initialize(args=args, - model=model, - model_parameters=model.parameters()) - - expected_loss_scale = 2**8 - expected_scale_window = 2 - expected_iteration = 0 - # Ensure the dynamic loss scaler is correctly configured. - assert optim.dynamic_loss_scale == True - assert optim.cur_scale == expected_loss_scale - assert optim.scale_window == expected_scale_window - - # Run model with overflows to decrease scale - overflow_gradients = [float('inf'), float('nan')] - expected_iteration += len(overflow_gradients) - run_model_step(model, overflow_gradients) - expected_loss_scale /= (2**len(overflow_gradients)) - assert optim.cur_scale == expected_loss_scale - assert optim.cur_iter == expected_iteration - - # Run model scale_window + 1 times to increase scale once - normal_gradients = np.random.uniform(-0.1, 0.1, expected_scale_window + 1) - expected_iteration += len(normal_gradients) - run_model_step(model, normal_gradients) - expected_loss_scale *= 2 - assert optim.cur_scale == expected_loss_scale - assert optim.cur_iter == expected_iteration - - # Run model with overflows to decrease scale - overflow_gradients = [float('inf')] - expected_iteration += len(overflow_gradients) - run_model_step(model, overflow_gradients) - expected_loss_scale /= (2**len(overflow_gradients)) - assert optim.cur_scale == expected_loss_scale - assert optim.cur_iter == expected_iteration - - _test_fused_some_overflow(args) - - -def test_unfused_no_overflow(tmpdir): - config_dict = { - "train_batch_size": 1, - "steps_per_print": 1, - "optimizer": { - "type": "Lamb", - "params": { - "lr": 0.00015 - } - }, - "fp16": { - "enabled": True, - "loss_scale": 0, - "initial_scale_power": 8, - "loss_scale_window": 2 - } - } - args = args_from_dict(tmpdir, config_dict) - - @distributed_test(world_size=1) - def _test_unfused_no_overflow(args): - hidden_dim = 1 - model = SimpleModel(hidden_dim) - model, optim, _, _ = deepspeed.initialize(args=args, - model=model, - model_parameters=model.parameters()) - expected_loss_scale = 2**8 - expected_scale_window = 2 - # Ensure the dynamic loss scaler is correctly configured. 
- assert optim.dynamic_loss_scale == True - assert optim.cur_scale == expected_loss_scale - assert optim.scale_window == expected_scale_window - - for i, value in enumerate(np.random.uniform(-0.1, 0.1, 10)): - run_model_step(model, [value]) - assert optim.cur_scale == expected_loss_scale - assert optim.cur_iter == (i + 1) - if optim.cur_iter % expected_scale_window == 0: - expected_loss_scale *= 2 - - _test_unfused_no_overflow(args) - - -def test_unfused_all_overflow(tmpdir): - config_dict = { - "train_batch_size": 1, - "steps_per_print": 1, - "optimizer": { - "type": "Lamb", - "params": { - "lr": 0.00015 - } - }, - "fp16": { - "enabled": True, - "loss_scale": 0, - "initial_scale_power": 4, - "loss_scale_window": 2, - "min_loss_scale": 0.25 - } - } - args = args_from_dict(tmpdir, config_dict) - - @distributed_test(world_size=1) - def _test_unfused_all_overflow(args): - hidden_dim = 1 - model = SimpleModel(hidden_dim) - model, optim, _, _ = deepspeed.initialize(args=args, - model=model, - model_parameters=model.parameters()) - - expected_loss_scale = 2**4 - expected_min_loss_scale = 0.25 - # Ensure the dynamic loss scaler is correctly configured. - assert optim.dynamic_loss_scale == True - assert optim.cur_scale == expected_loss_scale - assert optim.min_loss_scale == expected_min_loss_scale - - overflow_gradients = [float('inf'), float('-inf')] + [float('nan')] * 6 - for i, value in enumerate(overflow_gradients): - run_model_step(model, [value]) - expected_loss_scale = max(expected_loss_scale / 2, expected_min_loss_scale) - assert optim.cur_scale == expected_loss_scale - assert optim.cur_iter == (i + 1) - - _test_unfused_all_overflow(args) - - -def test_unfused_some_overflow(tmpdir): - config_dict = { - "train_batch_size": 1, - "steps_per_print": 1, - "optimizer": { - "type": "Lamb", - "params": { - "lr": 0.00015 - } - }, - "fp16": { - "enabled": True, - "loss_scale": 0, - "initial_scale_power": 8, - "loss_scale_window": 2 - } - } - args = args_from_dict(tmpdir, config_dict) - - @distributed_test(world_size=1) - def _test_unfused_some_overflow(args): - hidden_dim = 1 - model = SimpleModel(hidden_dim) - model, optim, _, _ = deepspeed.initialize(args=args, - model=model, - model_parameters=model.parameters()) - - expected_loss_scale = 2**8 - expected_scale_window = 2 - expected_iteration = 0 - # Ensure the dynamic loss scaler is correctly configured. 
- assert optim.dynamic_loss_scale == True - assert optim.cur_scale == expected_loss_scale - assert optim.scale_window == expected_scale_window - - # Run model with overflows to decrease scale - overflow_gradients = [float('inf'), float('nan')] - expected_iteration += len(overflow_gradients) - run_model_step(model, overflow_gradients) - expected_loss_scale /= (2**len(overflow_gradients)) - assert optim.cur_scale == expected_loss_scale - assert optim.cur_iter == expected_iteration - - # Run model scale_window + 1 times to increase scale once - normal_gradients = np.random.uniform(-0.1, 0.1, expected_scale_window + 1) - expected_iteration += len(normal_gradients) - run_model_step(model, normal_gradients) - expected_loss_scale *= 2 - assert optim.cur_scale == expected_loss_scale - assert optim.cur_iter == expected_iteration - - # Run model with overflows to decrease scale - overflow_gradients = [float('inf')] - expected_iteration += len(overflow_gradients) - run_model_step(model, overflow_gradients) - expected_loss_scale /= (2**len(overflow_gradients)) - assert optim.cur_scale == expected_loss_scale - assert optim.cur_iter == expected_iteration - - _test_unfused_some_overflow(args) diff --git a/tests/unit/test_elastic.py b/tests/unit/test_elastic.py deleted file mode 100644 index 353d6def37ba40054d64e42e5219318f6958f396..0000000000000000000000000000000000000000 --- a/tests/unit/test_elastic.py +++ /dev/null @@ -1,270 +0,0 @@ -import pytest -import deepspeed -from .common import distributed_test -from deepspeed.git_version_info import version as ds_version -from .simple_model import SimpleModel, SimpleOptimizer, random_dataloader, args_from_dict - -base_ds_config = { - "elasticity": { - "enabled": True, - "max_train_batch_size": 10000, - "micro_batch_sizes": [8, - 12, - 16, - 17], - "min_gpus": 32, - "max_gpus": 1500, - "min_time": 20, - "version": 0.1 - } -} - - -def test_basic_10k(): - ds_config = base_ds_config.copy() - final_batch_size, valid_gpus = deepspeed.elasticity.compute_elastic_config( - ds_config=ds_config, - target_deepspeed_version=ds_version) - - for gpu_num in valid_gpus: - assert final_batch_size % gpu_num == 0, f"Batch {final_batch_size} is not divisible by GPU count {gpu_num}" - batch_per_gpu = final_batch_size // gpu_num - found_valid_mbsize = False - - for mb in ds_config['elasticity']['micro_batch_sizes']: - if batch_per_gpu % mb == 0: - found_valid_mb = True - break - assert found_valid_mb, "No valid mb found" - - assert len(valid_gpus) == 23 - assert final_batch_size == 9792 - - -def test_old_version(): - ds_config = base_ds_config.copy() - with pytest.raises(deepspeed.elasticity.config.ElasticityError): - final_batch_size, valid_gpus = deepspeed.elasticity.compute_elastic_config( - ds_config=ds_config, - target_deepspeed_version="0.2") - - -def test_disabled(): - ds_config = base_ds_config.copy() - ds_config['elasticity']['enabled'] = False - with pytest.raises(deepspeed.elasticity.config.ElasticityError): - final_batch_size, valid_gpus = deepspeed.elasticity.compute_elastic_config( - ds_config=ds_config, - target_deepspeed_version=ds_version) - - -def test_valid_world_size(): - ds_config = base_ds_config.copy() - final_batch_size, valid_gpus, mbsize = deepspeed.elasticity.compute_elastic_config( - ds_config=ds_config, - target_deepspeed_version=ds_version, - world_size=64) - assert mbsize == 17 - - -def test_invalid_world_size(): - ds_config = base_ds_config.copy() - with pytest.raises(deepspeed.elasticity.config.ElasticityIncompatibleWorldSize): - final_batch_size, 
valid_gpus, mbsize = deepspeed.elasticity.compute_elastic_config( - ds_config=ds_config, - target_deepspeed_version=ds_version, - world_size=128) - - -def test_future_elastic_version(): - ds_config = base_ds_config.copy() - ds_config['elasticity']['version'] = '0.2' - with pytest.raises(deepspeed.elasticity.config.ElasticityError): - deepspeed.elasticity.compute_elastic_config(ds_config=ds_config, - target_deepspeed_version=ds_version) - - -def test_missing_max_batch(): - ds_config = base_ds_config.copy() - del ds_config['elasticity']['max_train_batch_size'] - with pytest.raises(deepspeed.elasticity.config.ElasticityError): - deepspeed.elasticity.compute_elastic_config(ds_config=ds_config, - target_deepspeed_version=ds_version) - - -def test_missing_micro_batch(): - ds_config = base_ds_config.copy() - del ds_config['elasticity']['micro_batch_sizes'] - with pytest.raises(deepspeed.elasticity.config.ElasticityError): - deepspeed.elasticity.compute_elastic_config(ds_config=ds_config, - target_deepspeed_version=ds_version) - - -def test_empty_config(): - ds_config = {"elasticity": {"enabled": True}} - with pytest.raises(deepspeed.elasticity.config.ElasticityError): - deepspeed.elasticity.compute_elastic_config(ds_config=ds_config, - target_deepspeed_version=ds_version) - - -@pytest.mark.parametrize('key, value', - [('micro_batch_sizes', - [1, - 4, - -1, - 2, - -10]), - ('min_gpus', - -1), - ('max_gpus', - -1), - ('micro_batch_sizes', - 5), - ('micro_batch_sizes', - ['a', - None, - 0.5]), - ('micro_batch_sizes', - [2, - 0.5, - 4])]) -def test_invalid_config_values(key, value): - ds_config = base_ds_config.copy() - ds_config['elasticity'][key] = value - with pytest.raises(deepspeed.elasticity.config.ElasticityError): - deepspeed.elasticity.compute_elastic_config(ds_config=ds_config, - target_deepspeed_version=ds_version) - - -def test_proper_mbsz(): - ds_config = base_ds_config.copy() - ds_config["elasticity"]["max_train_batch_size"] = 32 - ds_config["elasticity"]["micro_batch_sizes"] = [1, 2, 3, 7] - ds_config["elasticity"]["min_gpus"] = 1 - final_batch_size, valid_gpus, mbsize = deepspeed.elasticity.compute_elastic_config( - ds_config=ds_config, - target_deepspeed_version=ds_version, - world_size=7) - assert mbsize == 3 - - -def test_non_elastic_batch_params(tmpdir): - config_dict = { - "train_batch_size": 2, - "steps_per_print": 1, - "optimizer": { - "type": "Lamb", - "params": { - "lr": 0.00015 - } - }, - "gradient_clipping": 1.0, - "elasticity": { - "enabled": True, - "max_train_batch_size": 4, - "micro_batch_sizes": [1, - 2, - 3, - 4], - "min_gpus": 1, - "max_gpus": 4, - "min_time": 20, - "version": 0.1 - } - } - args = args_from_dict(tmpdir, config_dict) - hidden_dim = 10 - - model = SimpleModel(hidden_dim, empty_grad=False) - - @distributed_test(world_size=[1, 2]) - def _test_elastic(args, model, hidden_dim): - with pytest.raises(deepspeed.elasticity.config.ElasticityError): - model, _, _,_ = deepspeed.initialize(args=args, - model=model, - model_parameters=model.parameters()) - - _test_elastic(args=args, model=model, hidden_dim=hidden_dim) - - -def test_non_elastic_batch_params_w_override(tmpdir): - config_dict = { - "train_batch_size": 2, - "steps_per_print": 1, - "optimizer": { - "type": "Lamb", - "params": { - "lr": 0.00015 - } - }, - "gradient_clipping": 1.0, - "elasticity": { - "enabled": True, - "max_train_batch_size": 4, - "micro_batch_sizes": [1, - 2, - 3, - 4], - "min_gpus": 1, - "max_gpus": 4, - "min_time": 20, - "version": 0.1, - "ignore_non_elastic_batch_info": True - } - } 
- args = args_from_dict(tmpdir, config_dict) - hidden_dim = 10 - - model = SimpleModel(hidden_dim, empty_grad=False) - - @distributed_test(world_size=[1, 2]) - def _test_elastic(args, model, hidden_dim): - model, _, _,_ = deepspeed.initialize(args=args, - model=model, - model_parameters=model.parameters()) - - _test_elastic(args=args, model=model, hidden_dim=hidden_dim) - - -def test_elastic_config_changed(tmpdir): - config_dict = { - "train_batch_size": 2, - "steps_per_print": 1, - "optimizer": { - "type": "Lamb", - "params": { - "lr": 0.00015 - } - }, - "gradient_clipping": 1.0, - "elasticity": { - "enabled": True, - "max_train_batch_size": 4, - "micro_batch_sizes": [1, - 2, - 3, - 4], - "min_gpus": 1, - "max_gpus": 4, - "min_time": 20, - "version": 0.1, - "ignore_non_elastic_batch_info": True - } - } - import json, os - scheduler_elastic_config = config_dict.copy() - scheduler_elastic_config["elasticity"]["max_train_batch_size"] = 27 - os.environ['DEEPSPEED_ELASTICITY_CONFIG'] = json.dumps(scheduler_elastic_config) - args = args_from_dict(tmpdir, config_dict) - hidden_dim = 10 - - model = SimpleModel(hidden_dim, empty_grad=False) - - @distributed_test(world_size=[1, 2]) - def _test_elastic(args, model, hidden_dim): - with pytest.raises(deepspeed.elasticity.config.ElasticityError): - model, _, _,_ = deepspeed.initialize(args=args, - model=model, - model_parameters=model.parameters()) - - _test_elastic(args=args, model=model, hidden_dim=hidden_dim) diff --git a/tests/unit/test_flops_profiler.py b/tests/unit/test_flops_profiler.py deleted file mode 100644 index 173fa7eed09c403481a1a1afa6ec471fea5ccbb2..0000000000000000000000000000000000000000 --- a/tests/unit/test_flops_profiler.py +++ /dev/null @@ -1,130 +0,0 @@ -import torch -import pytest -import deepspeed -import deepspeed.runtime.utils as ds_utils -from deepspeed.profiling.flops_profiler import FlopsProfiler, get_model_profile -from .simple_model import SimpleModel, SimpleOptimizer, random_dataloader, args_from_dict -from .common import distributed_test - -TORCH_MAJOR = int(torch.__version__.split('.')[0]) -TORCH_MINOR = int(torch.__version__.split('.')[1]) -pytestmark = pytest.mark.skipif(TORCH_MAJOR < 1 - or (TORCH_MAJOR == 1 and TORCH_MINOR < 3), - reason='requires Pytorch version 1.3 or above') - - -def within_range(val, target, tolerance): - return abs(val - target) / target < tolerance - - -TOLERANCE = 0.05 - - -def test_flops_profiler_in_ds_training(tmpdir): - config_dict = { - "train_batch_size": 1, - "steps_per_print": 1, - "optimizer": { - "type": "Adam", - "params": { - "lr": 0.001, - } - }, - "zero_optimization": { - "stage": 0 - }, - "fp16": { - "enabled": True, - }, - "flops_profiler": { - "enabled": True, - "step": 1, - "module_depth": -1, - "top_modules": 3, - }, - } - args = args_from_dict(tmpdir, config_dict) - hidden_dim = 10 - model = SimpleModel(hidden_dim, empty_grad=False) - - @distributed_test(world_size=[1]) - def _test_flops_profiler_in_ds_training(args, model, hidden_dim): - model, _, _, _ = deepspeed.initialize(args=args, - model=model, - model_parameters=model.parameters()) - - data_loader = random_dataloader(model=model, - total_samples=50, - hidden_dim=hidden_dim, - device=model.device, - dtype=torch.half) - for n, batch in enumerate(data_loader): - loss = model(batch[0], batch[1]) - model.backward(loss) - model.step() - if n == 3: break - assert within_range(model.flops_profiler.flops, 200, tolerance=TOLERANCE) - assert model.flops_profiler.params == 110 - - _test_flops_profiler_in_ds_training(args, 
model, hidden_dim) - - -class LeNet5(torch.nn.Module): - def __init__(self, n_classes): - super(LeNet5, self).__init__() - - self.feature_extractor = torch.nn.Sequential( - torch.nn.Conv2d(in_channels=1, - out_channels=6, - kernel_size=5, - stride=1), - torch.nn.Tanh(), - torch.nn.AvgPool2d(kernel_size=2), - torch.nn.Conv2d(in_channels=6, - out_channels=16, - kernel_size=5, - stride=1), - torch.nn.Tanh(), - torch.nn.AvgPool2d(kernel_size=2), - torch.nn.Conv2d(in_channels=16, - out_channels=120, - kernel_size=5, - stride=1), - torch.nn.Tanh(), - ) - - self.classifier = torch.nn.Sequential( - torch.nn.Linear(in_features=120, - out_features=84), - torch.nn.Tanh(), - torch.nn.Linear(in_features=84, - out_features=n_classes), - ) - - def forward(self, x): - x = self.feature_extractor(x) - x = torch.flatten(x, 1) - logits = self.classifier(x) - probs = torch.nn.functional.softmax(logits, dim=1) - return logits, probs - - -def test_flops_profiler_in_inference(): - mod = LeNet5(10) - batch_size = 1024 - input = torch.randn(batch_size, 1, 32, 32) - flops, macs, params = get_model_profile( - mod, - tuple(input.shape), - print_profile=True, - detailed=True, - module_depth=-1, - top_modules=3, - warm_up=1, - as_string=False, - ignore_modules=None, - ) - print(flops, macs, params) - assert within_range(flops, 866076672, TOLERANCE) - assert within_range(macs, 426516480, TOLERANCE) - assert params == 61706 diff --git a/tests/unit/test_fp16.py b/tests/unit/test_fp16.py deleted file mode 100644 index 0cd258f590d1d6e0974ed96994a3db72dc658540..0000000000000000000000000000000000000000 --- a/tests/unit/test_fp16.py +++ /dev/null @@ -1,1049 +0,0 @@ -import math -from deepspeed.utils import groups -import torch -import torch.distributed as dist -import deepspeed -import argparse -import pytest -import json -import os -from deepspeed.ops.adam import FusedAdam -from .common import distributed_test -from deepspeed.ops.op_builder import CPUAdamBuilder -from .simple_model import SimpleModel, SimpleOptimizer, random_dataloader, args_from_dict, create_deepspeed_args, SimpleMoEModel, sequence_dataloader -from .util import required_torch_version - -try: - from apex import amp - _amp_available = True -except ImportError: - _amp_available = False -amp_available = pytest.mark.skipif(not _amp_available, - reason="apex/amp is not installed") - - -def test_lamb_fp32_grad_clip(tmpdir): - config_dict = { - "train_batch_size": 2, - "steps_per_print": 1, - "optimizer": { - "type": "Lamb", - "params": { - "lr": 0.00015 - } - }, - "gradient_clipping": 1.0 - } - args = args_from_dict(tmpdir, config_dict) - hidden_dim = 10 - - model = SimpleModel(hidden_dim) - - @distributed_test(world_size=[1, 2]) - def _test_lamb_fp32_grad_clip(args, model, hidden_dim): - model, _, _, _ = deepspeed.initialize(args=args, - model=model, - model_parameters=model.parameters()) - data_loader = random_dataloader(model=model, - total_samples=50, - hidden_dim=hidden_dim, - device=model.device, - dtype=torch.float) - for n, batch in enumerate(data_loader): - loss = model(batch[0], batch[1]) - model.backward(loss) - model.step() - - _test_lamb_fp32_grad_clip(args=args, model=model, hidden_dim=hidden_dim) - - -def test_lamb_fp16_basic(tmpdir): - config_dict = { - "train_batch_size": 2, - "steps_per_print": 1, - "optimizer": { - "type": "Lamb", - "params": { - "lr": 0.00015 - } - }, - "gradient_clipping": 1.0, - "fp16": { - "enabled": True - } - } - args = args_from_dict(tmpdir, config_dict) - hidden_dim = 10 - - model = SimpleModel(hidden_dim) - - 
@distributed_test(world_size=[1, 2]) - def _test_lamb_fp16_basic(args, model, hidden_dim): - model, _, _, _ = deepspeed.initialize(args=args, - model=model, - model_parameters=model.parameters()) - data_loader = random_dataloader(model=model, - total_samples=50, - hidden_dim=hidden_dim, - device=model.device) - for n, batch in enumerate(data_loader): - loss = model(batch[0], batch[1]) - model.backward(loss) - model.step() - - _test_lamb_fp16_basic(args=args, model=model, hidden_dim=hidden_dim) - - -def test_lamb_fp16_empty_grad(tmpdir): - config_dict = { - "train_batch_size": 2, - "steps_per_print": 1, - "optimizer": { - "type": "Lamb", - "params": { - "lr": 0.00015 - } - }, - "gradient_clipping": 1.0, - "fp16": { - "enabled": True - } - } - args = args_from_dict(tmpdir, config_dict) - hidden_dim = 10 - - model = SimpleModel(hidden_dim, empty_grad=True) - - @distributed_test(world_size=[2]) - def _test_lamb_fp16_empty_grad(args, model, hidden_dim): - model, _, _, _ = deepspeed.initialize(args=args, - model=model, - model_parameters=model.parameters()) - data_loader = random_dataloader(model=model, - total_samples=50, - hidden_dim=hidden_dim, - device=model.device) - for n, batch in enumerate(data_loader): - loss = model(batch[0], batch[1]) - model.backward(loss) - model.step() - - _test_lamb_fp16_empty_grad(args=args, model=model, hidden_dim=hidden_dim) - - -def test_adam_fp32_empty_grad(tmpdir): - config_dict = { - "train_batch_size": 2, - "steps_per_print": 1, - "optimizer": { - "type": "Adam", - "params": { - "lr": 0.00015 - } - }, - "gradient_clipping": 1.0, - "fp16": { - "enabled": False - } - } - args = args_from_dict(tmpdir, config_dict) - hidden_dim = 10 - - model = SimpleModel(hidden_dim, empty_grad=True) - - @distributed_test(world_size=[2]) - def _test_adam_fp32_empty_grad(args, model, hidden_dim): - model, _, _, _ = deepspeed.initialize(args=args, - model=model, - model_parameters=model.parameters()) - data_loader = random_dataloader(model=model, - total_samples=50, - hidden_dim=hidden_dim, - device=model.device, - dtype=torch.float) - for n, batch in enumerate(data_loader): - loss = model(batch[0], batch[1]) - model.backward(loss) - model.step() - - _test_adam_fp32_empty_grad(args=args, model=model, hidden_dim=hidden_dim) - - -def test_adamw_fp16_basic(tmpdir): - config_dict = { - "train_batch_size": 1, - "steps_per_print": 1, - "fp16": { - "enabled": True - } - } - args = args_from_dict(tmpdir, config_dict) - hidden_dim = 10 - - model = SimpleModel(hidden_dim) - - @distributed_test(world_size=[1]) - def _test_adamw_fp16_basic(args, model, hidden_dim): - optimizer = torch.optim.AdamW(params=model.parameters()) - model, _, _, _ = deepspeed.initialize(args=args, - model=model, - optimizer=optimizer) - data_loader = random_dataloader(model=model, - total_samples=50, - hidden_dim=hidden_dim, - device=model.device) - for n, batch in enumerate(data_loader): - loss = model(batch[0], batch[1]) - model.backward(loss) - model.step() - - _test_adamw_fp16_basic(args=args, model=model, hidden_dim=hidden_dim) - - -def test_unfused_fp16_optimizer_gradnorm_for_moe(tmpdir, monkeypatch): - if not required_torch_version(): - pytest.skip("DeepSpeed MoE tests need torch 1.8 or higher to run correctly") - - config_dict = { - "train_batch_size": 2, - "steps_per_print": 1, - "fp16": { - "enabled": True - } - } - args = args_from_dict(tmpdir, config_dict) - hidden_dim = 10 - - def mock_unscale_and_clip_grads(total_norm, apply_scale=True): - torch_norm_tensor = torch.cuda.FloatTensor([total_norm]) - 
all_gather_results = [ - torch.zeros_like(torch_norm_tensor) for _ in range(dist.get_world_size()) - ] - dist.all_gather(all_gather_results, torch_norm_tensor) - assert len(set([x.item() for x in all_gather_results])) == 1 - return 1.0 - - @distributed_test(world_size=[2]) - def _test_unfused_fp16_optimizer(args, hidden_dim): - # initialize MoE - model = SimpleMoEModel(hidden_dim, ep_size=2) - optimizer = torch.optim.AdamW(params=model.parameters()) - engine, optimizer, _, _ = deepspeed.initialize(args=args, - model=model, - optimizer=optimizer, - dist_init_required=False) - monkeypatch.setattr(optimizer, - 'unscale_and_clip_grads', - mock_unscale_and_clip_grads) - data_loader = sequence_dataloader(model=engine, - total_samples=50, - hidden_dim=hidden_dim, - device=engine.device) - for n, batch in enumerate(data_loader): - loss = engine(batch[0], batch[1]) - engine.backward(loss) - engine.step() - - _test_unfused_fp16_optimizer(args=args, hidden_dim=hidden_dim) - - -def test_fused_fp16_optimizer_gradnorm_for_moe(tmpdir, monkeypatch): - if not required_torch_version(): - pytest.skip("DeepSpeed MoE tests need torch 1.8 or higher to run correctly") - - config_dict = { - "train_batch_size": 2, - "steps_per_print": 1, - "fp16": { - "enabled": True - } - } - args = args_from_dict(tmpdir, config_dict) - hidden_dim = 10 - - def mock_unscale_and_clip_grads(grads_groups_flat, total_norm, apply_scale=True): - torch_norm_tensor = torch.cuda.FloatTensor([total_norm]) - all_gather_results = [ - torch.zeros_like(torch_norm_tensor) for _ in range(dist.get_world_size()) - ] - dist.all_gather(all_gather_results, torch_norm_tensor) - assert len(set([x.item() for x in all_gather_results])) == 1 - return 1.0 - - @distributed_test(world_size=[2]) - def _test_fused_fp16_optimizer(args, hidden_dim): - # initialize MoE - model = SimpleMoEModel(hidden_dim, ep_size=2) - # optimizer = torch.optim.AdamW(params=model.parameters()) - optimizer = FusedAdam(params=model.parameters()) - engine, optimizer, _, _ = deepspeed.initialize(args=args, - model=model, - optimizer=optimizer, - dist_init_required=False) - monkeypatch.setattr(optimizer, - 'unscale_and_clip_grads', - mock_unscale_and_clip_grads) - data_loader = sequence_dataloader(model=engine, - total_samples=50, - hidden_dim=hidden_dim, - device=engine.device) - for n, batch in enumerate(data_loader): - loss = engine(batch[0], batch[1]) - engine.backward(loss) - engine.step() - - _test_fused_fp16_optimizer(args=args, hidden_dim=hidden_dim) - - -@pytest.mark.parametrize("fused_lamb_legacy", [(False), (True)]) -def test_lamb_optimizer_gradnorm_for_moe(tmpdir, monkeypatch, fused_lamb_legacy: bool): - if not required_torch_version(): - pytest.skip("DeepSpeed MoE tests need torch 1.8 or higher to run correctly") - - config_dict = { - "train_batch_size": 2, - "steps_per_print": 1, - "fp16": { - "enabled": True - }, - "optimizer": { - "type": "Lamb", - "params": { - "lr": 0.00015 - } - } - } - args = args_from_dict(tmpdir, config_dict) - hidden_dim = 10 - - def mock_unscale_and_clip_grads(total_norm, apply_scale=True): - torch_norm_tensor = torch.cuda.FloatTensor([total_norm]) - all_gather_results = [ - torch.zeros_like(torch_norm_tensor) for _ in range(dist.get_world_size()) - ] - dist.all_gather(all_gather_results, torch_norm_tensor) - assert len(set([x.item() for x in all_gather_results])) == 1 - return 1.0 - - @distributed_test(world_size=[2]) - def _test_lamb_legacy_optimizer_step(args, hidden_dim, fused_lamb_legacy): - # initialize MoE - model = 
SimpleMoEModel(hidden_dim, ep_size=2) - engine, optimizer, _, _ = deepspeed.initialize(args=args, - model=model, - model_parameters=model.parameters(), - dist_init_required=False) - monkeypatch.setattr(optimizer, - 'unscale_and_clip_grads', - mock_unscale_and_clip_grads) - optimizer.fused_lamb_legacy = fused_lamb_legacy - data_loader = sequence_dataloader(model=engine, - total_samples=50, - hidden_dim=hidden_dim, - device=engine.device) - for n, batch in enumerate(data_loader): - loss = engine(batch[0], batch[1]) - engine.backward(loss) - engine.step() - - _test_lamb_legacy_optimizer_step(args=args, - hidden_dim=hidden_dim, - fused_lamb_legacy=fused_lamb_legacy) - - -def test_dict_config_adamw_fp16_basic(): - config = {"train_batch_size": 1, "steps_per_print": 1, "fp16": {"enabled": True}} - args = create_deepspeed_args() - hidden_dim = 10 - - model = SimpleModel(hidden_dim) - - @distributed_test(world_size=[1]) - def _test_adamw_fp16_basic(args, model, hidden_dim, config): - optimizer = torch.optim.AdamW(params=model.parameters()) - model, _, _, _ = deepspeed.initialize(args=args, - model=model, - optimizer=optimizer, - config=config) - data_loader = random_dataloader(model=model, - total_samples=50, - hidden_dim=hidden_dim, - device=model.device) - for n, batch in enumerate(data_loader): - loss = model(batch[0], batch[1]) - model.backward(loss) - model.step() - - _test_adamw_fp16_basic(args=args, model=model, hidden_dim=hidden_dim, config=config) - - -def test_adamw_fp16_empty_grad(tmpdir): - config_dict = { - "train_batch_size": 1, - "steps_per_print": 1, - "fp16": { - "enabled": True - } - } - args = args_from_dict(tmpdir, config_dict) - hidden_dim = 10 - - model = SimpleModel(hidden_dim) - - @distributed_test(world_size=[1]) - def _test_adamw_fp16_empty_grad(args, model, hidden_dim): - optimizer = torch.optim.AdamW(params=model.parameters()) - model, _, _, _ = deepspeed.initialize(args=args, - model=model, - optimizer=optimizer) - data_loader = random_dataloader(model=model, - total_samples=50, - hidden_dim=hidden_dim, - device=model.device) - for n, batch in enumerate(data_loader): - loss = model(batch[0], batch[1]) - model.backward(loss) - model.step() - - _test_adamw_fp16_empty_grad(args=args, model=model, hidden_dim=hidden_dim) - - -@pytest.mark.parametrize('zero_stage, use_cpu_offload', - [(1, - False), - (2, - False), - (2, - True), - (3, - False), - (3, - True)]) -def test_adam_fp16_zero_onecycle_compatibility(tmpdir, zero_stage, use_cpu_offload): - if use_cpu_offload and not deepspeed.ops.__compatible_ops__[CPUAdamBuilder.NAME]: - pytest.skip("cpu-adam is not compatible") - - config_dict = { - "train_batch_size": 1, - "steps_per_print": 1, - "optimizer": { - "type": "Adam", - "params": { - "lr": 0.00015 - } - }, - "scheduler": { - "type": "OneCycle", - "params": { - "cycle_first_step_size": 16000, - "cycle_first_stair_count": 8000, - "decay_step_size": 16000, - "cycle_min_lr": 1e-06, - "cycle_max_lr": 3e-05, - "decay_lr_rate": 1e-07, - "cycle_min_mom": 0.85, - "cycle_max_mom": 0.99, - "decay_mom_rate": 0.0 - } - }, - "fp16": { - "enabled": True - }, - "zero_optimization": { - "stage": zero_stage, - "cpu_offload": use_cpu_offload - } - } - - args = args_from_dict(tmpdir, config_dict) - hidden_dim = 10 - - @distributed_test(world_size=[1]) - def _test_adam_fp16_zero_onecycle_compatibility(args, zero_stage, hidden_dim): - model = SimpleModel(hidden_dim) - - model, _, _,_ = deepspeed.initialize(args=args, - model=model, - model_parameters=model.parameters()) - data_loader = 
random_dataloader(model=model, - total_samples=50, - hidden_dim=hidden_dim, - device=model.device) - for n, batch in enumerate(data_loader): - loss = model(batch[0], batch[1]) - model.backward(loss) - model.step() - - _test_adam_fp16_zero_onecycle_compatibility(args=args, - zero_stage=zero_stage, - hidden_dim=hidden_dim) - - -@pytest.mark.parametrize('zero_stage, use_cpu_offload', - [(1, - False), - (2, - False), - (2, - True), - (3, - False), - (3, - True)]) -def test_zero_static_scale(tmpdir, zero_stage, use_cpu_offload): - if use_cpu_offload and not deepspeed.ops.__compatible_ops__[CPUAdamBuilder.NAME]: - pytest.skip("cpu-adam is not compatible") - - config_dict = { - "train_batch_size": 4, - "steps_per_print": 1, - "optimizer": { - "type": "Adam", - "params": { - "lr": 0.00015 - } - }, - "fp16": { - "enabled": True, - "loss_scale": 138. - }, - "zero_optimization": { - "stage": zero_stage, - "cpu_offload": use_cpu_offload - } - } - args = args_from_dict(tmpdir, config_dict) - - @distributed_test(world_size=2) - def _test_zero_static_scale(args, zero_stage, hidden_dim): - #making hidden size not divisible by DP for covering this scenario - hidden_dim = hidden_dim - model = SimpleModel(hidden_dim) - - model, optim, _, _ = deepspeed.initialize(args=args, - model=model, - model_parameters=model.parameters()) - - # Ensure the static scaler is configured. - assert optim.dynamic_loss_scale == False - assert optim.loss_scaler.loss_scale == 138. - - # Now make sure things work.. - data_loader = random_dataloader(model=model, - total_samples=10, - hidden_dim=hidden_dim, - device=model.device) - for n, batch in enumerate(data_loader): - loss = model(batch[0], batch[1]) - model.backward(loss) - model.step() - - #test when hidden_dim is not aligned with world size - _test_zero_static_scale(args=args, zero_stage=zero_stage, hidden_dim=9) - #test when hidden_dim is aligned with world size - _test_zero_static_scale(args=args, zero_stage=zero_stage, hidden_dim=10) - - -def test_zero_static_scale_deprecated_format(tmpdir): - config_dict = { - "train_batch_size": 4, - "steps_per_print": 1, - "optimizer": { - "type": "Adam", - "params": { - "lr": 0.00015 - } - }, - "fp16": { - "enabled": True, - "loss_scale": 138. - }, - "zero_optimization": { - "stage": 1 - } - } - args = args_from_dict(tmpdir, config_dict) - - @distributed_test(world_size=2) - def _test_zero_static_scale(args): - hidden_dim = 10 - model = SimpleModel(hidden_dim) - model, optim, _, _ = deepspeed.initialize(args=args, - model=model, - model_parameters=model.parameters()) - - # Ensure the static scaler is configured. - assert optim.dynamic_loss_scale == False - assert optim.loss_scaler.loss_scale == 138. - - # Now make sure things work.. 
- data_loader = random_dataloader(model=model, - total_samples=10, - hidden_dim=hidden_dim, - device=model.device) - for n, batch in enumerate(data_loader): - loss = model(batch[0], batch[1]) - model.backward(loss) - model.step() - - _test_zero_static_scale(args) - - -@pytest.mark.parametrize('zero_stage, use_cpu_offload', - [(1, - False), - (2, - False), - (2, - True), - (3, - False), - (3, - True)]) -def test_zero_allow_untested_optimizer(tmpdir, zero_stage, use_cpu_offload): - if use_cpu_offload and not deepspeed.ops.__compatible_ops__[CPUAdamBuilder.NAME]: - pytest.skip("cpu-adam is not compatible") - - config_dict = { - "train_batch_size": 4, - "steps_per_print": 1, - "fp16": { - "enabled": True, - }, - "zero_optimization": { - "stage": zero_stage, - "cpu_offload": use_cpu_offload - }, - "zero_allow_untested_optimizer": False - } - args = args_from_dict(tmpdir, config_dict) - - @distributed_test(world_size=[1]) - def _test_zero_allow_untested_optimizer(args, zero_stage): - hidden_dim = 10 - model = SimpleModel(hidden_dim) - optimizer = SimpleOptimizer(model.parameters()) - with pytest.raises(AssertionError): - model, optim, _, _ = deepspeed.initialize(args=args, - model=model, - optimizer=optimizer, - model_parameters=model.parameters()) - - _test_zero_allow_untested_optimizer(args, zero_stage) - - -@pytest.mark.parametrize('zero_stage, use_cpu_offload', - [(1, - False), - (2, - False), - (2, - True), - (3, - False), - (3, - True)]) -def test_zero_empty_partition(tmpdir, zero_stage, use_cpu_offload): - if use_cpu_offload and not deepspeed.ops.__compatible_ops__[CPUAdamBuilder.NAME]: - pytest.skip("cpu-adam is not compatible") - - if zero_stage == 3: - pytest.skip("skip for now") - - config_dict = { - "train_micro_batch_size_per_gpu": 1, - "gradient_accumulation_steps": 1, - "fp16": { - "enabled": True, - "initial_scale_power": 8 - }, - "optimizer": { - "type": "Adam", - "params": { - "lr": 0.00015 - } - }, - "zero_optimization": { - "stage": zero_stage, - "cpu_offload": use_cpu_offload, - "reduce_bucket_size": 100, - "allgather_bucket_size": 100 - } - } - args = args_from_dict(tmpdir, config_dict) - - @distributed_test(world_size=[3]) - def _test_zero_empty_partition(args, zero_stage): - hidden_dim = 1 - model = SimpleModel(hidden_dim) - - # Ensure model has 2 parameters, to cause empty partition with DP=3 - assert len(list(model.parameters())) == 2 - model, _, _, _ = deepspeed.initialize(args=args, - model=model, - model_parameters=model.parameters()) - - # Now make sure things work.. 
- data_loader = random_dataloader(model=model, - total_samples=1, - hidden_dim=hidden_dim, - device=model.device) - for n, batch in enumerate(data_loader): - loss = model(batch[0], batch[1]) - model.backward(loss) - model.step() - - _test_zero_empty_partition(args=args, zero_stage=zero_stage) - - -@amp_available -def test_adam_amp_basic(tmpdir): - config_dict = {"train_batch_size": 1, "steps_per_print": 1, "amp": {"enabled": True}} - args = args_from_dict(tmpdir, config_dict) - hidden_dim = 10 - - model = SimpleModel(hidden_dim) - - @distributed_test(world_size=[1]) - def _test_adam_amp_basic(args, model, hidden_dim): - optimizer = torch.optim.Adam(params=model.parameters()) - model, _, _, _ = deepspeed.initialize(args=args, - model=model, - optimizer=optimizer) - data_loader = random_dataloader(model=model, - total_samples=50, - hidden_dim=hidden_dim, - device=model.device) - for n, batch in enumerate(data_loader): - loss = model(batch[0], batch[1]) - model.backward(loss) - model.step() - - _test_adam_amp_basic(args=args, model=model, hidden_dim=hidden_dim) - - -@amp_available -def test_lamb_amp_basic(tmpdir): - config_dict = { - "train_batch_size": 2, - "steps_per_print": 1, - "optimizer": { - "type": "Lamb", - "params": { - "lr": 0.00015 - } - }, - "gradient_clipping": 1.0, - "amp": { - "enabled": True, - } - } - args = args_from_dict(tmpdir, config_dict) - hidden_dim = 10 - - model = SimpleModel(hidden_dim) - - @distributed_test(world_size=[1, 2]) - def _test_lamb_amp_basic(args, model, hidden_dim): - model, _, _, _ = deepspeed.initialize(args=args, - model=model, - model_parameters=model.parameters()) - data_loader = random_dataloader(model=model, - total_samples=50, - hidden_dim=hidden_dim, - device=model.device) - for n, batch in enumerate(data_loader): - loss = model(batch[0], batch[1]) - model.backward(loss) - model.step() - - _test_lamb_amp_basic(args=args, model=model, hidden_dim=hidden_dim) - - -@amp_available -def test_adam_amp_o2(tmpdir): - config_dict = { - "train_batch_size": 2, - "steps_per_print": 1, - "optimizer": { - "type": "Adam", - "params": { - "lr": 0.00015 - } - }, - "gradient_clipping": 1.0, - "amp": { - "enabled": True, - "opt_level": "O2" - } - } - args = args_from_dict(tmpdir, config_dict) - hidden_dim = 10 - - model = SimpleModel(hidden_dim) - - @distributed_test(world_size=[1, 2]) - def _test_adam_amp_o2(args, model, hidden_dim): - model, _, _, _ = deepspeed.initialize(args=args, - model=model, - model_parameters=model.parameters()) - data_loader = random_dataloader(model=model, - total_samples=50, - hidden_dim=hidden_dim, - device=model.device) - for n, batch in enumerate(data_loader): - loss = model(batch[0], batch[1]) - model.backward(loss) - model.step() - - _test_adam_amp_o2(args=args, model=model, hidden_dim=hidden_dim) - - -@amp_available -def test_adam_amp_o2_empty_grad(tmpdir): - config_dict = { - "train_batch_size": 2, - "steps_per_print": 1, - "optimizer": { - "type": "Adam", - "params": { - "lr": 0.00015 - } - }, - "gradient_clipping": 1.0, - "amp": { - "enabled": True, - "opt_level": "O2" - } - } - args = args_from_dict(tmpdir, config_dict) - hidden_dim = 10 - - model = SimpleModel(hidden_dim) - - @distributed_test(world_size=[2]) - def _test_adam_amp_o2_empty_grad(args, model, hidden_dim): - model, _, _, _ = deepspeed.initialize(args=args, - model=model, - model_parameters=model.parameters()) - data_loader = random_dataloader(model=model, - total_samples=50, - hidden_dim=hidden_dim, - device=model.device) - for n, batch in 
enumerate(data_loader): - loss = model(batch[0], batch[1]) - model.backward(loss) - model.step() - - _test_adam_amp_o2_empty_grad(args=args, model=model, hidden_dim=hidden_dim) - - -@pytest.mark.parametrize('zero_stage, optimizer_constructor', - [(1, - FusedAdam), - (2, - torch.optim.Adam), - (2, - FusedAdam), - (3, - torch.optim.Adam), - (3, - FusedAdam)]) -def test_zero_supported_client_optimizer(tmpdir, zero_stage, optimizer_constructor): - config_dict = { - "train_batch_size": 2, - "steps_per_print": 1, - "fp16": { - "enabled": True - }, - "zero_optimization": { - "stage": zero_stage - } - } - args = args_from_dict(tmpdir, config_dict) - hidden_dim = 10 - - @distributed_test(world_size=[1]) - def _test_zero_supported_client_optimizer(args, zero_stage, optimizer_constructor): - model = SimpleModel(hidden_dim) - - client_optimizer = optimizer_constructor(params=model.parameters()) - model, _, _, _ = deepspeed.initialize(args=args, - model=model, - optimizer=client_optimizer) - - _test_zero_supported_client_optimizer(args=args, - zero_stage=zero_stage, - optimizer_constructor=optimizer_constructor) - - -def test_zero2_reduce_scatter_off(tmpdir): - config_dict = { - "train_batch_size": 2, - "steps_per_print": 1, - "optimizer": { - "type": "Adam", - "params": { - "lr": 0.00015 - } - }, - "gradient_clipping": 1.0, - "zero_optimization": { - "stage": 2, - "contiguous_gradients": True, - "allgather_bucket_size": 2000000000, - "reduce_bucket_size": 200000000, - "overlap_comm": False, - "reduce_scatter": False - }, - "fp16": { - "enabled": True - } - } - args = args_from_dict(tmpdir, config_dict) - hidden_dim = 10 - - model = SimpleModel(hidden_dim) - - @distributed_test(world_size=[2]) - def _helper(args, model, hidden_dim): - model, _, _, _ = deepspeed.initialize(args=args, - model=model, - model_parameters=model.parameters()) - data_loader = random_dataloader(model=model, - total_samples=50, - hidden_dim=hidden_dim, - device=model.device) - for n, batch in enumerate(data_loader): - loss = model(batch[0], batch[1]) - model.backward(loss) - model.step() - - _helper(args=args, model=model, hidden_dim=hidden_dim) - - -@pytest.mark.parametrize('adam_type, torch_impl', - [('Adam', - True), - ('Adam', - False), - ('AdamW', - True), - ('AdamW', - False)]) -def test_fp16_adam_types(tmpdir, adam_type, torch_impl): - config_dict = { - "train_batch_size": 1, - "steps_per_print": 1, - "fp16": { - "enabled": True, - "initial_scale_power": 10 - }, - "optimizer": { - "type": adam_type, - "torch_adam": torch_impl, - "params": { - "lr": 0.00015 - } - } - } - args = args_from_dict(tmpdir, config_dict) - hidden_dim = 10 - - model = SimpleModel(hidden_dim) - - @distributed_test(world_size=[1]) - def _test_fp16_adam_types(args, model, hidden_dim): - - model, _, _, _ = deepspeed.initialize(args=args, - model=model, - model_parameters=model.parameters()) - - data_loader = random_dataloader(model=model, - total_samples=10, - hidden_dim=hidden_dim, - device=model.device) - - for _, batch in enumerate(data_loader): - loss = model(batch[0], batch[1]) - model.backward(loss) - model.step() - - _test_fp16_adam_types(args=args, model=model, hidden_dim=hidden_dim) - - -def test_zero3_lazyscatter(tmpdir): - config_dict = { - "train_batch_size": 1, - "steps_per_print": 1, - "fp16": { - "enabled": True, - "initial_scale_power": 10 - }, - "optimizer": { - "type": "AdamW", - "params": { - "lr": 0.00015 - } - }, - "zero_optimization": { - "stage": 3 - } - } - args = args_from_dict(tmpdir, config_dict) - hidden_dim = 10 - - 
@distributed_test(world_size=[1]) - def _go(args): - model = SimpleModel(hidden_dim) - - model, _, _, _ = deepspeed.initialize(args=args, - model=model, - model_parameters=model.parameters()) - - data_loader = random_dataloader(model=model, - total_samples=10, - hidden_dim=hidden_dim, - device=model.device) - - for _, batch in enumerate(data_loader): - loss = model(batch[0], batch[1]) - model.backward(loss) - model.step() - - _go(args=args) - - -@pytest.mark.parametrize('stage', [1, 2, 3]) -def test_zero_empty_grad(tmpdir, stage): - config_dict = { - "train_batch_size": 1, - "steps_per_print": 1, - "fp16": { - "enabled": True - }, - "zero_optimization": { - "stage": stage - } - } - args = args_from_dict(tmpdir, config_dict) - hidden_dim = 10 - - model = SimpleModel(hidden_dim) - - @distributed_test(world_size=[1]) - def _go(args, model, hidden_dim): - optimizer = torch.optim.Adam(model.parameters()) - model, _, _, _ = deepspeed.initialize(args=args, - model=model, - optimizer=optimizer) - data_loader = random_dataloader(model=model, - total_samples=50, - hidden_dim=hidden_dim, - device=model.device) - for n, batch in enumerate(data_loader): - loss = model(batch[0], batch[1]) - model.backward(loss) - model.step() - - _go(args=args, model=model, hidden_dim=hidden_dim) diff --git a/tests/unit/test_get_optim_files.py b/tests/unit/test_get_optim_files.py deleted file mode 100644 index 68d046bfe99ebcdf5c84ef1e8c4056e2deca3e52..0000000000000000000000000000000000000000 --- a/tests/unit/test_get_optim_files.py +++ /dev/null @@ -1,18 +0,0 @@ -import os -import pytest -import deepspeed -from deepspeed.utils.zero_to_fp32 import get_optim_files - - -@pytest.mark.parametrize('num_checkpoints', [1, 2, 12, 24]) -def test_get_optim_files(tmpdir, num_checkpoints): - saved_files = [] - for i in range(num_checkpoints): - file_name = "zero_" + str(i) + "_optim_states.pt" - path_name = os.path.join(tmpdir, file_name) - saved_files.append(path_name) - with open(path_name, "w") as f: - f.write(file_name) - loaded_files = get_optim_files(tmpdir) - for lf, sf in zip(loaded_files, saved_files): - assert lf == sf diff --git a/tests/unit/test_groups.py b/tests/unit/test_groups.py deleted file mode 100644 index 2769da74436d882fb272850eee1936cc7f1b36f3..0000000000000000000000000000000000000000 --- a/tests/unit/test_groups.py +++ /dev/null @@ -1,57 +0,0 @@ -import unittest - -from deepspeed.utils.groups import _get_expert_parallel_ranks - - -class TestGroups(unittest.TestCase): - def test_get_expert_parallel_ranks(self): - """ - Example - E + M + D parallel - world_size = 16 - model_degree = 2 - expert_degree = 4 # number of experts in same group - mp_group = [0, 1], [2,3], [4,5] ... 
- data_parallel_group =[0,2,4,6,8,10, 12,14], [1,3,5,7,9,11,13,15] - expert_parallel_group = [0,2,4,6], [8,10,12,14] [1,3,5,7], [9,11,13,15] - expert_data_parallel_group = [0,8],[2,10],[4,12],[6,14], [1,9],[3,11],[5,13],[7,15] - """ - expert_parallel_groups, expert_data_parallel_groups = _get_expert_parallel_ranks( - world_size=16, model_parallel_size_=2, expert_parallel_size_=4) - self.assertEqual(expert_parallel_groups, - [[0, - 2, - 4, - 6], - [8, - 10, - 12, - 14], - [1, - 3, - 5, - 7], - [9, - 11, - 13, - 15]]) - self.assertEqual(expert_data_parallel_groups, - [[0, - 8], - [2, - 10], - [4, - 12], - [6, - 14], - [1, - 9], - [3, - 11], - [5, - 13], - [7, - 15]]) - - -if __name__ == '__main__': - unittest.main() diff --git a/tests/unit/test_ignore_unused_parameters.py b/tests/unit/test_ignore_unused_parameters.py deleted file mode 100644 index eb26f46ca2096bf100ef7ac62dcb7177ad1deb30..0000000000000000000000000000000000000000 --- a/tests/unit/test_ignore_unused_parameters.py +++ /dev/null @@ -1,70 +0,0 @@ -import torch -import pytest -import json -import argparse -import os -from .common import distributed_test -from .simple_model import UnusedParametersModel, random_dataloader, args_from_dict -from deepspeed.ops.op_builder import CPUAdamBuilder - -import deepspeed - - -@pytest.mark.parametrize('ignore_unused_parameters', [False, True]) -def test_stage2_ignore_unused_parameters(tmpdir, ignore_unused_parameters): - use_cpu_offload = True - - if use_cpu_offload and not deepspeed.ops.__compatible_ops__[CPUAdamBuilder.NAME]: - pytest.skip("cpu-adam is not compatible") - - config_dict = { - "train_micro_batch_size_per_gpu": 2, - "gradient_accumulation_steps": 2, - "steps_per_print": 1, - "zero_optimization": { - "stage": 2, - "cpu_offload": use_cpu_offload, - "ignore_unused_parameters": ignore_unused_parameters - }, - "optimizer": { - "type": "Adam", - "params": { - "lr": 1e-3 - } - }, - "fp16": { - "enabled": True, - "initial_scale_power": 8 - } - } - - args = args_from_dict(tmpdir, config_dict) - hidden_dim = 4 - - model = UnusedParametersModel(hidden_dim=hidden_dim) - - @distributed_test(world_size=[1]) - def _test_stage2_ignore_unused_parameters(args, model, hidden_dim): - model, _, _, _ = deepspeed.initialize(args=args, - model=model, - model_parameters=model.parameters()) - - data_loader = random_dataloader(model=model, - total_samples=10, - hidden_dim=hidden_dim, - device=model.device) - - def _loop(): - for n, batch in enumerate(data_loader): - loss = model(batch[0], batch[1]) - model.backward(loss) - model.step() - - if ignore_unused_parameters: - _loop() - else: - with pytest.raises(AssertionError) as e: - _loop() - assert e.value.args and 'ignore_unused_parameters' in e.value.args[0] - - _test_stage2_ignore_unused_parameters(args=args, model=model, hidden_dim=hidden_dim) diff --git a/tests/unit/test_lr_schedulers.py b/tests/unit/test_lr_schedulers.py deleted file mode 100644 index 47bcfb1ef3293b56e0d046015f60edf701ea24e3..0000000000000000000000000000000000000000 --- a/tests/unit/test_lr_schedulers.py +++ /dev/null @@ -1,571 +0,0 @@ -import torch -import deepspeed -import argparse -import pytest -import json -import os -from .common import distributed_test -from .simple_model import SimpleModel, SimpleOptimizer, random_dataloader, args_from_dict -from deepspeed.runtime.lr_schedules import LR_RANGE_TEST, LR_RANGE_TEST_MIN_LR, LR_RANGE_TEST_STEP_RATE, LR_RANGE_TEST_STEP_SIZE, LR_RANGE_TEST_STAIRCASE -from deepspeed.runtime.lr_schedules import WARMUP_LR, WARMUP_MIN_LR, WARMUP_MAX_LR, 
WARMUP_NUM_STEPS, WARMUP_TYPE, WARMUP_LOG_RATE, WARMUP_LINEAR_RATE -from deepspeed.runtime.lr_schedules import ONE_CYCLE, CYCLE_MIN_LR, CYCLE_MAX_LR, CYCLE_FIRST_STEP_SIZE, DECAY_LR_RATE, DECAY_STEP_SIZE -from deepspeed.runtime.lr_schedules import CYCLE_MIN_MOM, CYCLE_MAX_MOM, DECAY_MOM_RATE -from deepspeed.runtime.lr_schedules import WARMUP_DECAY_LR, TOTAL_NUM_STEPS - - -def _verify_continuous_decrease(values): - for i in range(len(values) - 1): - assert values[i] > values[i + 1] - - -def _verify_continuous_increase(values): - for i in range(len(values) - 1): - assert values[i] < values[i + 1] - - -def _verify_staircase_increase(values, step_size): - num_values = len(values) - for i in range(0, num_values, step_size): - j = min(i + step_size, num_values) - assert all([values[i] == v for v in values[i:j]]) - - -@pytest.mark.parametrize("scheduler_type,params", - [(WARMUP_LR, - {}), - (WARMUP_DECAY_LR, - { - WARMUP_NUM_STEPS: 10, - TOTAL_NUM_STEPS: 20 - }), - (ONE_CYCLE, - { - CYCLE_MIN_LR: 0, - CYCLE_MAX_LR: 0.1 - }), - (LR_RANGE_TEST, - {})]) -def test_get_lr_before_train(tmpdir, scheduler_type, params): - config_dict = { - "train_batch_size": 2, - "steps_per_print": 1, - "optimizer": { - "type": "Adam", - "params": { - "lr": 0.00015 - }, - }, - "scheduler": { - "type": scheduler_type, - "params": params - }, - "gradient_clipping": 1.0 - } - args = args_from_dict(tmpdir, config_dict) - hidden_dim = 10 - - model = SimpleModel(hidden_dim, empty_grad=False) - - @distributed_test(world_size=[1]) - def _test_get_lr_before_train(args, model, hidden_dim): - model, _, _, lr_scheduler = deepspeed.initialize(args=args, - model=model, - model_parameters=model.parameters()) - data_loader = random_dataloader(model=model, - total_samples=50, - hidden_dim=hidden_dim, - device=model.device, - dtype=torch.float) - for n, batch in enumerate(data_loader): - # get lr before training starts - lr_scheduler.get_lr() - loss = model(batch[0], batch[1]) - model.backward(loss) - model.step() - - _test_get_lr_before_train(args=args, model=model, hidden_dim=hidden_dim) - - -@pytest.mark.parametrize("warmup_num_steps, warmup_type", - [ - (10, - WARMUP_LOG_RATE), - (15, - WARMUP_LOG_RATE), - (19, - WARMUP_LOG_RATE), - (33, - WARMUP_LOG_RATE), - (10, - WARMUP_LINEAR_RATE), - (15, - WARMUP_LINEAR_RATE), - (19, - WARMUP_LINEAR_RATE), - (33, - WARMUP_LINEAR_RATE), - ]) -def test_lr_warmup_schedule(tmpdir, warmup_num_steps, warmup_type): - config_dict = { - "train_batch_size": 2, - "steps_per_print": 1, - "optimizer": { - "type": "Adam", - "params": { - "lr": 0.00015 - }, - }, - "scheduler": { - "type": WARMUP_LR, - "params": { - WARMUP_MIN_LR: 0.1, - WARMUP_MAX_LR: 0.2, - WARMUP_NUM_STEPS: warmup_num_steps, - WARMUP_TYPE: warmup_type, - } - }, - "gradient_clipping": 1.0 - } - - total_num_steps = 2 * warmup_num_steps - - args = args_from_dict(tmpdir, config_dict) - hidden_dim = 10 - - model = SimpleModel(hidden_dim, empty_grad=False) - - @distributed_test(world_size=[1]) - def _test_lr_warmup_schedule(args, model, hidden_dim, schedule_params, num_steps): - model, _, _, lr_scheduler = deepspeed.initialize(args=args, - model=model, - model_parameters=model.parameters()) - - data_loader = random_dataloader(model=model, - total_samples=num_steps * 2, - hidden_dim=hidden_dim, - device=model.device, - dtype=torch.float) - step_lrs = [] - for n, batch in enumerate(data_loader): - loss = model(batch[0], batch[1]) - model.backward(loss) - model.step() - step_lrs.append(lr_scheduler.get_lr()) - - # Verify initial lr - assert 
step_lrs[0] == [schedule_params[WARMUP_MIN_LR]] - - # Verify warmup completion - warmup_num_steps = schedule_params[WARMUP_NUM_STEPS] - warmup_max_lr = [schedule_params[WARMUP_MAX_LR]] - assert step_lrs[warmup_num_steps] == warmup_max_lr - - # Verify post-warmup completion - assert all([warmup_max_lr == lr for lr in step_lrs[warmup_num_steps:]]) - - _test_lr_warmup_schedule(args=args, - model=model, - hidden_dim=hidden_dim, - schedule_params=config_dict["scheduler"]["params"], - num_steps=total_num_steps) - - -@pytest.mark.parametrize("warmup_num_steps, warmup_type", - [ - (10, - WARMUP_LOG_RATE), - (15, - WARMUP_LOG_RATE), - (19, - WARMUP_LOG_RATE), - (33, - WARMUP_LOG_RATE), - (10, - WARMUP_LINEAR_RATE), - (15, - WARMUP_LINEAR_RATE), - (19, - WARMUP_LINEAR_RATE), - (33, - WARMUP_LINEAR_RATE), - ]) -def test_lr_warmup_decay_schedule(tmpdir, warmup_num_steps, warmup_type): - config_dict = { - "train_batch_size": 2, - "steps_per_print": 1, - "optimizer": { - "type": "Adam", - "params": { - "lr": 0.00015 - }, - }, - "scheduler": { - "type": WARMUP_DECAY_LR, - "params": { - WARMUP_MIN_LR: 0.1, - WARMUP_MAX_LR: 0.2, - WARMUP_NUM_STEPS: warmup_num_steps, - TOTAL_NUM_STEPS: warmup_num_steps * 2, - WARMUP_TYPE: warmup_type - } - }, - "gradient_clipping": 1.0 - } - - args = args_from_dict(tmpdir, config_dict) - hidden_dim = 10 - - model = SimpleModel(hidden_dim, empty_grad=False) - - @distributed_test(world_size=[1]) - def _test_lr_warmup_decay_schedule(args, - model, - hidden_dim, - schedule_params, - num_steps): - model, _, _, lr_scheduler = deepspeed.initialize(args=args, - model=model, - model_parameters=model.parameters()) - - data_loader = random_dataloader(model=model, - total_samples=num_steps * 2, - hidden_dim=hidden_dim, - device=model.device, - dtype=torch.float) - step_lrs = [] - for n, batch in enumerate(data_loader): - loss = model(batch[0], batch[1]) - model.backward(loss) - model.step() - step_lrs.append(lr_scheduler.get_lr()) - - # Verify initial lr - assert step_lrs[0] == [schedule_params[WARMUP_MIN_LR]] - - # Verify lr at warmup completion - warmup_num_steps = schedule_params[WARMUP_NUM_STEPS] - warmup_max_lr = [schedule_params[WARMUP_MAX_LR]] - assert step_lrs[warmup_num_steps] == warmup_max_lr - - # Verify decay phase - previous_lr = warmup_max_lr - for lr in step_lrs[warmup_num_steps + 1:]: - assert lr < previous_lr - previous_lr = lr - - schedule_params = config_dict["scheduler"]["params"] - - total_num_steps = schedule_params[TOTAL_NUM_STEPS] - - _test_lr_warmup_decay_schedule(args=args, - model=model, - hidden_dim=hidden_dim, - schedule_params=schedule_params, - num_steps=total_num_steps) - - -@pytest.mark.parametrize("scheduler_type,params", - [(WARMUP_LR, - {}), - (WARMUP_DECAY_LR, - { - WARMUP_NUM_STEPS: 5, - TOTAL_NUM_STEPS: 10 - }), - (ONE_CYCLE, - { - CYCLE_MIN_LR: 0, - CYCLE_MAX_LR: 0.1, - CYCLE_FIRST_STEP_SIZE: 5, - DECAY_STEP_SIZE: 5 - }), - (LR_RANGE_TEST, - { - LR_RANGE_TEST_MIN_LR: 1e-4, - LR_RANGE_TEST_STEP_SIZE: 1 - })]) -def test_scheduler_optimizer_parity(tmpdir, scheduler_type, params): - config_dict = { - "train_batch_size": 2, - "steps_per_print": 1, - "optimizer": { - "type": "Adam", - "params": { - "lr": 0.00015 - }, - }, - "scheduler": { - "type": scheduler_type, - "params": params - }, - "gradient_clipping": 1.0 - } - args = args_from_dict(tmpdir, config_dict) - hidden_dim = 10 - - model = SimpleModel(hidden_dim, empty_grad=False) - - @distributed_test(world_size=[1]) - def _test_scheduler_optimizer_parity(args, model, hidden_dim): - model, _, _, 
lr_scheduler = deepspeed.initialize(args=args, - model=model, - model_parameters=model.parameters()) - data_loader = random_dataloader(model=model, - total_samples=50, - hidden_dim=hidden_dim, - device=model.device, - dtype=torch.float) - for n, batch in enumerate(data_loader): - loss = model(batch[0], batch[1]) - model.backward(loss) - model.step() - assert lr_scheduler.get_lr() == model.get_lr() - - _test_scheduler_optimizer_parity(args=args, model=model, hidden_dim=hidden_dim) - - -@pytest.mark.parametrize("min_lr, step_rate, step_size, staircase", - [(1e-4, 1e-5, 1, True), - (1e-5, 1e-5, 1, False), - (1e-4, 1e-3, 10, True), - (1e-3, 1e-3, 10, False), - (1e-2, 1e-2, 19, True), - (1e-2, 1e-2, 19, False) - ])# yapf: disable -def test_lr_range_test(tmpdir, min_lr, step_rate, step_size, staircase): - config_dict = { - "train_batch_size": 2, - "steps_per_print": 1, - "optimizer": { - "type": "Adam", - "params": { - "lr": 0.00015 - }, - }, - "scheduler": { - "type": LR_RANGE_TEST, - "params": { - LR_RANGE_TEST_MIN_LR: min_lr, - LR_RANGE_TEST_STEP_RATE: step_rate, - LR_RANGE_TEST_STEP_SIZE: step_size, - LR_RANGE_TEST_STAIRCASE: staircase - } - }, - "gradient_clipping": 1.0 - } - args = args_from_dict(tmpdir, config_dict) - hidden_dim = 10 - - model = SimpleModel(hidden_dim, empty_grad=False) - - @distributed_test(world_size=[1]) - def _test_lr_range_test(args, model, hidden_dim, min_lr, step_size, staircase): - model, _, _, lr_scheduler = deepspeed.initialize(args=args, - model=model, - model_parameters=model.parameters()) - data_loader = random_dataloader(model=model, - total_samples=max(50, - step_size * 2), - hidden_dim=hidden_dim, - device=model.device, - dtype=torch.float) - - step_lrs = [] - for _, batch in enumerate(data_loader): - step_lrs.append(lr_scheduler.get_lr()) - loss = model(batch[0], batch[1]) - model.backward(loss) - model.step() - - # Verify starting lr - assert step_lrs[0] == min_lr - - if staircase: - # Verify staircase increasing lr - _verify_staircase_increase(step_lrs, step_size) - else: - # Verify continuous increasing lr - _verify_continuous_increase(step_lrs) - - _test_lr_range_test(args=args, - model=model, - hidden_dim=hidden_dim, - min_lr=[min_lr], - step_size=step_size, - staircase=staircase) - - -@pytest.mark.parametrize("min_lr, max_lr, decay_rate, cycle_step_size, decay_step_size", - [ - (1e-5, 1e-2, 1e-3, 10, 10), - (1e-3, 1e-1, 0, 21, 21), - (1e-5, 1e-2, 1e-3, 10, 10), - (1e-3, 1e-1, 1e-1, 21, 21), - (1e-5, 1e-1, 0, 10, 0), - ]) # yapf: disable -def test_onecycle_lr(tmpdir, - min_lr, - max_lr, - decay_rate, - cycle_step_size, - decay_step_size): - config_dict = { - "train_batch_size": 2, - "steps_per_print": 1, - "optimizer": { - "type": "Adam", - "params": { - "lr": 0.00015 - }, - }, - "scheduler": { - "type": ONE_CYCLE, - "params": { - CYCLE_MIN_LR: min_lr, - CYCLE_MAX_LR: max_lr, - DECAY_LR_RATE: decay_rate, - CYCLE_FIRST_STEP_SIZE: cycle_step_size, - DECAY_STEP_SIZE: decay_step_size - } - }, - "gradient_clipping": 1.0 - } - args = args_from_dict(tmpdir, config_dict) - hidden_dim = 10 - - model = SimpleModel(hidden_dim, empty_grad=False) - - @distributed_test(world_size=[1]) - def _test_onecycle_lr(args, - model, - hidden_dim, - min_lr, - max_lr, - step_size, - decay_rate): - model, _, _, lr_scheduler = deepspeed.initialize(args=args, - model=model, - model_parameters=model.parameters()) - data_loader = random_dataloader(model=model, - total_samples=max(50, - step_size * 3), - hidden_dim=hidden_dim, - device=model.device, - dtype=torch.float) - - 
step_lrs = [] - for _, batch in enumerate(data_loader): - step_lrs.append(lr_scheduler.get_lr()) - loss = model(batch[0], batch[1]) - model.backward(loss) - model.step() - - # Verify starting lr - assert step_lrs[0] == min_lr - - # Verify peak lr - assert step_lrs[step_size] == max_lr - - # Verify increasing phase - _verify_continuous_increase(step_lrs[:step_size]) - - # Verify decreasing phase - _verify_continuous_decrease(step_lrs[step_size:(step_size * 2)]) - - # Verify decay phase - if decay_rate > 0: - _verify_continuous_decrease(step_lrs[(step_size * 2):]) - - _test_onecycle_lr(args=args, - model=model, - hidden_dim=hidden_dim, - min_lr=[min_lr], - max_lr=[max_lr], - step_size=cycle_step_size, - decay_rate=decay_rate) - - -@pytest.mark.parametrize("min_mom, max_mom, decay_rate, step_size", - [ - (0.08, 0.09, 1e-3, 10), - (0.08, 0.09, 0, 21), - (0.08, 0.09, 1e-3, 10), - (0.08, 0.09, 0, 21), - ]) # yapf: disable -def test_onecycle_mom(tmpdir, min_mom, max_mom, decay_rate, step_size): - config_dict = { - "train_batch_size": 2, - "steps_per_print": 1, - "optimizer": { - "type": "Adam", - "params": { - "lr": 0.00015 - }, - }, - "scheduler": { - "type": ONE_CYCLE, - "params": { - CYCLE_MIN_LR: 1e-3, - CYCLE_MAX_LR: 1e-2, - CYCLE_MIN_MOM: min_mom, - CYCLE_MAX_MOM: max_mom, - DECAY_MOM_RATE: decay_rate, - CYCLE_FIRST_STEP_SIZE: step_size, - DECAY_STEP_SIZE: step_size - } - }, - "gradient_clipping": 1.0 - } - args = args_from_dict(tmpdir, config_dict) - hidden_dim = 10 - - model = SimpleModel(hidden_dim, empty_grad=False) - - @distributed_test(world_size=[1]) - def _test_onecycle_mom(args, - model, - hidden_dim, - min_mom, - max_mom, - step_size, - decay_rate): - model, _, _, lr_scheduler = deepspeed.initialize(args=args, - model=model, - model_parameters=model.parameters()) - data_loader = random_dataloader(model=model, - total_samples=max(50, - step_size * 3), - hidden_dim=hidden_dim, - device=model.device, - dtype=torch.float) - - step_moms = [] - for _, batch in enumerate(data_loader): - step_moms.append(lr_scheduler.get_mom()) - loss = model(batch[0], batch[1]) - model.backward(loss) - model.step() - - # Verify starting lr - assert step_moms[0][0][0] == max_mom - - # Verify peak lr - assert step_moms[step_size][0][0] == min_mom - - # Verify decreasing phase - _verify_continuous_decrease(step_moms[:step_size]) - - # Verify increasing phase - _verify_continuous_increase(step_moms[step_size:(step_size * 2)]) - - # Verify decay phase - if decay_rate > 0: - _verify_continuous_increase(step_moms[(step_size * 2):]) - - _test_onecycle_mom(args=args, - model=model, - hidden_dim=hidden_dim, - min_mom=min_mom, - max_mom=max_mom, - step_size=step_size, - decay_rate=decay_rate) diff --git a/tests/unit/test_moe.py b/tests/unit/test_moe.py deleted file mode 100644 index e10356902a686178148f6dc245d4da7ea7ad71d2..0000000000000000000000000000000000000000 --- a/tests/unit/test_moe.py +++ /dev/null @@ -1,115 +0,0 @@ -import math -from deepspeed.utils import groups -import torch -import torch.distributed as dist -import deepspeed -import argparse -import pytest -import json -import os -from deepspeed.ops.adam import FusedAdam -from .common import distributed_test -from deepspeed.ops.op_builder import CPUAdamBuilder -from .simple_model import SimpleModel, SimplePRMoEModel, SimpleOptimizer, random_dataloader, args_from_dict, create_deepspeed_args, SimpleMoEModel, sequence_dataloader -from .util import required_torch_version - -try: - from apex import amp - _amp_available = True -except ImportError: - 
_amp_available = False -amp_available = pytest.mark.skip(_amp_available, reason="apex/amp is not installed") - - -@pytest.mark.parametrize("ep_size, use_residual", - [(2, - True), - (2, - False), - (4, - True), - (4, - False)]) -def test_moe(tmpdir, ep_size, use_residual): - if not required_torch_version(): - pytest.skip("DeepSpeed MoE tests need torch 1.8 or higher to run correctly") - - config_dict = { - "train_batch_size": 8, - "steps_per_print": 1, - "fp16": { - "enabled": True - } - } - args = args_from_dict(tmpdir, config_dict) - hidden_dim = 16 - - @distributed_test(world_size=[4]) - def _test_moe(args, hidden_dim, ep_size, use_residual): - # E+D -- ep_size = 2 - # E only -- ep_size = 4 - model = SimpleMoEModel(hidden_dim, ep_size=ep_size, use_residual=use_residual) - optimizer = torch.optim.AdamW(params=model.parameters()) - model, _, _, _ = deepspeed.initialize(args=args, - model=model, - optimizer=optimizer, - dist_init_required=False) - #dist_init_required=False -- parameterize to True/False? - - data_loader = sequence_dataloader(model=model, - total_samples=50, - hidden_dim=hidden_dim, - device=model.device) - - for n, batch in enumerate(data_loader): - loss = model(batch[0], batch[1]) - model.backward(loss) - model.step() - - _test_moe(args=args, - hidden_dim=hidden_dim, - ep_size=ep_size, - use_residual=use_residual) - - -@pytest.mark.parametrize("ep_size, use_residual", [(2, True), (2, False)]) -def test_pr_moe(tmpdir, ep_size, use_residual): - if not required_torch_version(): - pytest.skip("DeepSpeed MoE tests need torch 1.8 or higher to run correctly") - - config_dict = { - "train_batch_size": 8, - "steps_per_print": 1, - "fp16": { - "enabled": True - } - } - args = args_from_dict(tmpdir, config_dict) - hidden_dim = 16 - - @distributed_test(world_size=[4]) - def _test_moe(args, hidden_dim, ep_size, use_residual): - # E+D -- ep_size = 2 - # E only -- ep_size = 4 - - model = SimplePRMoEModel(hidden_dim, ep_size=ep_size, use_residual=use_residual) - optimizer = torch.optim.AdamW(params=model.parameters()) - model, _, _, _ = deepspeed.initialize(args=args, - model=model, - optimizer=optimizer, - dist_init_required=False) - - data_loader = sequence_dataloader(model=model, - total_samples=50, - hidden_dim=hidden_dim, - device=model.device) - - for n, batch in enumerate(data_loader): - loss = model(batch[0], batch[1]) - model.backward(loss) - model.step() - - _test_moe(args=args, - hidden_dim=hidden_dim, - ep_size=ep_size, - use_residual=use_residual) diff --git a/tests/unit/test_multi_output_model.py b/tests/unit/test_multi_output_model.py deleted file mode 100644 index 478bdc8d383d3cf44daf483c6124667f4f934268..0000000000000000000000000000000000000000 --- a/tests/unit/test_multi_output_model.py +++ /dev/null @@ -1,139 +0,0 @@ -import torch -import deepspeed -import argparse -import pytest -from pytest import approx -import json -import os -from .common import distributed_test -from .simple_model import args_from_dict -from .multi_output_model import MultiOutputModel, multi_output_dataloader - - -def create_config_dict(micro_batch_size, grad_accumulation_steps, world_size): - return { - "train_micro_batch_size_per_gpu": micro_batch_size, - "gradient_accumulation_steps": grad_accumulation_steps, - "train_batch_size": micro_batch_size * grad_accumulation_steps * world_size, - "steps_per_print": 1, - "optimizer": { - "type": "Adam", - "params": { - "lr": 0.00015 - } - }, - "fp16": { - "enabled": True - } - } - - -def test_two_output_model(tmpdir): - gradient_accumulation_steps = 2 - 
micro_batch_size = 1 - world_size = 1 - config_dict = create_config_dict(micro_batch_size, - gradient_accumulation_steps, - world_size) - - hidden_dim = 10 - weight_value = 0.1 - args = args_from_dict(tmpdir, config_dict) - - model = MultiOutputModel(hidden_dim, weight_value) - - @distributed_test(world_size=[1]) - def _test_two_output_model(args, model, hidden_dim): - model, _, _, _ = deepspeed.initialize(args=args, - model=model, - model_parameters=model.parameters()) - total_samples = 4 - data_loader = multi_output_dataloader(model=model, - total_samples=total_samples, - hidden_dim=hidden_dim, - device=model.device, - inputs=[1.0, - 2.0], - targets=[1, - 2]) - for n, batch in enumerate(data_loader): - assert len(batch) % 2 == 0, \ - f"multi_output_dataloader failed to return even number of data samples (input+target)" - - midpoint = len(batch) // 2 - inputs, targets = batch[:midpoint], batch[midpoint:] - loss_tuple = model(inputs, targets) - - expected_loss = torch.tensor(2.302734375, - dtype=torch.half, - device=model.device) - for loss in loss_tuple: - assert loss.shape == torch.Size([]) - assert loss.item() == approx(expected_loss.item()) - - summed_loss = sum(loss_tuple) - scaled_loss = model.backward(summed_loss) - expected_scaled_loss = summed_loss.float() / gradient_accumulation_steps - assert scaled_loss.item() == approx(expected_scaled_loss.item()) - - model.step() - - _test_two_output_model(args=args, model=model, hidden_dim=hidden_dim) - - -def test_three_output_model(tmpdir): - gradient_accumulation_steps = 3 - micro_batch_size = 1 - world_size = 1 - config_dict = create_config_dict(micro_batch_size, - gradient_accumulation_steps, - world_size) - - hidden_dim = 10 - weight_value = 0.1 - args = args_from_dict(tmpdir, config_dict) - - model = MultiOutputModel(hidden_dim, weight_value) - - @distributed_test(world_size=[1]) - def _test_three_output_model(args, model, hidden_dim): - model, _, _, _ = deepspeed.initialize(args=args, - model=model, - model_parameters=model.parameters()) - - total_samples = gradient_accumulation_steps * micro_batch_size * 2 - data_loader = multi_output_dataloader(model=model, - total_samples=total_samples, - hidden_dim=hidden_dim, - device=model.device, - inputs=[1.0, - 2.0, - 3.0], - targets=[1, - 2, - 3]) - for n, batch in enumerate(data_loader): - assert len(batch) % 2 == 0, \ - f"multi_output_dataloader failed to return even number of data samples (input+target)" - - midpoint = len(batch) // 2 - inputs, targets = batch[:midpoint], batch[midpoint:] - loss_tuple = model(inputs, targets) - assert len(loss_tuple) == 3 - - expected_loss = torch.tensor(2.302734375, - dtype=torch.half, - device=model.device) - - for loss in loss_tuple: - assert loss.shape == torch.Size([]) - assert loss.item() == approx(expected_loss.item()) - - summed_loss = sum(loss_tuple) - scaled_loss = model.backward(summed_loss) - expected_scaled_loss = summed_loss.float() / gradient_accumulation_steps - assert scaled_loss.item() == approx(expected_scaled_loss.item()) - - model.step() - - _test_three_output_model(args=args, model=model, hidden_dim=hidden_dim) diff --git a/tests/unit/test_onebit.py b/tests/unit/test_onebit.py deleted file mode 100644 index bfcbdceb0ba76c07168f18f0ca3c5fb9c2b7b3da..0000000000000000000000000000000000000000 --- a/tests/unit/test_onebit.py +++ /dev/null @@ -1,1335 +0,0 @@ -import torch -import torch.nn as nn -import torch.nn.functional as F -import torch.distributed as dist -import deepspeed -import argparse -import pytest -import copy -import json 
-import os -import numpy as np -import time - -from deepspeed.runtime.pipe.topology import PipeDataParallelTopology, PipeModelDataParallelTopology -from deepspeed.ops.op_builder import OpBuilder - -PipeTopo = PipeDataParallelTopology -from deepspeed.runtime.pipe.module import PipelineModule, LayerSpec -from .common import distributed_test -from .simple_model import SimpleModel, SimpleOptimizer, random_dataloader, args_from_dict, create_deepspeed_args -from .test_pipe import AlexNetPipe, train_cifar - -TORCH_MAJOR = int(torch.__version__.split('.')[0]) -TORCH_MINOR = int(torch.__version__.split('.')[1]) -if TORCH_MAJOR < 1 or TORCH_MINOR < 8: - pytest.skip("NCCL-based 1-bit compression requires torch 1.8 or higher", - allow_module_level=True) - -rocm_version = OpBuilder.installed_rocm_version() -if rocm_version[0] > 4: - pytest.skip( - "NCCL-based 1-bit compression is not yet supported w. ROCm 5 until cupy supports ROCm 5", - allow_module_level=True) - - -def test_onebitadam_fp16_basic(tmpdir): - config_dict = { - "train_batch_size": 2, - "steps_per_print": 1, - "optimizer": { - "type": "OneBitAdam", - "params": { - "lr": 0.00015, - "weight_decay": 0.01, - "freeze_step": 2, - "cuda_aware": False, - "comm_backend_name": "nccl" - } - }, - "gradient_clipping": 1.0, - "fp16": { - "enabled": True, - "loss_scale": 0, - "initial_scale_power": 16 - } - } - args = args_from_dict(tmpdir, config_dict) - hidden_dim = 10 - - model = SimpleModel(hidden_dim) - - @distributed_test(world_size=[1, 2]) - def _test_onebitadam_fp16_basic(args, model, hidden_dim): - model, _, _, _ = deepspeed.initialize(args=args, - model=model, - model_parameters=model.parameters()) - data_loader = random_dataloader(model=model, - total_samples=50, - hidden_dim=hidden_dim, - device=model.device) - for n, batch in enumerate(data_loader): - loss = model(batch[0], batch[1]) - model.backward(loss) - model.step() - - _test_onebitadam_fp16_basic(args=args, model=model, hidden_dim=hidden_dim) - - -def test_onebitadam_fp32_basic(tmpdir): - config_dict = { - "train_batch_size": 2, - "steps_per_print": 1, - "optimizer": { - "type": "OneBitAdam", - "params": { - "lr": 0.00015, - "weight_decay": 0.01, - "freeze_step": 2, - "cuda_aware": False, - "comm_backend_name": "nccl" - } - }, - "gradient_clipping": 1.0, - } - args = args_from_dict(tmpdir, config_dict) - hidden_dim = 10 - - model = SimpleModel(hidden_dim) - - @distributed_test(world_size=[1, 2]) - def _test_onebitadam_fp32_basic(args, model, hidden_dim): - model, _, _, _ = deepspeed.initialize(args=args, - model=model, - model_parameters=model.parameters()) - data_loader = random_dataloader(model=model, - total_samples=50, - hidden_dim=hidden_dim, - device=model.device, - dtype=torch.float) - for n, batch in enumerate(data_loader): - loss = model(batch[0], batch[1]) - model.backward(loss) - model.step() - - _test_onebitadam_fp32_basic(args=args, model=model, hidden_dim=hidden_dim) - - -def test_onebitadam_exp_avg_mask(tmpdir): - config_dict = { - "train_batch_size": 2, - "steps_per_print": 1, - "optimizer": { - "type": "OneBitAdam", - "params": { - "lr": 0.00015, - "weight_decay": 0.01, - "freeze_step": 2, - "cuda_aware": False, - "comm_backend_name": "nccl" - } - }, - "gradient_clipping": 1.0, - "fp16": { - "enabled": True, - "loss_scale": 0, - "initial_scale_power": 16 - } - } - args = args_from_dict(tmpdir, config_dict) - hidden_dim = 10 - - model = SimpleModel(hidden_dim) - param_optimizer = list(model.named_parameters()) - mask1 = torch.zeros_like(param_optimizer[0][1].data) - 
for col in range(mask1.size()[1]): - mask1[0][col] += 1 - mask1 = torch.flatten(mask1) - optimizer_grouped_parameters = [{ - 'params': [param_optimizer[0][1]], - 'weight_decay': 0.01, - 'exp_avg_mask': mask1 - }, - { - 'params': [param_optimizer[1][1]], - 'weight_decay': 0.01 - }] - - @distributed_test(world_size=[2]) - def _test_onebitadam_exp_avg_mask(args, model, hidden_dim): - model, optimizer, _, _ = deepspeed.initialize(args=args, - model=model, - model_parameters=optimizer_grouped_parameters) - data_loader = random_dataloader(model=model, - total_samples=50, - hidden_dim=hidden_dim, - device=model.device) - for n, batch in enumerate(data_loader): - loss = model(batch[0], batch[1]) - model.backward(loss) - model.step() - # Test whether the momentum mask works - for v in optimizer.state.values(): - if v['exp_avg'].size() == mask1.size(): - assert torch.allclose(v['exp_avg'], v['exp_avg'].mul_(mask1.to(device=v['exp_avg'].device)), atol=1e-07), f"Momentum mask is not working properly" - - _test_onebitadam_exp_avg_mask(args=args, model=model, hidden_dim=hidden_dim) - - -def test_onebitadam_checkpointing(tmpdir): - config_dict = { - "train_batch_size": 2, - "steps_per_print": 1, - "optimizer": { - "type": "OneBitAdam", - "params": { - "lr": 0.00015, - "weight_decay": 0.01, - "freeze_step": 2, - "cuda_aware": False, - "comm_backend_name": "nccl" - } - }, - "gradient_clipping": 1.0, - "fp16": { - "enabled": True, - "loss_scale": 0, - "initial_scale_power": 16 - } - } - args = args_from_dict(tmpdir, config_dict) - hidden_dim = 10 - - model = SimpleModel(hidden_dim) - param_optimizer = list(model.named_parameters()) - mask1 = torch.zeros_like(param_optimizer[0][1].data) - mask2 = torch.zeros_like(param_optimizer[0][1].data) - for col in range(mask1.size()[1]): - mask1[0][col] += 1 - mask2[1][col] += 1 - mask1 = torch.flatten(mask1) - mask2 = torch.flatten(mask2) - - optimizer_grouped_parameters_1 = [{ - 'params': [param_optimizer[0][1]], - 'weight_decay': 0.01, - 'exp_avg_mask': mask1 - }, - { - 'params': [param_optimizer[1][1]], - 'weight_decay': 0.01 - }] - - optimizer_grouped_parameters_2 = [{ - 'params': [param_optimizer[0][1]], - 'weight_decay': 0.01, - 'exp_avg_mask': mask2 - }, - { - 'params': [param_optimizer[1][1]], - 'weight_decay': 0.01 - }] - - optimizer_grouped_parameters_3 = [{ - 'params': [param_optimizer[0][1]], - 'weight_decay': 0.01 - }, - { - 'params': [param_optimizer[1][1]], - 'weight_decay': 0.01 - }] - - @distributed_test(world_size=[2]) - def _test_onebitadam_checkpointing(mask1, mask2, args, model, hidden_dim): - model_1, optimizer_1, _, _ = deepspeed.initialize(args=args, - model=model, - model_parameters=optimizer_grouped_parameters_1) - data_loader = random_dataloader(model=model_1, - total_samples=10, - hidden_dim=hidden_dim, - device=model_1.device) - for n, batch in enumerate(data_loader): - loss = model_1(batch[0], batch[1]) - model_1.backward(loss) - model_1.step() - # Test whether momentum mask still exist after saving checkpoint - assert optimizer_1.optimizer.adam_freeze_key is True - mask1 = mask1.to(device=optimizer_1.param_groups[0]['exp_avg_mask'].device) - assert torch.allclose(optimizer_1.param_groups[0]['exp_avg_mask'], mask1, atol=1e-07), f"Incorrect momentum mask" - save_folder = os.path.join(tmpdir, 'saved_checkpoint') - model_1.save_checkpoint(save_folder, tag=None) - assert torch.allclose(optimizer_1.param_groups[0]['exp_avg_mask'], mask1, atol=1e-07), f"Momentum mask should not change after saving checkpoint" - - - model_2, optimizer_2, _, _ = 
deepspeed.initialize(args=args, - model=model, - model_parameters=optimizer_grouped_parameters_2) - # Test whether momentum mask stays the same after loading checkpoint - mask2 = mask2.to(device=optimizer_2.param_groups[0]['exp_avg_mask'].device) - assert torch.allclose(optimizer_2.param_groups[0]['exp_avg_mask'], mask2, atol=1e-07), f"Incorrect momentum mask" - model_2.load_checkpoint(save_folder, - tag=None, - load_optimizer_states=True, - load_lr_scheduler_states=True) - assert torch.allclose(optimizer_2.param_groups[0]['exp_avg_mask'], mask2, atol=1e-07), f"Momentum mask should not change after loading checkpoint" - # Test whether worker&server error is reset - for v in optimizer_2.state.values(): - assert 'worker_error' not in v, f"Incorrect worker error" - assert 'server_error' not in v, f"Incorrect server error" - assert optimizer_2.optimizer.adam_freeze_key is True - - model_3, optimizer_3, _, _ = deepspeed.initialize(args=args, - model=model, - model_parameters=optimizer_grouped_parameters_3) - optimizer_3.optimizer.freeze_step = 20 - data_loader = random_dataloader(model=model_3, - total_samples=50, - hidden_dim=hidden_dim, - device=model_3.device) - for n, batch in enumerate(data_loader): - loss = model_3(batch[0], batch[1]) - model_3.backward(loss) - model_3.step() - assert optimizer_3.optimizer.adam_freeze_key is True - # Test whether momentum mask stays the same after loading checkpoint - assert 'exp_avg_mask' not in optimizer_3.param_groups[0], f"Incorrect momentum mask" - model_3.load_checkpoint(save_folder, - tag=None, - load_optimizer_states=True, - load_lr_scheduler_states=True) - assert 'exp_avg_mask' not in optimizer_3.param_groups[0], f"Momentum mask should not change after loading checkpoint" - # Test whether worker&server error is reset - for v in optimizer_3.state.values(): - assert 'worker_error' not in v, f"Incorrect worker error" - assert 'server_error' not in v, f"Incorrect server error" - assert optimizer_3.optimizer.adam_freeze_key is False - - _test_onebitadam_checkpointing(mask1, - mask2, - args=args, - model=model, - hidden_dim=hidden_dim) - - -def test_onebitadam_checkpointing_overflow(tmpdir): - config_dict = { - "train_batch_size": 2, - "steps_per_print": 1, - "optimizer": { - "type": "OneBitAdam", - "params": { - "lr": 0.00015, - "weight_decay": 0.01, - "freeze_step": 2, - "cuda_aware": False, - "comm_backend_name": "nccl" - } - }, - "gradient_clipping": 1.0, - "fp16": { - "enabled": True, - "loss_scale": 0, - "initial_scale_power": 16 - } - } - args = args_from_dict(tmpdir, config_dict) - hidden_dim = 10 - - model = SimpleModel(hidden_dim) - - @distributed_test(world_size=[2]) - def _test_onebitadam_checkpointing_overflow(args, model, hidden_dim): - model, _, _, _ = deepspeed.initialize(args=args, - model=model, - model_parameters=model.parameters()) - data_loader = random_dataloader(model=model, - total_samples=100, - hidden_dim=hidden_dim, - device=model.device) - save_folder = os.path.join(tmpdir, 'saved_checkpoint') - for n, batch in enumerate(data_loader): - loss = model(batch[0], batch[1]) - if dist.get_rank() == 0 and n >= 10: - loss = loss * 1000000.0 - model.backward(loss) - dist.barrier() - model.step() - dist.barrier() - model.save_checkpoint(save_folder, tag=None) - - _test_onebitadam_checkpointing_overflow(args=args, - model=model, - hidden_dim=hidden_dim) - - -@pytest.mark.parametrize('topo', - [ - PipeTopo(num_pp=1, - num_dp=4), - PipeTopo(num_pp=2, - num_dp=2), - PipeTopo(num_pp=4, - num_dp=1), - ]) -def 
test_onebitadam_fp16_pipeline(topo, tmpdir): - config_dict = { - "train_batch_size": 16, - "train_micro_batch_size_per_gpu": 4, - "steps_per_print": 20, - "optimizer": { - "type": "OneBitAdam", - "params": { - "lr": 0.00001, - "betas": [0.9, - 0.999], - "eps": 1e-8, - "weight_decay": 3e-7, - "freeze_step": 200, - "cuda_aware": False, - "comm_backend_name": "nccl" - } - }, - "gradient_clipping": 1.0, - "zero_optimization": { - "stage": 0 - }, - "fp16": { - "enabled": True, - "loss_scale": 0, - "initial_scale_power": 16 - }, - "pipeline": { - "seed_layers": True, - "activation_checkpoint_interval": 1 - } - } - args = args_from_dict(tmpdir, config_dict) - - # Allocate model for consistent initial weights. - init_net = AlexNetPipe() - - @distributed_test(world_size=4) - def _helper(topo, tmpdir, steps=500): - assert steps >= 100 - - test_net = copy.deepcopy(init_net) - test_model = PipelineModule(layers=test_net.to_layers(), - topology=topo, - loss_fn=nn.CrossEntropyLoss()) - - test_losses = train_cifar(test_model, - args, - num_steps=steps, - fp16=config_dict['fp16']['enabled']) - - _helper(topo, tmpdir) - - -def test_zerooneadam_fp16_basic(tmpdir): - config_dict = { - "train_batch_size": 2, - "steps_per_print": 1, - "optimizer": { - "type": "ZeroOneAdam", - "params": { - "lr": 0.00015, - "weight_decay": 0.01, - "var_freeze_step": 4, - "var_update_scaler": 1, - "local_step_scaler": 1, - "local_step_clipper": 2, - "cuda_aware": False, - "comm_backend_name": "nccl" - } - }, - "gradient_clipping": 1.0, - "fp16": { - "enabled": True, - "loss_scale": 0, - "initial_scale_power": 16 - } - } - args = args_from_dict(tmpdir, config_dict) - hidden_dim = 10 - - model = SimpleModel(hidden_dim) - - @distributed_test(world_size=[1, 2]) - def _test_zerooneadam_fp16_basic(args, model, hidden_dim): - model, _, _, _ = deepspeed.initialize(args=args, - model=model, - model_parameters=model.parameters()) - data_loader = random_dataloader(model=model, - total_samples=50, - hidden_dim=hidden_dim, - device=model.device) - for n, batch in enumerate(data_loader): - loss = model(batch[0], batch[1]) - model.backward(loss) - model.step() - - _test_zerooneadam_fp16_basic(args=args, model=model, hidden_dim=hidden_dim) - - -def test_zerooneadam_fp32_basic(tmpdir): - config_dict = { - "train_batch_size": 2, - "steps_per_print": 1, - "optimizer": { - "type": "ZeroOneAdam", - "params": { - "lr": 0.00015, - "weight_decay": 0.01, - "var_freeze_step": 4, - "var_update_scaler": 1, - "local_step_scaler": 1, - "local_step_clipper": 2, - "cuda_aware": False, - "comm_backend_name": "nccl" - } - }, - "gradient_clipping": 1.0, - } - args = args_from_dict(tmpdir, config_dict) - hidden_dim = 10 - - model = SimpleModel(hidden_dim) - - @distributed_test(world_size=[1, 2]) - def _test_zerooneadam_fp32_basic(args, model, hidden_dim): - model, _, _, _ = deepspeed.initialize(args=args, - model=model, - model_parameters=model.parameters()) - data_loader = random_dataloader(model=model, - total_samples=50, - hidden_dim=hidden_dim, - device=model.device, - dtype=torch.float) - for n, batch in enumerate(data_loader): - loss = model(batch[0], batch[1]) - model.backward(loss) - model.step() - - _test_zerooneadam_fp32_basic(args=args, model=model, hidden_dim=hidden_dim) - - -def test_zerooneadam_exp_avg_mask(tmpdir): - config_dict = { - "train_batch_size": 2, - "steps_per_print": 1, - "optimizer": { - "type": "ZeroOneAdam", - "params": { - "lr": 0.00015, - "weight_decay": 0.01, - "var_freeze_step": 4, - "var_update_scaler": 1, - "local_step_scaler": 
1, - "local_step_clipper": 2, - "cuda_aware": False, - "comm_backend_name": "nccl" - } - }, - "gradient_clipping": 1.0, - "fp16": { - "enabled": True, - "loss_scale": 0, - "initial_scale_power": 16 - } - } - args = args_from_dict(tmpdir, config_dict) - hidden_dim = 10 - - model = SimpleModel(hidden_dim) - param_optimizer = list(model.named_parameters()) - mask1 = torch.zeros_like(param_optimizer[0][1].data) - for col in range(mask1.size()[1]): - mask1[0][col] += 1 - mask1 = torch.flatten(mask1) - optimizer_grouped_parameters = [{ - 'params': [param_optimizer[0][1]], - 'weight_decay': 0.01, - 'exp_avg_mask': mask1 - }, - { - 'params': [param_optimizer[1][1]], - 'weight_decay': 0.01 - }] - - @distributed_test(world_size=[2]) - def _test_zerooneadam_exp_avg_mask(args, model, hidden_dim): - model, optimizer, _, _ = deepspeed.initialize(args=args, - model=model, - model_parameters=optimizer_grouped_parameters) - data_loader = random_dataloader(model=model, - total_samples=50, - hidden_dim=hidden_dim, - device=model.device) - for n, batch in enumerate(data_loader): - loss = model(batch[0], batch[1]) - model.backward(loss) - model.step() - # Test whether the momentum mask works - for v in optimizer.state.values(): - if v['exp_avg'].size() == mask1.size(): - assert torch.allclose(v['exp_avg'], v['exp_avg'].mul_(mask1.to(device=v['exp_avg'].device)), atol=1e-07), f"Momentum mask is not working properly" - - _test_zerooneadam_exp_avg_mask(args=args, model=model, hidden_dim=hidden_dim) - - -def test_zerooneadam_checkpointing(tmpdir): - config_dict = { - "train_batch_size": 2, - "steps_per_print": 1, - "optimizer": { - "type": "ZeroOneAdam", - "params": { - "lr": 0.00015, - "weight_decay": 0.01, - "var_freeze_step": 4, - "var_update_scaler": 1, - "local_step_scaler": 1, - "local_step_clipper": 2, - "cuda_aware": False, - "comm_backend_name": "nccl" - } - }, - "gradient_clipping": 1.0, - "fp16": { - "enabled": True, - "loss_scale": 0, - "initial_scale_power": 16 - } - } - args = args_from_dict(tmpdir, config_dict) - hidden_dim = 10 - - model = SimpleModel(hidden_dim) - param_optimizer = list(model.named_parameters()) - mask1 = torch.zeros_like(param_optimizer[0][1].data) - mask2 = torch.zeros_like(param_optimizer[0][1].data) - for col in range(mask1.size()[1]): - mask1[0][col] += 1 - mask2[1][col] += 1 - mask1 = torch.flatten(mask1) - mask2 = torch.flatten(mask2) - - optimizer_grouped_parameters_1 = [{ - 'params': [param_optimizer[0][1]], - 'weight_decay': 0.01, - 'exp_avg_mask': mask1 - }, - { - 'params': [param_optimizer[1][1]], - 'weight_decay': 0.01 - }] - - optimizer_grouped_parameters_2 = [{ - 'params': [param_optimizer[0][1]], - 'weight_decay': 0.01, - 'exp_avg_mask': mask2 - }, - { - 'params': [param_optimizer[1][1]], - 'weight_decay': 0.01 - }] - - optimizer_grouped_parameters_3 = [{ - 'params': [param_optimizer[0][1]], - 'weight_decay': 0.01 - }, - { - 'params': [param_optimizer[1][1]], - 'weight_decay': 0.01 - }] - - @distributed_test(world_size=[2]) - def _test_zerooneadam_checkpointing(mask1, mask2, args, model, hidden_dim): - model_1, optimizer_1, _, _ = deepspeed.initialize(args=args, - model=model, - model_parameters=optimizer_grouped_parameters_1) - data_loader = random_dataloader(model=model_1, - total_samples=10, - hidden_dim=hidden_dim, - device=model_1.device) - for n, batch in enumerate(data_loader): - loss = model_1(batch[0], batch[1]) - model_1.backward(loss) - model_1.step() - # Test whether momentum mask still exist after saving checkpoint - mask1 = 
mask1.to(device=optimizer_1.param_groups[0]['exp_avg_mask'].device) - assert torch.allclose(optimizer_1.param_groups[0]['exp_avg_mask'], mask1, atol=1e-07), f"Incorrect momentum mask" - save_folder = os.path.join(tmpdir, 'saved_checkpoint') - model_1.save_checkpoint(save_folder, tag=None) - assert torch.allclose(optimizer_1.param_groups[0]['exp_avg_mask'], mask1, atol=1e-07), f"Momentum mask should not change after saving checkpoint" - - - model_2, optimizer_2, _, _ = deepspeed.initialize(args=args, - model=model, - model_parameters=optimizer_grouped_parameters_2) - # Test whether momentum mask stays the same after loading checkpoint - mask2 = mask2.to(device=optimizer_2.param_groups[0]['exp_avg_mask'].device) - assert torch.allclose(optimizer_2.param_groups[0]['exp_avg_mask'], mask2, atol=1e-07), f"Incorrect momentum mask" - model_2.load_checkpoint(save_folder, - tag=None, - load_optimizer_states=True, - load_lr_scheduler_states=True) - assert torch.allclose(optimizer_2.param_groups[0]['exp_avg_mask'], mask2, atol=1e-07), f"Momentum mask should not change after loading checkpoint" - # Test whether worker&server error is reset - for v in optimizer_2.state.values(): - assert 'worker_error' not in v, f"Incorrect worker error" - assert 'server_error' not in v, f"Incorrect server error" - - model_3, optimizer_3, _, _ = deepspeed.initialize(args=args, - model=model, - model_parameters=optimizer_grouped_parameters_3) - optimizer_3.optimizer.freeze_step = 20 - data_loader = random_dataloader(model=model_3, - total_samples=50, - hidden_dim=hidden_dim, - device=model_3.device) - for n, batch in enumerate(data_loader): - loss = model_3(batch[0], batch[1]) - model_3.backward(loss) - model_3.step() - # Test whether momentum mask stays the same after loading checkpoint - assert 'exp_avg_mask' not in optimizer_3.param_groups[0], f"Incorrect momentum mask" - model_3.load_checkpoint(save_folder, - tag=None, - load_optimizer_states=True, - load_lr_scheduler_states=True) - assert 'exp_avg_mask' not in optimizer_3.param_groups[0], f"Momentum mask should not change after loading checkpoint" - # Test whether worker&server error is reset - for v in optimizer_3.state.values(): - assert 'worker_error' not in v, f"Incorrect worker error" - assert 'server_error' not in v, f"Incorrect server error" - - _test_zerooneadam_checkpointing(mask1, - mask2, - args=args, - model=model, - hidden_dim=hidden_dim) - - -def test_zerooneadam_checkpointing_overflow(tmpdir): - config_dict = { - "train_batch_size": 2, - "steps_per_print": 1, - "optimizer": { - "type": "ZeroOneAdam", - "params": { - "lr": 0.00015, - "weight_decay": 0.01, - "var_freeze_step": 4, - "var_update_scaler": 1, - "local_step_scaler": 1, - "local_step_clipper": 2, - "cuda_aware": False, - "comm_backend_name": "nccl" - } - }, - "gradient_clipping": 1.0, - "fp16": { - "enabled": True, - "loss_scale": 0, - "initial_scale_power": 16 - } - } - args = args_from_dict(tmpdir, config_dict) - hidden_dim = 10 - - model = SimpleModel(hidden_dim) - - @distributed_test(world_size=[2]) - def _test_zerooneadam_checkpointing_overflow(args, model, hidden_dim): - model, _, _, _ = deepspeed.initialize(args=args, - model=model, - model_parameters=model.parameters()) - data_loader = random_dataloader(model=model, - total_samples=100, - hidden_dim=hidden_dim, - device=model.device) - save_folder = os.path.join(tmpdir, 'saved_checkpoint') - for n, batch in enumerate(data_loader): - loss = model(batch[0], batch[1]) - if dist.get_rank() == 0 and n >= 10: - loss = loss * 1000000.0 - 
model.backward(loss) - dist.barrier() - model.step() - dist.barrier() - model.save_checkpoint(save_folder, tag=None) - - _test_zerooneadam_checkpointing_overflow(args=args, - model=model, - hidden_dim=hidden_dim) - - -@pytest.mark.parametrize('topo', - [ - PipeTopo(num_pp=1, - num_dp=4), - PipeTopo(num_pp=2, - num_dp=2), - PipeTopo(num_pp=4, - num_dp=1), - ]) -def test_zerooneadam_fp16_pipeline(topo, tmpdir): - config_dict = { - "train_batch_size": 16, - "train_micro_batch_size_per_gpu": 4, - "steps_per_print": 20, - "optimizer": { - "type": "ZeroOneAdam", - "params": { - "lr": 0.00001, - "betas": [0.9, - 0.999], - "eps": 1e-8, - "weight_decay": 3e-7, - "var_freeze_step": 4, - "var_update_scaler": 1, - "local_step_scaler": 1, - "local_step_clipper": 2, - "cuda_aware": False, - "comm_backend_name": "nccl" - } - }, - "gradient_clipping": 1.0, - "zero_optimization": { - "stage": 0 - }, - "fp16": { - "enabled": True, - "loss_scale": 0, - "initial_scale_power": 16 - }, - "pipeline": { - "seed_layers": True, - "activation_checkpoint_interval": 1 - } - } - args = args_from_dict(tmpdir, config_dict) - - # Allocate model for consistent initial weights. - init_net = AlexNetPipe() - - @distributed_test(world_size=4) - def _helper(topo, tmpdir, steps=500): - assert steps >= 100 - - test_net = copy.deepcopy(init_net) - test_model = PipelineModule(layers=test_net.to_layers(), - topology=topo, - loss_fn=nn.CrossEntropyLoss()) - - test_losses = train_cifar(test_model, - args, - num_steps=steps, - fp16=config_dict['fp16']['enabled']) - - _helper(topo, tmpdir) - - -def test_onebitlamb_fp16_basic(tmpdir): - config_dict = { - "train_batch_size": 2, - "steps_per_print": 1, - "optimizer": { - "type": "OneBitLamb", - "params": { - "lr": 0.00015, - "weight_decay": 0.01, - "max_coeff": 0.3, - "min_coeff": 0.01, - "freeze_step": 2, - "cuda_aware": False, - "comm_backend_name": "nccl", - "coeff_beta": 0.9, - "factor_max": 1.0, - "factor_min": 0.5, - "factor_threshold": 0.1 - } - }, - "gradient_clipping": 1.0, - "fp16": { - "enabled": True, - "loss_scale": 0, - "initial_scale_power": 16 - } - } - args = args_from_dict(tmpdir, config_dict) - hidden_dim = 10 - - model = SimpleModel(hidden_dim) - - @distributed_test(world_size=[1, 2]) - def _test_onebitlamb_fp16_basic(args, model, hidden_dim): - model, _, _, _ = deepspeed.initialize(args=args, - model=model, - model_parameters=model.parameters()) - data_loader = random_dataloader(model=model, - total_samples=50, - hidden_dim=hidden_dim, - device=model.device) - for n, batch in enumerate(data_loader): - loss = model(batch[0], batch[1]) - model.backward(loss) - model.step() - - _test_onebitlamb_fp16_basic(args=args, model=model, hidden_dim=hidden_dim) - - -def test_onebitlamb_fp32_basic(tmpdir): - config_dict = { - "train_batch_size": 2, - "steps_per_print": 1, - "optimizer": { - "type": "OneBitLamb", - "params": { - "lr": 0.00015, - "weight_decay": 0.01, - "max_coeff": 0.3, - "min_coeff": 0.01, - "freeze_step": 2, - "cuda_aware": False, - "comm_backend_name": "nccl", - "coeff_beta": 0.9, - "factor_max": 1.0, - "factor_min": 0.5, - "factor_threshold": 0.1 - } - }, - "gradient_clipping": 1.0, - } - args = args_from_dict(tmpdir, config_dict) - hidden_dim = 10 - - model = SimpleModel(hidden_dim) - - @distributed_test(world_size=[1, 2]) - def _test_onebitlamb_fp32_basic(args, model, hidden_dim): - model, _, _, _ = deepspeed.initialize(args=args, - model=model, - model_parameters=model.parameters()) - data_loader = random_dataloader(model=model, - total_samples=50, - 
hidden_dim=hidden_dim, - device=model.device, - dtype=torch.float) - for n, batch in enumerate(data_loader): - loss = model(batch[0], batch[1]) - model.backward(loss) - model.step() - - _test_onebitlamb_fp32_basic(args=args, model=model, hidden_dim=hidden_dim) - - -def test_onebitlamb_exp_avg_mask(tmpdir): - config_dict = { - "train_batch_size": 2, - "steps_per_print": 1, - "optimizer": { - "type": "OneBitLamb", - "params": { - "lr": 0.00015, - "weight_decay": 0.01, - "max_coeff": 0.3, - "min_coeff": 0.01, - "freeze_step": 2, - "cuda_aware": False, - "comm_backend_name": "nccl", - "coeff_beta": 0.9, - "factor_max": 1.0, - "factor_min": 0.5, - "factor_threshold": 0.1 - } - }, - "gradient_clipping": 1.0, - "fp16": { - "enabled": True, - "loss_scale": 0, - "initial_scale_power": 16 - } - } - args = args_from_dict(tmpdir, config_dict) - hidden_dim = 10 - - model = SimpleModel(hidden_dim) - param_optimizer = list(model.named_parameters()) - mask1 = torch.zeros_like(param_optimizer[0][1].data) - for col in range(mask1.size()[1]): - mask1[0][col] += 1 - optimizer_grouped_parameters = [{ - 'params': [param_optimizer[0][1]], - 'weight_decay': 0.01, - 'exp_avg_mask': mask1 - }, - { - 'params': [param_optimizer[1][1]], - 'weight_decay': 0.01 - }] - - @distributed_test(world_size=[2]) - def _test_onebitlamb_exp_avg_mask(args, model, hidden_dim): - model, optimizer, _, _ = deepspeed.initialize(args=args, - model=model, - model_parameters=optimizer_grouped_parameters) - data_loader = random_dataloader(model=model, - total_samples=50, - hidden_dim=hidden_dim, - device=model.device) - for n, batch in enumerate(data_loader): - loss = model(batch[0], batch[1]) - model.backward(loss) - model.step() - # Test whether the momentum mask works - for v in optimizer.state.values(): - if v['exp_avg'].size() == mask1.size(): - assert torch.allclose(v['exp_avg'], v['exp_avg'].mul_(mask1.to(device=v['exp_avg'].device)), atol=1e-07), f"Momentum mask is not working properly" - - _test_onebitlamb_exp_avg_mask(args=args, model=model, hidden_dim=hidden_dim) - - -def test_onebitlamb_checkpointing(tmpdir): - config_dict = { - "train_batch_size": 2, - "steps_per_print": 1, - "optimizer": { - "type": "OneBitLamb", - "params": { - "lr": 0.00015, - "weight_decay": 0.01, - "max_coeff": 0.3, - "min_coeff": 0.01, - "freeze_step": 2, - "cuda_aware": False, - "comm_backend_name": "nccl", - "coeff_beta": 0.9, - "factor_max": 1.0, - "factor_min": 0.5, - "factor_threshold": 0.1 - } - }, - "gradient_clipping": 1.0, - "fp16": { - "enabled": True, - "loss_scale": 0, - "initial_scale_power": 16 - } - } - args = args_from_dict(tmpdir, config_dict) - hidden_dim = 10 - - model = SimpleModel(hidden_dim) - param_optimizer = list(model.named_parameters()) - mask1 = torch.zeros_like(param_optimizer[0][1].data) - mask2 = torch.zeros_like(param_optimizer[0][1].data) - for col in range(mask1.size()[1]): - mask1[0][col] += 1 - mask2[1][col] += 1 - - optimizer_grouped_parameters_1 = [{ - 'params': [param_optimizer[0][1]], - 'weight_decay': 0.01, - 'exp_avg_mask': mask1 - }, - { - 'params': [param_optimizer[1][1]], - 'weight_decay': 0.01 - }] - - optimizer_grouped_parameters_2 = [{ - 'params': [param_optimizer[0][1]], - 'weight_decay': 0.01, - 'exp_avg_mask': mask2 - }, - { - 'params': [param_optimizer[1][1]], - 'weight_decay': 0.01 - }] - - optimizer_grouped_parameters_3 = [{ - 'params': [param_optimizer[0][1]], - 'weight_decay': 0.01 - }, - { - 'params': [param_optimizer[1][1]], - 'weight_decay': 0.01 - }] - - @distributed_test(world_size=[2]) - def 
_test_onebitlamb_checkpointing(mask1, mask2, args, model, hidden_dim): - model_1, optimizer_1, _, _ = deepspeed.initialize(args=args, - model=model, - model_parameters=optimizer_grouped_parameters_1) - data_loader = random_dataloader(model=model_1, - total_samples=10, - hidden_dim=hidden_dim, - device=model_1.device) - for n, batch in enumerate(data_loader): - loss = model_1(batch[0], batch[1]) - model_1.backward(loss) - model_1.step() - # Test whether momentum mask still exist after saving checkpoint - assert optimizer_1.optimizer.lamb_freeze_key is True - mask1 = mask1.to(device=optimizer_1.param_groups[0]['exp_avg_mask'].device) - assert torch.allclose(optimizer_1.param_groups[0]['exp_avg_mask'], mask1, atol=1e-07), f"Incorrect momentum mask" - scaling_coeff_1 = [] - for v in optimizer_1.state.values(): - assert 'scaling_coeff' in v, f"Incorrect scaling_coeff" - scaling_coeff_1.append(v['scaling_coeff']) - save_folder = os.path.join(tmpdir, 'saved_checkpoint') - model_1.save_checkpoint(save_folder, tag=None) - assert torch.allclose(optimizer_1.param_groups[0]['exp_avg_mask'], mask1, atol=1e-07), f"Momentum mask should not change after saving checkpoint" - - - model_2, optimizer_2, _, _ = deepspeed.initialize(args=args, - model=model, - model_parameters=optimizer_grouped_parameters_2) - # Test whether momentum mask stays the same after loading checkpoint - mask2 = mask2.to(device=optimizer_2.param_groups[0]['exp_avg_mask'].device) - assert torch.allclose(optimizer_2.param_groups[0]['exp_avg_mask'], mask2, atol=1e-07), f"Incorrect momentum mask" - model_2.load_checkpoint(save_folder, - tag=None, - load_optimizer_states=True, - load_lr_scheduler_states=True) - assert torch.allclose(optimizer_2.param_groups[0]['exp_avg_mask'], mask2, atol=1e-07), f"Momentum mask should not change after loading checkpoint" - # Test whether worker&server error is reset - assert len(optimizer_2.optimizer.worker_errors) == 0, f"Incorrect worker error" - assert len(optimizer_2.optimizer.server_errors) == 0, f"Incorrect server error" - # Test whether scaling_coeffs is loaded correctly - scaling_coeff_2 = [] - for v in optimizer_2.state.values(): - assert 'scaling_coeff' in v, f"Incorrect scaling_coeff" - scaling_coeff_2.append(v['scaling_coeff']) - assert list(sorted(scaling_coeff_2)) == list(sorted(scaling_coeff_1)), f"Incorrect scaling_coeffs" - assert optimizer_2.optimizer.lamb_freeze_key is True - - model_3, optimizer_3, _, _ = deepspeed.initialize(args=args, - model=model, - model_parameters=optimizer_grouped_parameters_3) - optimizer_3.optimizer.freeze_step = 20 - data_loader = random_dataloader(model=model_3, - total_samples=50, - hidden_dim=hidden_dim, - device=model_3.device) - for n, batch in enumerate(data_loader): - loss = model_3(batch[0], batch[1]) - model_3.backward(loss) - model_3.step() - assert optimizer_3.optimizer.lamb_freeze_key is True - # Test whether momentum mask stays the same after loading checkpoint - assert 'exp_avg_mask' not in optimizer_3.param_groups[0], f"Incorrect momentum mask" - model_3.load_checkpoint(save_folder, - tag=None, - load_optimizer_states=True, - load_lr_scheduler_states=True) - assert 'exp_avg_mask' not in optimizer_3.param_groups[0], f"Momentum mask should not change after loading checkpoint" - # Test whether worker&server error is reset - assert len(optimizer_3.optimizer.worker_errors) == 0, f"Incorrect worker error" - assert len(optimizer_3.optimizer.server_errors) == 0, f"Incorrect server error" - # Test whether scaling_coeffs, lamb_coeff_freeze, last_factor 
are reset - for v in optimizer_3.state.values(): - assert v['lamb_coeff_freeze'] == 0.0, f"Incorrect lamb_coeff_freeze" - assert v['last_factor'] == 1.0, f"Incorrect last_factor" - assert 'scaling_coeff' not in v, f"Incorrect scaling_coeff" - assert optimizer_3.optimizer.lamb_freeze_key is False - - _test_onebitlamb_checkpointing(mask1, - mask2, - args=args, - model=model, - hidden_dim=hidden_dim) - - -def test_onebitlamb_checkpointing_overflow(tmpdir): - config_dict = { - "train_batch_size": 2, - "steps_per_print": 1, - "optimizer": { - "type": "OneBitLamb", - "params": { - "lr": 0.00015, - "weight_decay": 0.01, - "max_coeff": 0.3, - "min_coeff": 0.01, - "freeze_step": 2, - "cuda_aware": False, - "comm_backend_name": "nccl", - "coeff_beta": 0.9, - "factor_max": 1.0, - "factor_min": 0.5, - "factor_threshold": 0.1 - } - }, - "gradient_clipping": 1.0, - "fp16": { - "enabled": True, - "loss_scale": 0, - "initial_scale_power": 16 - } - } - args = args_from_dict(tmpdir, config_dict) - hidden_dim = 10 - - model = SimpleModel(hidden_dim) - - @distributed_test(world_size=[2]) - def _test_onebitlamb_checkpointing_overflow(args, model, hidden_dim): - model, _, _, _ = deepspeed.initialize(args=args, - model=model, - model_parameters=model.parameters()) - data_loader = random_dataloader(model=model, - total_samples=100, - hidden_dim=hidden_dim, - device=model.device) - save_folder = os.path.join(tmpdir, 'saved_checkpoint') - for n, batch in enumerate(data_loader): - loss = model(batch[0], batch[1]) - if dist.get_rank() == 0 and n >= 10: - loss = loss * 1000000.0 - model.backward(loss) - dist.barrier() - model.step() - dist.barrier() - model.save_checkpoint(save_folder, tag=None) - - _test_onebitlamb_checkpointing_overflow(args=args, - model=model, - hidden_dim=hidden_dim) - - -@pytest.mark.parametrize('topo', - [ - PipeTopo(num_pp=1, - num_dp=4), - PipeTopo(num_pp=2, - num_dp=2), - PipeTopo(num_pp=4, - num_dp=1), - ]) -def test_onebitlamb_fp16_pipeline(topo, tmpdir): - config_dict = { - "train_batch_size": 16, - "train_micro_batch_size_per_gpu": 4, - "steps_per_print": 20, - "optimizer": { - "type": "OneBitLamb", - "params": { - "lr": 0.00001, - "betas": [0.9, - 0.999], - "eps": 1e-8, - "weight_decay": 3e-7, - "freeze_step": 200, - "cuda_aware": False, - "comm_backend_name": "nccl" - } - }, - "gradient_clipping": 1.0, - "zero_optimization": { - "stage": 0 - }, - "fp16": { - "enabled": True, - "loss_scale": 0, - "initial_scale_power": 16 - }, - "pipeline": { - "seed_layers": True, - "activation_checkpoint_interval": 1 - } - } - args = args_from_dict(tmpdir, config_dict) - - # Allocate model for consistent initial weights. 
- init_net = AlexNetPipe() - - @distributed_test(world_size=4) - def _helper(topo, tmpdir, steps=500): - assert steps >= 100 - - test_net = copy.deepcopy(init_net) - test_model = PipelineModule(layers=test_net.to_layers(), - topology=topo, - loss_fn=nn.CrossEntropyLoss()) - - test_losses = train_cifar(test_model, - args, - num_steps=steps, - fp16=config_dict['fp16']['enabled']) - - _helper(topo, tmpdir) - - -@pytest.mark.sequential -def test_compressed_allreduce_basic(tmpdir): - @distributed_test(world_size=[1, 2]) - def _test_compressed_allreduce_basic(): - from deepspeed.runtime.comm.nccl import NcclBackend - size = dist.get_world_size() - rank = dist.get_rank() - backend = NcclBackend() - local_rank = dist.get_rank() - device = torch.device("cuda", dist.get_rank()) - - # A simulated compression function using torch.distributed - def torch_sim(a): - a_sign = a.sign().add_(1).bool().float().add_(-0.5).mul_(2.0) - scale = a.norm() / np.sqrt(a.numel()) - a_compressed = scale * a_sign - a_sign = None - worker_error = a - a_compressed - dist.all_reduce(a_compressed) - a_compressed.mul_(1 / dist.get_world_size()) - a_server_sign = a_compressed.sign().add_(1).bool().float().add_(-0.5).mul_( - 2.0) - a_list = torch.chunk(a_compressed, chunks=dist.get_world_size()) - server_scale = [ - chunk_a.norm() / np.sqrt(chunk_a.numel()) for chunk_a in a_list - ] - a_sign_list = torch.chunk(a_server_sign, dist.get_world_size()) - a_server_compressed = torch.cat( - [server_scale[i] * a_sign_list[i] for i in range(dist.get_world_size())]) - rank = dist.get_rank() - server_error = a_list[rank] - server_scale[rank] * a_sign_list[rank] - torch.cuda.synchronize() - torch.distributed.barrier() - return a_server_compressed, worker_error, server_error - - tensor_size = 300 * 2**20 - server_size = int(tensor_size / size) - if tensor_size % (8 * size) != 0: - right_tensor_size = tensor_size + (8 * size - (tensor_size % (8 * size))) - else: - right_tensor_size = tensor_size - right_server_size = right_tensor_size // size - - # Adding bias to the initialization of the gradient we are communicating - # In order to get rid of the case where some elements in the gradient are too small - a = (torch.rand(tensor_size, device=device) - 0.5) + 0.01 * rank - - worker_error = torch.zeros(right_tensor_size, device=device) - server_error = torch.zeros(right_server_size, device=device) - - a_torch, worker_error_torch, server_error_torch = torch_sim(a) - torch.cuda.empty_cache() - - a_after = backend.compressed_allreduce(a, worker_error, server_error, local_rank) - - threshold = 1e-6 - magnitude_threshold = 1e-6 - diff_mask = (a_after - a_torch) > threshold - diff_server_mask = torch.chunk(diff_mask, size)[rank] - mpi_server = torch.chunk(a_after, size)[rank] + server_error - torch_server = torch.chunk(a_torch, size)[rank] + server_error_torch - - # If the number in the compensated_server_m is too small (e.g 1e-8), then calling sign() might be problematic - # The test would skip those numbers that are too small in compensated_server_m - check_mag_mask = mpi_server[diff_server_mask] > magnitude_threshold - if torch.sum(check_mag_mask) != 0: - print('Fails at {} of positions'.format(torch.sum(check_mag_mask))) - assert torch.sum(diff_server_mask) == 0 or torch.sum(check_mag_mask) == 0 - - _test_compressed_allreduce_basic() diff --git a/tests/unit/test_output/ds_logs/test/events.out.tfevents.1679370169.9dad78d721ca.29247.0 b/tests/unit/test_output/ds_logs/test/events.out.tfevents.1679370169.9dad78d721ca.29247.0 deleted file mode 100644 
index 25f4ac0ae26c93aace54844e1b8adc2365635d02..0000000000000000000000000000000000000000 Binary files a/tests/unit/test_output/ds_logs/test/events.out.tfevents.1679370169.9dad78d721ca.29247.0 and /dev/null differ diff --git a/tests/unit/test_output/ds_logs/test/events.out.tfevents.1679370366.9dad78d721ca.39331.0 b/tests/unit/test_output/ds_logs/test/events.out.tfevents.1679370366.9dad78d721ca.39331.0 deleted file mode 100644 index 365f547ef19ed862babf4222d3fb52450ac14c61..0000000000000000000000000000000000000000 Binary files a/tests/unit/test_output/ds_logs/test/events.out.tfevents.1679370366.9dad78d721ca.39331.0 and /dev/null differ diff --git a/tests/unit/test_partition.py b/tests/unit/test_partition.py deleted file mode 100644 index f766e4596509da0d3219da0e404d0b0ef34bf56a..0000000000000000000000000000000000000000 --- a/tests/unit/test_partition.py +++ /dev/null @@ -1,190 +0,0 @@ -import pytest - -import torch -import torch.distributed as dist - -from deepspeed.runtime.utils import partition_uniform -from deepspeed.runtime.utils import partition_balanced -from deepspeed.runtime.utils import prefix_sum_inc -from deepspeed.runtime.utils import PartitionedTensor - -from .common import distributed_test - - -@distributed_test(world_size=4) -def test_partitioned_tensor(): - world = dist.get_world_size() - rank = dist.get_rank() - - group = dist.new_group(ranks=list(range(world))) - - rows = world * 4 - cols = 3 - - full = torch.rand(rows, cols).cuda() - dist.broadcast(full, src=0, group=group) - part = PartitionedTensor(full, group=group) - - assert len(part.local_size()) == 1 - assert part.local_size()[0] * world == full.numel() - - reconstructed = part.full() - assert torch.equal(full, reconstructed) - - -@distributed_test(world_size=4) -def test_partitioned_tensor_meta(): - world = dist.get_world_size() - rank = dist.get_rank() - - group = dist.new_group(ranks=list(range(world))) - - rows = world * 7 - cols = 3 - - full = torch.rand(rows, cols).cuda() - dist.broadcast(full, src=0, group=group) - part = PartitionedTensor(full, group=group) - - my_meta = PartitionedTensor.from_meta(part.to_meta(), part.local_data, group) - assert torch.equal(full, my_meta.full()) - - -def assert_valid_partition(weights, parts, P): - N = len(weights) - assert len(parts) == P + 1 - assert parts[0] == 0 - assert parts[P] == N - for idx in range(P): - assert parts[idx] <= parts[idx + 1] - - -def get_partition_weights(weights, parts): - """ Return the amount of weight in each partition. 
""" - costs = [0] * (len(parts) - 1) - P = len(parts) - 1 - for p in range(P): - start = parts[p] - stop = parts[p + 1] - costs[p] = sum(weights[start:stop]) - return costs - - -def test_prefix_sum(): - x = [3, 4, 5] - psum = prefix_sum_inc(x) - assert psum == [3, 7, 12] - - -def test_valid_partition(): - N = 10 - P = 1 - weights = [1] * N - parts = partition_balanced(weights, P) - assert_valid_partition(weights, parts, P) - - -def test_short_partition_uniform(): - N = 2 - P = 4 - weights = [1] * N - parts = partition_uniform(len(weights), P) - assert_valid_partition(weights, parts, P) - - -def test_short_partition(): - N = 2 - P = 4 - weights = [1] * N - parts = partition_balanced(weights, P) - assert_valid_partition(weights, parts, P) - - -def test_easy_balance_uniform(): - weights = [1] * 8 - P = 4 - parts = partition_uniform(len(weights), P) - assert_valid_partition(weights, parts, P) - costs = get_partition_weights(weights, parts) - assert all(c == 2 for c in costs) - - -def test_easy_balance_balanced(): - weights = [1] * 8 - P = 4 - parts = partition_balanced(weights, P) - assert_valid_partition(weights, parts, P) - costs = get_partition_weights(weights, parts) - assert all(c == 2 for c in costs), costs - - -def test_int_balanced(): - weights = [0, 1, 2, 3, 3, 3] - P = 4 - parts = partition_balanced(weights, P) - assert parts == [0, 3, 4, 5, 6] - - assert_valid_partition(weights, parts, P) - costs = get_partition_weights(weights, parts) - assert all(c == 3 for c in costs) - - -def test_float_balanced(): - weights = [0., 1.1, 1.9, 3., 3., 3.] - P = 4 - parts = partition_balanced(weights, P) - assert_valid_partition(weights, parts, P) - assert parts == [0, 3, 4, 5, 6] - - -@pytest.mark.skip(reason="Variance-minimizing partitioning returns different result.") -def test_float_lastheavy(): - weights = [0., 1.1, 1.9, 3., 30.] - P = 2 - parts = partition_balanced(weights, P) - assert_valid_partition(weights, parts, P) - assert parts == [0, 4, 5] - - -def test_float_midheavy(): - weights = [0., 1.1, 30, 3.] 
- P = 3 - parts = partition_balanced(weights, P) - assert_valid_partition(weights, parts, P) - assert parts == [0, 2, 3, 4] - - -def test_balance_bert(): - # Parameters per layer for a transformer model with 24 transformers and hidden dim 1024 - weights = [ - 52559872, - 12596224, - 12596224, - 12596224, - 12596224, - 12596224, - 12596224, - 12596224, - 12596224, - 12596224, - 12596224, - 12596224, - 12596224, - 12596224, - 12596224, - 12596224, - 12596224, - 12596224, - 12596224, - 12596224, - 12596224, - 12596224, - 12596224, - 12596224, - 12596224, - 0, - 52559872 - ] - P = 8 - parts = partition_balanced(weights, P) - assert_valid_partition(weights, parts, P) diff --git a/tests/unit/test_pipe.py b/tests/unit/test_pipe.py deleted file mode 100644 index f7f2b1a1eff4d6bfea06266a596eb7738a75ef12..0000000000000000000000000000000000000000 --- a/tests/unit/test_pipe.py +++ /dev/null @@ -1,269 +0,0 @@ -import os -import copy - -import torch -import torch.nn as nn -import torch.nn.functional as F -import torch.distributed as dist - -import pytest - -import deepspeed -import deepspeed.runtime.utils as ds_utils - - -from deepspeed.runtime.pipe.topology import PipeDataParallelTopology, PipeModelDataParallelTopology - -PipeTopo = PipeDataParallelTopology -from deepspeed.runtime.pipe.module import PipelineModule, LayerSpec - -from .common import distributed_test - - -def rel_diff(A, B): - return abs(A - B) / abs(A) - - -# All models -from .simple_model import args_from_dict - - -class AlexNet(nn.Module): - def __init__(self, num_classes=10): - super(AlexNet, self).__init__() - self.features = nn.Sequential( - nn.Conv2d(3, - 64, - kernel_size=11, - stride=4, - padding=5), - nn.ReLU(inplace=True), - nn.MaxPool2d(kernel_size=2, - stride=2), - nn.Conv2d(64, - 192, - kernel_size=5, - padding=2), - nn.ReLU(inplace=True), - nn.MaxPool2d(kernel_size=2, - stride=2), - nn.Conv2d(192, - 384, - kernel_size=3, - padding=1), - nn.ReLU(inplace=True), - nn.Conv2d(384, - 256, - kernel_size=3, - padding=1), - nn.ReLU(inplace=True), - nn.Conv2d(256, - 256, - kernel_size=3, - padding=1), - nn.ReLU(inplace=True), - nn.MaxPool2d(kernel_size=2, - stride=2), - ) - self.classifier = nn.Linear(256, num_classes) - self.loss_fn = nn.CrossEntropyLoss() - - def forward(self, x, y): - x = self.features(x) - x = x.view(x.size(0), -1) - x = self.classifier(x) - return self.loss_fn(x, y) - - -class AlexNetPipe(AlexNet): - def to_layers(self): - layers = [*self.features, lambda x: x.view(x.size(0), -1), self.classifier] - return layers - - -class AlexNetPipeSpec(PipelineModule): - def __init__(self, num_classes=10, **kwargs): - self.num_classes = num_classes - specs = [ - LayerSpec(nn.Conv2d, 3, 64, kernel_size=11, stride=4, padding=5), - LayerSpec(nn.ReLU, inplace=True), - LayerSpec(nn.MaxPool2d, kernel_size=2, stride=2), - LayerSpec(nn.Conv2d, 64, 192, kernel_size=5, padding=2), - F.relu, - LayerSpec(nn.MaxPool2d, kernel_size=2, stride=2), - LayerSpec(nn.Conv2d, 192, 384, kernel_size=3, padding=1), - F.relu, - LayerSpec(nn.Conv2d, 384, 256, kernel_size=3, padding=1), - F.relu, - LayerSpec(nn.Conv2d, 256, 256, kernel_size=3, padding=1), - F.relu, - LayerSpec(nn.MaxPool2d, kernel_size=2, stride=2), - - lambda x: x.view(x.size(0), -1), - LayerSpec(nn.Linear, 256, self.num_classes), # classifier - ] - super().__init__(layers=specs, loss_fn=nn.CrossEntropyLoss(), **kwargs) - - -def cifar_trainset(fp16=False): - import torchvision - import torchvision.transforms as transforms - - transform_list = [ - transforms.ToTensor(), - 
transforms.Normalize((0.5, - 0.5, - 0.5), - (0.5, - 0.5, - 0.5)), - ] - if fp16: - transform_list.append(torchvision.transforms.Lambda(lambda x: x.half())) - - transform = transforms.Compose(transform_list) - - local_rank = torch.cuda.current_device() - - # Only one rank per machine downloads. - dist.barrier() - if local_rank != 0: - dist.barrier() - trainset = torchvision.datasets.CIFAR10(root='/tmp/cifar10-data', - train=True, - download=True, - transform=transform) - if local_rank == 0: - dist.barrier() - return trainset - - -def train_cifar(model, args, num_steps=400, average_dp_losses=True, fp16=True, seed=123): - with torch.random.fork_rng(devices=[torch.cuda.current_device()]): - ds_utils.set_random_seed(seed) - - # disable dropout - model.eval() - - trainset = cifar_trainset(fp16=fp16) - args.local_rank = dist.get_rank() - - engine, _, _, _ = deepspeed.initialize( - args=args, - model=model, - model_parameters=[p for p in model.parameters()], - training_data=trainset) - - losses = [] - for step in range(num_steps): - loss = engine.train_batch() - losses.append(loss.item()) - if step % 50 == 0 and dist.get_rank() == 0: - print(f'STEP={step} LOSS={loss.item()}') - - if average_dp_losses: - loss_tensor = torch.tensor(losses).cuda() - dist.all_reduce(loss_tensor) - loss_tensor /= dist.get_world_size() - losses = loss_tensor.tolist() - - return losses - - -@pytest.mark.skip(reason="been seeing nondeterministic failures, skipping for now") -@pytest.mark.parametrize('topo', - [ - PipeTopo(num_pp=1, - num_dp=4), - PipeTopo(num_pp=2, - num_dp=2), - PipeTopo(num_pp=4, - num_dp=1), - ]) -def test_pipe_cifar10(topo, tmpdir): - config_dict = { - "train_batch_size": 16, - "train_micro_batch_size_per_gpu": 4, - "steps_per_print": 20, - "optimizer": { - "type": "Adam", - "params": { - "lr": 0.001, - "betas": [0.9, - 0.999], - "eps": 1e-8, - "weight_decay": 3e-7 - } - }, - "zero_optimization": { - "stage": 0 - }, - "fp16": { - "enabled": False - }, - "pipeline": { - "seed_layers": True, - "activation_checkpoint_interval": 1 - } - } - args = args_from_dict(tmpdir, config_dict) - - # Allocate model for consistent initial weights. 
- init_net = AlexNetPipe() - - @distributed_test(world_size=4) - def _helper(topo, tmpdir, steps=500): - assert steps >= 100 - - base_net = copy.deepcopy(init_net) - base_model = PipelineModule(layers=base_net.to_layers(), - num_stages=1, - loss_fn=nn.CrossEntropyLoss()) - - # Train with just data parallelism - base_losses = train_cifar(base_model, - args, - num_steps=steps, - fp16=config_dict['fp16']['enabled']) - - test_net = copy.deepcopy(init_net) - test_model = PipelineModule(layers=test_net.to_layers(), - topology=topo, - loss_fn=nn.CrossEntropyLoss()) - - #test_model = AlexNetPipe(num_classes=10, - # topology=test_topo, - # seed_layers=config_dict['pipeline']['seed_layers']) - test_losses = train_cifar(test_model, - args, - num_steps=steps, - fp16=config_dict['fp16']['enabled']) - - abs_diffs = [l0 - l1 for l0, l1 in zip(base_losses, test_losses)] - rel_diffs = [rel_diff(l0, l1) for l0, l1 in zip(base_losses, test_losses)] - if dist.get_rank() == 0: - print( - f'abs min={min(abs_diffs)} max={max(abs_diffs)} avg={sum(abs_diffs)/len(abs_diffs)}' - ) - print( - f'rel min={min(rel_diffs)} max={max(rel_diffs)} avg={sum(rel_diffs)/len(rel_diffs)}' - ) - print( - f'first: base={base_losses[0]} test={test_losses[0]} abs={abs_diffs[0]} rel={rel_diffs[0]}' - ) - - for lastX in [1, 10, 100]: - base_avg = sum(base_losses[-lastX:]) / lastX - test_avg = sum(test_losses[-lastX:]) / lastX - print( - f'last-{lastX}: base={base_avg} test={test_avg} abs={base_avg - test_avg} rel={rel_diff(base_avg, test_avg)}' - ) - - lastX = 100 - base = base_losses[-lastX:] - base_avg = sum(base) / len(base) - test = test_losses[-lastX:] - test_avg = sum(test) / len(test) - assert rel_diff(base_avg, test_avg) < 0.03 - - _helper(topo, tmpdir) diff --git a/tests/unit/test_pipe_module.py b/tests/unit/test_pipe_module.py deleted file mode 100644 index 281101492c37585df23fe0d7e5292141be987355..0000000000000000000000000000000000000000 --- a/tests/unit/test_pipe_module.py +++ /dev/null @@ -1,102 +0,0 @@ -import copy - -import torch -import torch.nn as nn -import torch.distributed as dist - -import pytest - -import deepspeed - -from deepspeed.runtime.pipe.topology import PipeDataParallelTopology, PipeModelDataParallelTopology - -PipeTopo = PipeDataParallelTopology - -from deepspeed.pipe import PipelineModule, LayerSpec -from deepspeed.utils import RepeatingLoader - -from .common import distributed_test -from .simple_model import args_from_dict - -HIDDEN_DIM = 32 -LAYERS = 8 - - -@pytest.fixture -def sequential_model(): - model = torch.nn.Sequential( - *[nn.Linear(HIDDEN_DIM, - HIDDEN_DIM) for _ in range(LAYERS)], - nn.Linear(HIDDEN_DIM, - 1), - ) - return model - - -@pytest.fixture -def simple_args(tmpdir): - config_dict = { - "train_batch_size": 1, - "train_micro_batch_size_per_gpu": 1, - "steps_per_print": 1, - "optimizer": { - "type": "Adam", - "params": { - "lr": 0.001, - "betas": [0.9, - 0.999], - "eps": 1e-8, - "weight_decay": 3e-7 - } - }, - "pipeline": { - "activation_checkpoint_interval": 1 - } - } - args = args_from_dict(tmpdir, config_dict) - return args - - -def test_pipe_module_sequential(sequential_model, simple_args): - batch_input = torch.randn(1, HIDDEN_DIM) - - @distributed_test(world_size=4) - def _helper(): - base_model = copy.deepcopy(sequential_model) - base_input = batch_input.clone().detach() - base_output = base_model(base_input) - base_output = base_output - base_params = sum(p.numel() for p in base_model.parameters()) - - pipe_model = copy.deepcopy(sequential_model) - pipe_model = 
PipelineModule(layers=pipe_model, num_stages=4) - - # Ensure all parameters are accounted for. - my_params = sum(p.numel() for p in pipe_model.parameters()) - total_pipe_params = torch.LongTensor([my_params]).to('cuda') - dist.all_reduce(total_pipe_params) - total_pipe_params = total_pipe_params.item() - assert total_pipe_params == base_params - - pipe_model, _, _, _ = deepspeed.initialize( - args=simple_args, - model=pipe_model, - model_parameters=[p for p in pipe_model.parameters()]) - - if pipe_model.is_first_stage or pipe_model.is_last_stage: - pipe_input = base_input.clone().detach().to('cuda') - # label 0 is meaningless - dataset = [(pipe_input, 0)] - loader = RepeatingLoader(dataset) - data_iter = iter(loader) - else: - data_iter = None - - pipe_output = pipe_model.eval_batch(data_iter=data_iter) - - base_output = base_output.to('cpu') - pipe_output = pipe_output.to('cpu') - - assert torch.allclose(base_output, pipe_output, atol=1e-4) - - _helper() diff --git a/tests/unit/test_pipe_schedule.py b/tests/unit/test_pipe_schedule.py deleted file mode 100644 index 8c65f3d7614be291ca9b194d3b746f8a68679d5c..0000000000000000000000000000000000000000 --- a/tests/unit/test_pipe_schedule.py +++ /dev/null @@ -1,157 +0,0 @@ -import pytest - -import deepspeed.runtime.pipe.schedule as schedule - - -def _count_type(cmds, classtype): - return len(list(filter(lambda c: type(c) == classtype, cmds))) - - -def test_pipe_inference_schedule_singlestage(): - sched = schedule.InferenceSchedule(micro_batches=4, stages=1, stage_id=0) - assert sched.num_micro_batches == 4 - full = list(iter(sched)) - for idx, cmds in enumerate(full): - assert len(cmds) == 2 - assert type(cmds[0]) == schedule.LoadMicroBatch - assert type(cmds[1]) == schedule.ForwardPass - assert cmds[0].buffer_id == cmds[1].buffer_id - assert len(full) == sched.num_micro_batches - - -def test_pipe_train_schedule_singlestage(): - sched = schedule.TrainSchedule(micro_batches=4, stages=1, stage_id=0) - assert sched.num_micro_batches == 4 - full = list(iter(sched)) - print() - for idx, cmds in enumerate(full): - print(idx, cmds) - #assert len(cmds) == 2 - #assert type(cmds[0]) == schedule.LoadMicroBatch - #assert type(cmds[1]) == schedule.ForwardPass - #assert cmds[0].buffer_id == cmds[1].buffer_id - #assert len(full) == sched.num_micro_batches - - -@pytest.mark.parametrize('micro_batches', [1, 3, 8, 10]) -def test_pipe_inference_schedule_firststage(micro_batches, stages=3, verbose=False): - sched = schedule.InferenceSchedule(micro_batches=micro_batches, - stages=stages, - stage_id=0) - assert sched.num_micro_batches == micro_batches - full = list(iter(sched)) - if verbose: - print() - for idx, cmds in enumerate(full): - if verbose: - print(idx, cmds) - # Ensure we don't send an activation the first step - if idx == 0: - assert len(cmds) == 2 - assert type(cmds[0]) == schedule.LoadMicroBatch - assert type(cmds[1]) == schedule.ForwardPass - assert cmds[0].buffer_id == cmds[1].buffer_id - continue - - # the last active step is only a send - if idx == sched.num_micro_batches: - assert len(cmds) == 1 - assert type(cmds[0]) == schedule.SendActivation - continue - - # no work later on - if idx > sched.num_micro_batches: - assert len(cmds) == 0 - continue - - # Normally we need to load/forward/send - assert len(cmds) == 3 - assert _count_type(cmds, schedule.LoadMicroBatch) == 1 - assert _count_type(cmds, schedule.ForwardPass) == 1 - assert _count_type(cmds, schedule.SendActivation) == 1 - assert len(full) == micro_batches + stages - 1 - - 
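Aside (not part of the patch): the schedule tests being removed above enumerate a DeepSpeed pipeline schedule step by step and count the instruction types emitted at each step. A minimal, self-contained sketch of that same inspection pattern is shown below; it assumes DeepSpeed is installed and uses only the classes the deleted test itself imports (`InferenceSchedule`, `LoadMicroBatch`, `ForwardPass`, `SendActivation`). The helper names `count_type` and `summarize_first_stage` are illustrative, not part of the deleted file.

    # Sketch: enumerate a first-stage inference schedule the way the removed
    # tests/unit/test_pipe_schedule.py does, and count command types per step.
    # Requires a working DeepSpeed installation.
    import deepspeed.runtime.pipe.schedule as schedule

    def count_type(cmds, classtype):
        # Number of commands of a given instruction class in one schedule step.
        return sum(1 for c in cmds if type(c) == classtype)

    def summarize_first_stage(micro_batches=4, stages=3):
        sched = schedule.InferenceSchedule(micro_batches=micro_batches,
                                           stages=stages,
                                           stage_id=0)
        # Iterating the schedule yields, for each step, the list of commands
        # that this pipeline stage should execute.
        for idx, cmds in enumerate(sched):
            print(idx,
                  'load:', count_type(cmds, schedule.LoadMicroBatch),
                  'fwd:', count_type(cmds, schedule.ForwardPass),
                  'send:', count_type(cmds, schedule.SendActivation))

    if __name__ == '__main__':
        summarize_first_stage()

Per the assertions in the deleted test, the first step is load + forward only, steady-state steps are load + forward + send, the step at index `num_micro_batches` is a lone send, and later steps are empty.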
-@pytest.mark.parametrize('micro_batches', [1, 3, 8, 10]) -def test_pipe_inference_schedule_midstage(micro_batches, stages=3, verbose=False): - sched = schedule.InferenceSchedule(micro_batches=micro_batches, - stages=stages, - stage_id=1) - - full = list(iter(sched)) - if verbose: - print() - for idx, cmds in enumerate(full): - if verbose: - print(idx, cmds) - if idx < sched.stage: - assert len(cmds) == 0 - continue - if idx == sched.stage + sched.num_micro_batches: - assert len(cmds) == 1 - assert type(cmds[0]) == schedule.SendActivation - continue - if idx > sched.stage + sched.num_micro_batches: - assert len(cmds) == 0 - continue - assert _count_type(cmds, schedule.LoadMicroBatch) == 0 - assert _count_type(cmds, schedule.ForwardPass) == 1 - assert _count_type(cmds, schedule.RecvActivation) == 1 - if idx > sched.stage: - assert _count_type(cmds, schedule.SendActivation) == 1 - assert len(full) == micro_batches + stages - 1 - - -@pytest.mark.parametrize('micro_batches', [1, 3, 8, 10]) -def test_pipe_inference_schedule_laststage(micro_batches, stages=3, verbose=False): - sched = schedule.InferenceSchedule(micro_batches=micro_batches, - stages=stages, - stage_id=2) - full = list(iter(sched)) - if verbose: - print() - for idx, cmds in enumerate(full): - if verbose: - print(idx, cmds) - if idx < sched.stage or idx > sched.stage + sched.num_micro_batches: - assert len(cmds) == 0 - continue - assert _count_type(cmds, schedule.LoadMicroBatch) == 1 - assert _count_type(cmds, schedule.ForwardPass) == 1 - assert _count_type(cmds, schedule.RecvActivation) == 1 - assert _count_type(cmds, schedule.SendActivation) == 0 - assert len(full) == micro_batches + stages - 1 - - -def test_pipe_schedule_firststage(): - sched = schedule.TrainSchedule(micro_batches=8, stages=3, stage_id=0) - for cmds in sched: - assert all(instr.__class__ != schedule.SendGrad for instr in cmds) - assert all(instr.__class__ != schedule.RecvActivation for instr in cmds) - for instr in cmds: - if isinstance(instr, schedule.BufferOpInstruction): - assert 0 <= instr.buffer_id < sched.num_pipe_buffers() - - -def test_pipe_schedule_laststage(): - sched = schedule.TrainSchedule(stages=3, micro_batches=4, stage_id=2) - #assert len(sched) == 2 * (sched.micro_batches + sched.stages - 1) - print() - for cmds in sched: - print(cmds) - assert all(instr.__class__ != schedule.SendActivation for instr in cmds) - assert all(instr.__class__ != schedule.RecvGrad for instr in cmds) - - -def test_pipe_stagequery(): - sched = schedule.TrainSchedule(stages=3, micro_batches=4, stage_id=0) - assert sched.is_first_stage - assert not sched.is_last_stage - - sched = schedule.TrainSchedule(stages=3, micro_batches=4, stage_id=1) - assert not sched.is_first_stage - assert not sched.is_last_stage - - sched = schedule.TrainSchedule(stages=3, micro_batches=4, stage_id=2) - assert not sched.is_first_stage - assert sched.is_last_stage diff --git a/tests/unit/test_pld.py b/tests/unit/test_pld.py deleted file mode 100644 index 5d275d16379cf1fdc3ae596e1bfce22a2b71189c..0000000000000000000000000000000000000000 --- a/tests/unit/test_pld.py +++ /dev/null @@ -1,118 +0,0 @@ -import numpy as np -import deepspeed -import pytest -from deepspeed.runtime.progressive_layer_drop import ProgressiveLayerDrop - -from .common import distributed_test -from .simple_model import SimpleModel, PLD_SimpleModel, SimpleOptimizer, random_dataloader, args_from_dict - - -@pytest.mark.parametrize('theta', [0, 0.1, 0.9, 1.0]) -def test_pld_schedule(tmpdir, theta): - gamma = 0.001 - - pld_scheduler 
= ProgressiveLayerDrop(theta, gamma) - for i in range(10): - pld_scheduler.update_state(i) - expected_theta = (1. - theta) * np.exp(-gamma * i) + theta - actual_theta = pld_scheduler.get_theta() - assert expected_theta == actual_theta - - -@pytest.mark.parametrize('theta', [0, 0.1, 0.9, 1.0]) -def test_pld_model(tmpdir, theta): - gamma = 0.001 - config_dict = { - "train_batch_size": 1, - "steps_per_print": 1, - "optimizer": { - "type": 'Adam', - "params": { - "lr": 0.0001 - } - }, - "fp16": { - "enabled": True - }, - "progressive_layer_drop": { - "enabled": True, - "theta": theta, - "gamma": gamma - } - } - - args = args_from_dict(tmpdir, config_dict) - hidden_dim = 10 - - model = PLD_SimpleModel(hidden_dim, empty_grad=False) - - @distributed_test(world_size=[1]) - def _test_pld_model(args, model, hidden_dim, theta, gamma): - model, _, _, _ = deepspeed.initialize(args=args, - model=model, - model_parameters=model.parameters()) - - data_loader = random_dataloader(model=model, - total_samples=50, - hidden_dim=hidden_dim, - device=model.device) - - for i, batch in enumerate(data_loader): - loss = model(batch[0], batch[1]) - model.backward(loss) - model.step() - - expected_theta = (1. - theta) * np.exp(-gamma * i) + theta - actual_theta = model.get_pld_theta() - assert expected_theta == actual_theta - - _test_pld_model(args=args, - model=model, - hidden_dim=hidden_dim, - theta=theta, - gamma=gamma) - - -def test_non_pld_model(tmpdir): - gamma = 0.001 - theta = 0.5 - config_dict = { - "train_batch_size": 1, - "steps_per_print": 1, - "optimizer": { - "type": 'Adam', - "params": { - "lr": 0.0001 - } - }, - "fp16": { - "enabled": True - }, - "progressive_layer_drop": { - "enabled": True, - "theta": theta, - "gamma": gamma - } - } - - args = args_from_dict(tmpdir, config_dict) - hidden_dim = 10 - - model = SimpleModel(hidden_dim, empty_grad=False) - - @distributed_test(world_size=[1]) - def _test_non_pld_model(args, model, hidden_dim): - model, _, _, _ = deepspeed.initialize(args=args, - model=model, - model_parameters=model.parameters()) - - data_loader = random_dataloader(model=model, - total_samples=1, - hidden_dim=hidden_dim, - device=model.device) - - for i, batch in enumerate(data_loader): - with pytest.raises(TypeError): - loss = model(batch[0], batch[1]) - - _test_non_pld_model(args=args, model=model, hidden_dim=hidden_dim) diff --git a/tests/unit/test_run.py b/tests/unit/test_run.py deleted file mode 100644 index f2b0a8b2018a244becd85f8708b7b6b19be79ae4..0000000000000000000000000000000000000000 --- a/tests/unit/test_run.py +++ /dev/null @@ -1,108 +0,0 @@ -import pytest - -from deepspeed.launcher import runner as dsrun - - -def test_parser_mutual_exclusive(): - '''Ensure dsrun.parse_resource_filter() raises a ValueError when include_str and - exclude_str are both provided. - ''' - with pytest.raises(ValueError): - dsrun.parse_resource_filter({}, include_str='A', exclude_str='B') - - -def test_parser_local(): - ''' Test cases with only one node. 
''' - # First try no include/exclude - hosts = {'worker-0': [0, 1, 2, 3]} - ret = dsrun.parse_resource_filter(hosts) - assert (ret == hosts) - - # exclude slots - ret = dsrun.parse_resource_filter(hosts, exclude_str='worker-0:1') - assert (ret == {'worker-0': [0, 2, 3]}) - - ret = dsrun.parse_resource_filter(hosts, exclude_str='worker-0:1,2') - assert (ret == {'worker-0': [0, 3]}) - - # only use one slot - ret = dsrun.parse_resource_filter(hosts, include_str='worker-0:1') - assert (ret == {'worker-0': [1]}) - - # including slots multiple times shouldn't break things - ret = dsrun.parse_resource_filter(hosts, include_str='worker-0:1,1') - assert (ret == {'worker-0': [1]}) - ret = dsrun.parse_resource_filter(hosts, include_str='worker-0:1@worker-0:0,1') - assert (ret == {'worker-0': [0, 1]}) - - # including just 'worker-0' without : should still use all GPUs - ret = dsrun.parse_resource_filter(hosts, include_str='worker-0') - assert (ret == hosts) - - # excluding just 'worker-0' without : should eliminate everything - ret = dsrun.parse_resource_filter(hosts, exclude_str='worker-0') - assert (ret == {}) - - # exclude all slots manually - ret = dsrun.parse_resource_filter(hosts, exclude_str='worker-0:0,1,2,3') - assert (ret == {}) - - -def test_parser_multinode(): - # First try no include/exclude - hosts = {'worker-0': [0, 1, 2, 3], 'worker-1': [0, 1, 2, 3]} - ret = dsrun.parse_resource_filter(hosts) - assert (ret == hosts) - - # include a node - ret = dsrun.parse_resource_filter(hosts, include_str='worker-1:0,3') - assert (ret == {'worker-1': [0, 3]}) - - # exclude a node - ret = dsrun.parse_resource_filter(hosts, exclude_str='worker-1') - assert (ret == {'worker-0': [0, 1, 2, 3]}) - - # exclude part of each node - ret = dsrun.parse_resource_filter(hosts, exclude_str='worker-0:0,1@worker-1:3') - assert (ret == {'worker-0': [2, 3], 'worker-1': [0, 1, 2]}) - - -def test_parser_errors(): - '''Ensure we catch errors. 
''' - hosts = {'worker-0': [0, 1, 2, 3], 'worker-1': [0, 1, 2, 3]} - - # host does not exist - with pytest.raises(ValueError): - dsrun.parse_resource_filter(hosts, include_str='jeff') - with pytest.raises(ValueError): - dsrun.parse_resource_filter(hosts, exclude_str='jeff') - - # slot does not exist - with pytest.raises(ValueError): - dsrun.parse_resource_filter(hosts, include_str='worker-1:4') - with pytest.raises(ValueError): - dsrun.parse_resource_filter(hosts, exclude_str='worker-1:4') - - # formatting - with pytest.raises(ValueError): - dsrun.parse_resource_filter(hosts, exclude_str='worker-1@worker-0:1@5') - - -def test_num_plus_parser(): - ''' Ensure we catch errors relating to num_nodes/num_gpus + -i/-e being mutually exclusive''' - - # inclusion - with pytest.raises(ValueError): - dsrun.main(args="--num_nodes 1 -i localhost foo.py".split()) - with pytest.raises(ValueError): - dsrun.main(args="--num_nodes 1 --num_gpus 1 -i localhost foo.py".split()) - with pytest.raises(ValueError): - dsrun.main(args="--num_gpus 1 -i localhost foo.py".split()) - - # exclusion - with pytest.raises(ValueError): - dsrun.main(args="--num_nodes 1 -e localhost foo.py".split()) - with pytest.raises(ValueError): - dsrun.main(args="--num_nodes 1 --num_gpus 1 -e localhost foo.py".split()) - with pytest.raises(ValueError): - dsrun.main(args="--num_gpus 1 -e localhost foo.py".split()) diff --git a/tests/unit/test_runtime_utils.py b/tests/unit/test_runtime_utils.py deleted file mode 100644 index fb5c8e39454624e2e761bf6eecf9398caf21c64c..0000000000000000000000000000000000000000 --- a/tests/unit/test_runtime_utils.py +++ /dev/null @@ -1,78 +0,0 @@ -from deepspeed.moe.utils import is_moe_param, split_params_grads_into_shared_and_expert_params, split_params_into_shared_and_expert_params -import torch -from torch._utils import _flatten_dense_tensors -import torch.distributed as dist -import pytest - -import deepspeed.runtime.utils as ds_utils -from deepspeed.utils.logging import log_dist -import deepspeed.utils.groups as groups - -from .common import distributed_test - - -def test_call_to_str(): - c2s = ds_utils.call_to_str - - assert c2s('int') == 'int()' - assert c2s('int', 3) == 'int(3)' - assert c2s('int', 3, 'jeff') == 'int(3, \'jeff\')' - - assert c2s('hello', val=3) == 'hello(val=3)' - assert c2s('hello', 1138, val=3) == 'hello(1138, val=3)' - - -def test_clip_grad_norm_(): - @distributed_test(world_size=[2]) - def _test_clip_grad_norm_() -> None: - param1 = torch.nn.Parameter(torch.Tensor([0])) - param1.grad = torch.Tensor([1]) - param2 = torch.nn.Parameter(torch.Tensor([0])) - param2.grad = torch.Tensor([dist.get_rank() + 1]) - # param2 is now MoE parameter - param2.allreduce = False - - parameters = [param1, param2] - - groups._create_expert_and_data_parallel(2) - - norm = ds_utils.clip_grad_norm_(parameters, max_norm=0.1) - norm = torch.Tensor([norm]).to(dist.get_rank()) - - world_size = dist.get_world_size() - gathered_norm = [torch.zeros(1).cuda() for i in range(world_size)] - - torch.distributed.all_gather(gathered_norm, norm) - - assert gathered_norm[0] == gathered_norm[1], "norm at rank 0 does not match the norm at rank 1" - - return _test_clip_grad_norm_() - - -@pytest.mark.parametrize("check_using_norm", [(False), (True)]) -def test_CheckOverflow(check_using_norm): - @distributed_test(world_size=[2]) - def _test_CheckOverflow(check_using_norm: bool): - groups._create_expert_and_data_parallel(2) - - param1 = torch.nn.Parameter(torch.Tensor([0])) - param1.grad = torch.Tensor([1]) - param2 = 
torch.nn.Parameter(torch.Tensor([0])) - if dist.get_rank() == 0: - param2.grad = torch.Tensor([1]) - else: - param2.grad = torch.Tensor([float("inf")]) - param2.allreduce = False - # param2 is now MoE parameter - parameters = [param1, param2] - if check_using_norm: - grads_group_flat = [_flatten_dense_tensors([p.grad for p in parameters])] - norm = ds_utils.get_weight_norm(grads_group_flat) - overflow_checker = ds_utils.CheckOverflow([parameters]) - overflow = overflow_checker.check_using_norm([norm], reduce_overflow=False) - else: - overflow_checker = ds_utils.CheckOverflow([parameters]) - overflow = overflow_checker.check() - assert overflow - - return _test_CheckOverflow(check_using_norm) diff --git a/tests/unit/test_sparse_attention.py b/tests/unit/test_sparse_attention.py deleted file mode 100644 index 531524e4542182f2d7d0dcdb48ade1ef35fdd68d..0000000000000000000000000000000000000000 --- a/tests/unit/test_sparse_attention.py +++ /dev/null @@ -1,376 +0,0 @@ -# DeepSpeed note, some parts of code taken & adapted from commit c368a9fd1b2c9dee4cc94de9a6bb0be3d447be41 -# https://github.com/ptillet/torch-blocksparse/blob/master/tests/test_softmax.py -# https://github.com/ptillet/torch-blocksparse/blob/master/tests/test_matmul.py -# https://github.com/ptillet/torch-blocksparse/blob/master/tests/utils - -import pytest -import torch -import deepspeed -from deepspeed.ops.op_builder import SparseAttnBuilder - -if not deepspeed.ops.__compatible_ops__[SparseAttnBuilder.NAME]: - pytest.skip("sparse attention op is not compatible on this system", - allow_module_level=True) - - -def test_sparse_attention_module_availability(): - return True - try: - from deepspeed.ops import sparse_attention - except ImportError: - print("Sparse Attention Module is not installed!") - return False - return True - - -def test_matmul_module_availability(): - return True - try: - from deepspeed.ops.sparse_attention.matmul import MatMul - except ImportError: - print("Sparse MatMul Module is not installed!") - return False - return True - - -def test_softmax_module_availability(): - return True - try: - from deepspeed.ops.sparse_attention.softmax import Softmax - except ImportError: - print("Sparse Softmax Module is not installed!") - return False - return True - - -def test_sparsityconfig_module_availability(): - return True - try: - from deepspeed.ops.sparse_attention import SparsityConfig - except ImportError: - print("SparsityConfig Module is not installed!") - return False - return True - - -def test_densesparsityconfig_module_availability(): - return True - try: - from deepspeed.ops.sparse_attention import DenseSparsityConfig - except ImportError: - print("DenseSparsityConfig Module is not installed!") - return False - return True - - -def test_fixedsparsityconfig_module_availability(): - return True - try: - from deepspeed.ops.sparse_attention import FixedSparsityConfig - except ImportError: - print("FixedSparsityConfig Module is not installed!") - return False - return True - - -def test_variablesparsityconfig_module_availability(): - return True - try: - from deepspeed.ops.sparse_attention import VariableSparsityConfig - except ImportError: - print("VariableSparsityConfig Module is not installed!") - return False - return True - - -def test_bigbirdsparsityconfig_module_availability(): - return True - try: - from deepspeed.ops.sparse_attention import BigBirdSparsityConfig - except ImportError: - print("BigBirdSparsityConfig Module is not installed!") - return False - return True - - -def 
test_bslongformersparsityconfig_module_availability(): - return True - try: - from deepspeed.ops.sparse_attention import BSLongformerSparsityConfig - except ImportError: - print("BSLongformerSparsityConfig Module is not installed!") - return False - return True - - -def test_sparseselfattention_module_availability(): - return True - try: - from deepspeed.ops.sparse_attention import SparseSelfAttention - except ImportError: - print("SparseSelfAttention Module is not installed!") - return False - return True - - -def test_bertsparseselfattention_module_availability(): - return True - try: - from deepspeed.ops.sparse_attention import BertSparseSelfAttention - except ImportError: - print("BertSparseSelfAttention Module is not installed!") - return False - return True - - -def test_sparseattentionutils_availability(): - return True - try: - from deepspeed.ops.sparse_attention import SparseAttentionUtils - except ImportError: - print("SparseAttentionUtils Module is not installed!") - return False - return True - - -def test_cpp_utils_availability(): - return True - try: - from deepspeed.ops.sparse_attention import cpp_utils - except ImportError: - print("Sparse Attention cpp_utils Module is not installed!") - return False - return True - - -def dense_to_sparse(w, mask, block): - """Converts dense matrix with explicit zeros to sparse matrix - """ - Z = w.size(0) - ret = torch.empty((Z, mask.sum(), block, block), dtype=w.dtype, device=w.device) - nnz = mask.nonzero() - h, i, j = nnz[:, 0], nnz[:, 1], nnz[:, 2] - for zz in range(Z): - for idx, (hh, ii, jj) in enumerate(zip(h, i, j)): - ret[zz, idx, :, :] = w[zz, hh, ii*block: (ii+1)*block, jj*block: (jj+1)*block] - return ret - - -def sparse_to_dense(w, mask, block, zero=0): - """Converts sparse matrix to dense matrix with explicit zeros - """ - maskedw = w.clone() - for bz, wz in enumerate(range(0, w.size(0))): - for bh, wh in enumerate(range(0, w.size(1))): - for bi, wi in enumerate(range(0, w.size(2), block)): - for bj, wj in enumerate(range(0, w.size(3), block)): - if mask[bh, bi, bj] == 0: - maskedw[wz, wh, wi:wi + block, wj:wj + block] = zero - #maskedw[wz, wh, wi : wi+block, wj : wj+block] *= mask[bh, bi, bj] - return maskedw - - -def allclose(x, y): - assert x.dtype == y.dtype - rtol, atol = {torch.float32: (5e-4, 5e-5), torch.float16: (3e-2, 2e-3)}[x.dtype] - return torch.allclose(x, y, rtol=rtol, atol=atol) - - -def make_layout(rho, shape): - probs = torch.Tensor([rho, 1 - rho]) - generator = torch.distributions.categorical.Categorical(probs) - layout = generator.sample(shape) - return layout - - -def run_softmax_reference(x, scale, dx, kp_mask, attn_mask, layout, block): - x = sparse_to_dense(x, layout, block, zero=float('-inf')) - x.retain_grad() - if kp_mask is not None: - bcattn_mask = attn_mask[None, None, :, :] + torch.zeros_like(x) - x[bcattn_mask == 0] = float('-inf') - y = torch.softmax(x * scale + kp_mask[:, None, None, :], -1) - else: - y = torch.softmax(x * scale, -1) - y.backward(dx) - dx = x.grad.clone() - dx = dense_to_sparse(dx, layout, block) - y = dense_to_sparse(y, layout, block) - return y, dx - - -def run_softmax_sparse(x, scale, dx, kp_mask, attn_mask, layout, block): - from deepspeed.ops.sparse_attention.softmax import Softmax - sparse_softmax = Softmax(layout, block, bench=False) - - dx = dense_to_sparse(dx, layout, block) - x = dense_to_sparse(x, layout, block) - x.retain_grad() - y = sparse_softmax(x, - scale=scale, - key_padding_mask=kp_mask, - key_padding_mask_mode='add', - attn_mask=attn_mask, - 
attn_mask_mode='mul') - y.backward(dx) - dx = x.grad.clone() - x.grad.zero_() - return x, dx - - -def init_softmax_inputs(Z, H, M, N, scale, rho, block, dtype, dense_x=True, layout=None): - if layout is None: - layout = make_layout(rho, (H, M // block, N // block)) - if dense_x: - x = torch.rand((Z, H, M, N), dtype=dtype, requires_grad=True, device='cuda') - else: - x = torch.rand((Z, - layout.sum(), - block, - block), - dtype=dtype, - requires_grad=True, - device='cuda') - dx = torch.rand_like(x) - bool_attn_mask = torch.randint(low=0, - high=2, - size=(N, - N), - dtype=torch.bool, - requires_grad=False, - device='cuda') - fp_attn_mask = bool_attn_mask.type(dtype) - kp_mask = torch.randint(low=0, - high=2, - size=(Z, - N), - dtype=dtype, - requires_grad=False, - device='cuda') - kp_mask[kp_mask == 1.] = float('-inf') - return layout, x, dx, bool_attn_mask, fp_attn_mask, kp_mask - - -def _skip_on_cuda_compatability(): - if torch.cuda.get_device_capability()[0] < 7: - pytest.skip("needs higher compute capability than 7") - cuda_major = int(torch.version.cuda.split('.')[0]) * 10 - cuda_minor = int(torch.version.cuda.split('.')[1]) - cuda_version = cuda_major + cuda_minor - if (cuda_version != 101 and cuda_version != 102) and \ - (cuda_version != 111 and cuda_version != 110): - pytest.skip("requires cuda 10.1 or 10.2 or 11.0 or 11.1") - - -@pytest.mark.parametrize("block", [16, 32]) -@pytest.mark.parametrize("width", [256, 576]) -@pytest.mark.parametrize("dtype", [torch.float16, torch.float32]) -def test_softmax(block, width, dtype): - _skip_on_cuda_compatability() - Z = 2 - H = 4 - scale = 0.4 - rho = 0.4 - M = N = width - layout, x, dx, bool_attn_mask, fp_attn_mask, kp_mask = init_softmax_inputs(Z, H, M, N, scale, rho, block, dtype, layout=None) - ref_y, ref_dx = run_softmax_reference(x, scale, dx, kp_mask, bool_attn_mask, layout, block) - st_y, st_dx = run_softmax_sparse(x, scale, dx, kp_mask, fp_attn_mask, layout, block) - - assert allclose(ref_y, st_y) - assert allclose(ref_dx, st_dx) - - -def run_matmul_reference(x, w, mode, trans_a, trans_b, layout, block, dy): - x = sparse_to_dense(x, layout, block) if mode == 'dsd' else x - w = sparse_to_dense(w, layout, block) if mode == 'dds' else w - x.retain_grad() - w.retain_grad() - xx = x.transpose(2, 3) if trans_a else x - ww = w.transpose(2, 3) if trans_b else w - y = torch.matmul(xx, ww) - y = sparse_to_dense(y, layout, block) if mode == 'sdd' else y - y.backward(dy) - dx = x.grad.clone() - dw = w.grad.clone() - x.grad.zero_() - w.grad.zero_() - y = dense_to_sparse(y, layout, block) if mode == 'sdd' else y - dx = dense_to_sparse(dx, layout, block) if mode == 'dsd' else dx - dw = dense_to_sparse(dw, layout, block) if mode == 'dds' else dw - return y, dx, dw - - -def run_matmul_sparse(x, w, mode, trans_a, trans_b, layout, block, dy): - from deepspeed.ops.sparse_attention.matmul import MatMul - x = dense_to_sparse(x, layout, block) if mode == 'dsd' else x - w = dense_to_sparse(w, layout, block) if mode == 'dds' else w - dy = dense_to_sparse(dy, layout, block) if mode == 'sdd' else dy - op = MatMul(layout, block, mode, trans_a=trans_a, trans_b=trans_b) - x.retain_grad() - w.retain_grad() - y = op(x, w) - y.backward(dy) - dx = x.grad.clone() - dw = w.grad.clone() - x.grad.zero_() - return y, dx, dw - - -def init_matmul_inputs(Z, H, M, N, K, rho, mode, trans_a, trans_b, block, dtype, layout): - torch.manual_seed(1) - AS0 = K if trans_a else M - AS1 = M if trans_a else K - BS0 = N if trans_b else K - BS1 = K if trans_b else N - shape = {'sdd': (M, 
N), 'dsd': (AS0, AS1), 'dds': (BS0, BS1)}[mode] - x = torch.rand((Z, H, AS0, AS1), dtype=dtype, requires_grad=True, device='cuda') - w = torch.rand((Z, H, BS0, BS1), dtype=dtype, requires_grad=True, device='cuda') - dy = torch.rand((Z, H, M, N), dtype=dtype, device='cuda') - if layout is None: - layout = make_layout(rho, (H, shape[0] // block, shape[1] // block)) - else: - assert list(layout.shape) == [H, shape[0] // block, shape[1] // block] - x.retain_grad() - w.retain_grad() - return x, w, dy, shape, layout - -testdata = [ - (16, dtype, mode, trans_a, trans_b)\ - for dtype in [torch.float16]\ - for mode in ['sdd', 'dds']\ - for trans_a in [False]\ - for trans_b in [False, True]\ - ] + [ - (16, dtype, mode, trans_a, trans_b)\ - for dtype in [torch.float16]\ - for mode in ['dsd']\ - for trans_a in [False, True]\ - for trans_b in [False]\ - ] + [ - (16, dtype, mode, trans_a, trans_b)\ - for dtype in [torch.float32]\ - for mode in ['sdd', 'dsd', 'dds']\ - for trans_a in [False]\ - for trans_b in [False]\ - ] + [ - (block, torch.float16, mode, False, False)\ - for block in [16, 32, 64]\ - for mode in ['sdd', 'dsd', 'dds']\ - ] - - -@pytest.mark.parametrize("block, dtype, mode, trans_a, trans_b", testdata) -def test_matmul(block, dtype, mode, trans_a, trans_b): - _skip_on_cuda_compatability() - Z = 3 - H = 2 - M = 128 - N = 256 - K = 192 - rho = 0.5 - x, w, dy, shape, layout = init_matmul_inputs(Z, H, M, N, K, rho, mode, trans_a, trans_b, block, dtype, layout=None) - ref_y, ref_dx, ref_dw = run_matmul_reference(x.clone(), w.clone(), mode, trans_a, trans_b, layout, block, dy) - st_y, st_dx, st_dw = run_matmul_sparse(x.clone(), w.clone(), mode, trans_a, trans_b, layout, block, dy) - assert allclose(ref_y, st_y) - assert allclose(ref_dx, st_dx) - assert allclose(ref_dw, st_dw) diff --git a/tests/unit/test_sparse_grads.py b/tests/unit/test_sparse_grads.py deleted file mode 100644 index 2506a1d4c8eb9160f746e31bf2bd656c9d529c3a..0000000000000000000000000000000000000000 --- a/tests/unit/test_sparse_grads.py +++ /dev/null @@ -1,70 +0,0 @@ -import torch -import torch.distributed as dist -import deepspeed -import pytest -from .common import distributed_test - -import deepspeed.utils.groups as groups - - -def test_sparse_adam(tmpdir): - config_dict = {"train_batch_size": 2, "steps_per_print": 1, "sparse_gradients": True} - - class Model(torch.nn.Module): - def __init__(self): - super().__init__() - self.emb = torch.nn.EmbeddingBag(10, 3, mode="sum", sparse=True) - self.linear = torch.nn.Linear(3, 1) - - def forward(self, x, offsets): - return self.linear(self.emb(x, offsets)) - - class Adam(torch.optim.Optimizer): - def __init__(self, dense_params, sparse_params): - super().__init__(dense_params + sparse_params, defaults={}) - self.adam = torch.optim.Adam(dense_params) - self.adam_sparse = torch.optim.SparseAdam(sparse_params) - - @torch.no_grad() - def step(self, closure=None): - loss_1 = self.adam.step(closure) - loss_2 = self.adam_sparse.step(closure) - - if loss_1 is not None and loss_2 is not None: - return loss_1 + loss_2 - return loss_1 or loss_2 - - model = Model() - optimizer = Adam(list(model.linear.parameters()), list(model.emb.parameters())) - - @distributed_test(world_size=[2]) - def _test(model, optimizer): - engine, _, _, _ = deepspeed.initialize(model=model, - optimizer=optimizer, - config=config_dict) - loss = torch.nn.BCEWithLogitsLoss() - x = torch.tensor([1, - 2, - 4, - 5, - 4, - 3, - 2, - 9], - dtype=torch.long, - device=engine.device) - offsets = torch.tensor([0, 4], 
dtype=torch.long, device=engine.device) - y = torch.tensor([[1.0], [0.0]], device=engine.device) - res = engine(x, offsets) - engine.backward(loss(res, y)) - engine.step() - - results = [ - engine.all_gather_scalar(i, - groups._get_data_parallel_group()) - for i in model.emb.parameters() - ] - for res in results: - assert torch.allclose(res[0], res[1]) - - _test(model, optimizer) diff --git a/tests/unit/test_topology.py b/tests/unit/test_topology.py deleted file mode 100644 index 89bb8ec3dc7c2a1d27c82f3dc7c392becca10458..0000000000000000000000000000000000000000 --- a/tests/unit/test_topology.py +++ /dev/null @@ -1,222 +0,0 @@ -import pytest - -import torch -import torch.distributed as dist - -from deepspeed.runtime.pipe.topology import PipelineParallelGrid as Grid -from deepspeed.runtime.pipe.topology import ProcessTopology as Topo -from deepspeed.runtime.pipe.topology import _prime_factors - -from .common import distributed_test - - -def test_topology_2d(): - topo = Topo(axes=['row', 'col'], dims=[2, 2]) - - assert topo.world_size() == 4 - - assert topo.get_rank(row=0, col=0) == 0 - assert topo.get_rank(row=0, col=1) == 1 - assert topo.get_rank(row=1, col=0) == 2 - assert topo.get_rank(row=1, col=1) == 3 - - assert topo.get_axis_list(axis='row', idx=0) == [0, 1] - assert topo.get_axis_list(axis='row', idx=1) == [2, 3] - assert topo.get_axis_list(axis='col', idx=0) == [0, 2] - assert topo.get_axis_list(axis='col', idx=1) == [1, 3] - - -def test_topology_dims(): - topo = Topo(axes=['a', 'b', 'c'], dims=[2, 3, 4]) - assert topo.world_size() == 24 - assert topo.get_dim('a') == 2 - assert topo.get_dim('b') == 3 - assert topo.get_dim('c') == 4 - - -def test_topology_match(): - topo = Topo(axes=['pipe', 'data', 'model'], dims=[2, 2, 2]) - print(topo.filter_match(pipe=0, data=1)) - assert topo.filter_match(pipe=0, data=1) == [2, 3] - print([topo.get_coord(r) for r in topo.filter_match(pipe=0, data=1)]) - - -def test_topology_rank_repr(): - topo = Topo(axes=['a', 'b'], dims=[2, 2]) - assert topo.get_rank_repr(rank=0) == 'a_00-b_00' - assert topo.get_rank_repr(rank=1) == 'a_00-b_01' - assert topo.get_rank_repr(rank=2) == 'a_01-b_00' - assert topo.get_rank_repr(rank=3) == 'a_01-b_01' - - assert topo.get_rank_repr(rank=3, inner_sep='+') == 'a+01-b+01' - assert topo.get_rank_repr(rank=3, - inner_sep='🤗', - outer_sep='_JEFF_') == 'a🤗01_JEFF_b🤗01' - - topo = Topo(axes=['pipe', 'data'], dims=[2, 2]) - assert topo.get_rank_repr(rank=0) == '' - assert topo.get_rank_repr(rank=1) == '' - assert topo.get_rank_repr(rank=2) == '' - assert topo.get_rank_repr(rank=3) == '' - - assert topo.get_rank_repr(rank=0, omit_axes=['pipe']) == 'data_00' - assert topo.get_rank_repr(rank=1, omit_axes=['pipe']) == 'data_01' - assert topo.get_rank_repr(rank=2, omit_axes=['pipe']) == 'data_00' - assert topo.get_rank_repr(rank=3, omit_axes=['pipe']) == 'data_01' - - assert topo.get_rank_repr(rank=0, omit_axes=[]) == 'pipe_00-data_00' - assert topo.get_rank_repr(rank=1, omit_axes=[]) == 'pipe_00-data_01' - assert topo.get_rank_repr(rank=2, omit_axes=[]) == 'pipe_01-data_00' - assert topo.get_rank_repr(rank=3, omit_axes=[]) == 'pipe_01-data_01' - - topo = Topo(axes=['pipe', 'data', 'model'], dims=[2, 2, 2]) - assert topo.get_rank_repr(rank=0) == 'model_00' - assert topo.get_rank_repr(rank=1) == 'model_01' - assert topo.get_rank_repr(rank=2) == 'model_00' - assert topo.get_rank_repr(rank=3) == 'model_01' - assert topo.get_rank_repr(rank=4) == 'model_00' - assert topo.get_rank_repr(rank=5) == 'model_01' - assert 
topo.get_rank_repr(rank=6) == 'model_00' - assert topo.get_rank_repr(rank=7) == 'model_01' - - -def test_topology_3d(): - topo = Topo(axes=['a', 'b', 'c'], dims=[2, 2, 2]) - - assert topo.get_rank(a=0, b=0, c=0) == 0 - assert topo.get_rank(a=0, b=0, c=1) == 1 - assert topo.get_rank(a=0, b=1, c=0) == 2 - assert topo.get_rank(a=0, b=1, c=1) == 3 - assert topo.get_rank(a=1, b=0, c=0) == 4 - assert topo.get_rank(a=1, b=0, c=1) == 5 - assert topo.get_rank(a=1, b=1, c=0) == 6 - assert topo.get_rank(a=1, b=1, c=1) == 7 - - assert topo.get_axis_list('a', 0) == [0, 1, 2, 3] - assert topo.get_axis_list('a', 1) == [4, 5, 6, 7] - assert topo.get_axis_list('b', 0) == [0, 1, 4, 5] - assert topo.get_axis_list('b', 1) == [2, 3, 6, 7] - assert topo.get_axis_list('c', 0) == [0, 2, 4, 6] - assert topo.get_axis_list('c', 1) == [1, 3, 5, 7] - - assert topo.get_coord(0) == topo.ProcessCoord(0, 0, 0) - assert topo.get_coord(1) == topo.ProcessCoord(0, 0, 1) - assert topo.get_coord(2) == topo.ProcessCoord(0, 1, 0) - assert topo.get_coord(3) == topo.ProcessCoord(0, 1, 1) - assert topo.get_coord(4) == topo.ProcessCoord(1, 0, 0) - assert topo.get_coord(5) == topo.ProcessCoord(1, 0, 1) - assert topo.get_coord(6) == topo.ProcessCoord(1, 1, 0) - assert topo.get_coord(7) == topo.ProcessCoord(1, 1, 1) - - assert topo.filter_match(a=0) == [0, 1, 2, 3] - assert topo.filter_match(b=1, c=1) == [3, 7] - assert topo.filter_match(a=1, b=1, c=1) == [7] - - # Easy access method - assert topo.get_coord(0).a == 0 - - -def test_topology_comm_list(): - topo = Topo(axes=['pipe', 'data', 'model'], dims=[2, 2, 2]) - - assert topo.get_rank(pipe=0, data=0, model=0) == 0 - assert topo.get_rank(pipe=0, data=0, model=1) == 1 - assert topo.get_rank(pipe=0, data=1, model=0) == 2 - assert topo.get_rank(pipe=0, data=1, model=1) == 3 - assert topo.get_rank(pipe=1, data=0, model=0) == 4 - assert topo.get_rank(pipe=1, data=0, model=1) == 5 - assert topo.get_rank(pipe=1, data=1, model=0) == 6 - assert topo.get_rank(pipe=1, data=1, model=1) == 7 - - pipe_list = [ - [0, 4], # data=0, model=0 - [1, 5], # data=0, model=1 - [2, 6], # data=1, model=0 - [3, 7], # data=1, model=1 - ] - assert topo.get_axis_comm_lists('pipe') == pipe_list - - data_list = [ - [0, 2], # pipe=0, model=0 - [1, 3], # pipe=0, model=1 - [4, 6], # pipe=1, model=0 - [5, 7], # pipe=1, model=1 - ] - assert topo.get_axis_comm_lists('data') == data_list - - model_list = [ - [0, 1], # pipe=0, data=0 - [2, 3], # pipe=0, data=1 - [4, 5], # pipe=1, data=0 - [6, 7], # pipe=1, data=1 - ] - assert topo.get_axis_comm_lists('model') == model_list - - # Handle nonsense. 
We don't want to RuntimeError because it allows us to write more - # generalized code for data/model/pipe parallelism - assert topo.get_axis_comm_lists('jeff') == [] - - -@distributed_test(world_size=4) -def test_grid_pipe_data(): - topo = Topo(axes=['pipe', 'data'], dims=[2, 2]) - grid = Grid(topology=topo) - - assert grid._is_grid_valid() - - rank = dist.get_rank() - - assert grid.is_first_stage == (grid.get_stage_id() == 0) - assert grid.is_last_stage == ( - grid.get_stage_id() == grid.get_pipe_parallel_world_size() - 1) - - # Test collectives along the pipeline parallel process groups - rank_tensor = torch.LongTensor(data=[rank]).cuda() - dist.all_reduce(rank_tensor, group=grid.get_pipe_parallel_group()) - pipe_group = grid.pp_group - assert torch.all(rank_tensor == sum(pipe_group)) - - # Test collectives along the data parallel process groups - rank_tensor = torch.LongTensor(data=[rank]).cuda() - dist.all_reduce(rank_tensor, group=grid.get_data_parallel_group()) - data_group = grid.dp_group - assert torch.all(rank_tensor == sum(data_group)) - - -@distributed_test(world_size=4) -def test_stage_to_global(): - topo = Topo(axes=['pipe', 'data'], dims=[2, 2]) - grid = Grid(topology=topo) - - assert grid._is_grid_valid() - - assert grid.stage_to_global(stage_id=0, data=0) == 0 - assert grid.stage_to_global(stage_id=0, data=1) == 1 - assert grid.stage_to_global(stage_id=1, data=0) == 2 - assert grid.stage_to_global(stage_id=1, data=1) == 3 - - me = topo.get_coord(rank=dist.get_rank()) - if me.data == 0: - assert grid.stage_to_global(stage_id=0) == 0 - assert grid.stage_to_global(stage_id=1) == 2 - else: - assert grid.stage_to_global(stage_id=0) == 1 - assert grid.stage_to_global(stage_id=1) == 3 - - -def test_primes(): - """ Test prime factorizations. 
""" - def _product(ps): - p = 1 - for num in ps: - p *= num - return p - - with pytest.raises(ValueError): - _prime_factors(0) - - for x in range(1, 30): - primes = _prime_factors(x) - assert _product(primes) == x - for p in primes: - assert _prime_factors(p) == [p] diff --git a/tests/unit/test_zero.py b/tests/unit/test_zero.py deleted file mode 100644 index 4d53d4c6f77ca056b88a5f5419b65b2d8660cd53..0000000000000000000000000000000000000000 --- a/tests/unit/test_zero.py +++ /dev/null @@ -1,1288 +0,0 @@ -import math -from typing import Dict, List, Set -import pytest -import torch.distributed as dist -import torch -from torch import Tensor -from torch.nn import Linear, Module -from torch.nn.modules.container import ModuleList -from torch.nn.modules.loss import L1Loss -from torch.nn.parameter import Parameter - -from .common import distributed_test -from .simple_model import SimpleModel, random_dataloader, args_from_dict - -import deepspeed -from deepspeed.runtime.engine import DeepSpeedEngine -from deepspeed.runtime.zero.partition_parameters import ZeroParamStatus -from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint - - -def run_unbalanced_gradients(model, data_loader): - def drop_some_gradients(model, iter): - odd_iteration = iter % 2 - for i, p in enumerate(model.parameters()): - p.requires_grad = (i % 2) == odd_iteration - - def enable_grads(model): - for p in model.parameters(): - p.requires_grad = True - - for i, batch in enumerate(data_loader): - drop_some_gradients(model, i + 1) - loss = model(batch[0], batch[1]) - model.backward(loss) - model.step() - enable_grads(model) - - -def dump_state_dict(model): - if dist.get_rank() == 0: - print("state_dict:") - for name, param in model.named_parameters(): - print(f"{name} {param.data}") - - -@pytest.mark.parametrize('zero_stage', [1, 2, 3]) -def test_zero_unbalanced_gradients(tmpdir, zero_stage): - config_dict = { - "train_micro_batch_size_per_gpu": 2, - "gradient_accumulation_steps": 2, - "steps_per_print": 1, - "zero_optimization": { - "stage": zero_stage - }, - "optimizer": { - "type": "Adam", - "params": { - "lr": 1e-3 - } - }, - "fp16": { - "enabled": True, - "initial_scale_power": 8 - } - } - - hidden_dim = 4 - - model = SimpleModel(hidden_dim=hidden_dim) - - @distributed_test(world_size=[1]) - def _test_zero_unbalanced_gradients(model, hidden_dim): - model, _, _, _ = deepspeed.initialize(config=config_dict, - model=model, - model_parameters=model.parameters()) - data_loader = random_dataloader(model=model, - total_samples=16, - hidden_dim=hidden_dim, - device=model.device) - - run_unbalanced_gradients(model, data_loader) - - _test_zero_unbalanced_gradients(model=model, hidden_dim=hidden_dim) - - -# testing the fix https://github.com/microsoft/DeepSpeed/pull/1227 -@pytest.mark.parametrize('zero_stage', [3]) -def test_zero3_repeat_forward_loop(tmpdir, zero_stage): - - # force all params to be partitioned by forcing threshold=0 - config_dict = { - "train_micro_batch_size_per_gpu": 2, - "gradient_accumulation_steps": 2, - "steps_per_print": 1, - "zero_optimization": { - "stage": zero_stage, - "stage3_param_persistence_threshold": 0 - }, - "optimizer": { - "type": "Adam", - "params": { - "lr": 1e-3 - } - }, - "fp16": { - "enabled": True, - "initial_scale_power": 8 - } - } - - hidden_dim = 4 - - class AlbertLikeModel(torch.nn.Module): - def __init__(self, hidden_dim): - super().__init__() - self.linear = torch.nn.Linear(hidden_dim, hidden_dim) - self.cross_entropy_loss = torch.nn.CrossEntropyLoss() - - def 
forward(self, x, y): - # run the same layer multiple times in a loop - to test a stack of forwards, followed by a stack of backwards - hidden = x - for i in range(3): - hidden = hidden + self.linear(hidden) - return self.cross_entropy_loss(hidden, y) - - model = AlbertLikeModel(hidden_dim=hidden_dim) - - @distributed_test(world_size=[1]) - def _test_zero3_repeat_forward_loop(model, hidden_dim): - model, _, _, _ = deepspeed.initialize(config=config_dict, - model=model, - model_parameters=model.parameters()) - data_loader = random_dataloader(model=model, - total_samples=16, - hidden_dim=hidden_dim, - device=model.device) - - for i, batch in enumerate(data_loader): - loss = model(batch[0], batch[1]) - model.backward(loss) - model.step() - - _test_zero3_repeat_forward_loop(model=model, hidden_dim=hidden_dim) - - -# testing the fix https://github.com/microsoft/DeepSpeed/pull/1227 -# also reproduces the https://github.com/microsoft/DeepSpeed/pull/1372 -@pytest.mark.parametrize('zero_stage', [2, 3]) -def test_zero_to_fp32_1_param_group(tmpdir, zero_stage): - - # XXX: ideally refactor with the 2_param_group test as 75% is the same - - # force all params to be partitioned by forcing threshold=0 - config_dict = { - "train_micro_batch_size_per_gpu": 2, - "gradient_accumulation_steps": 2, - "steps_per_print": 1, - "zero_optimization": { - "stage": zero_stage, - "stage3_param_persistence_threshold": 0 - }, - "optimizer": { - "type": "Adam", - "params": { - "lr": 1e-3 - } - }, - "fp16": { - "enabled": True, - "initial_scale_power": 8 - } - } - - @distributed_test(world_size=[2]) - def _test_zero_to_fp32(): - class MyModel(torch.nn.Module): - def __init__(self, hidden_dim, n_layers): - super().__init__() - # to reproduce https://github.com/microsoft/DeepSpeed/pull/1372 it is important that - # the number of total elements is uneven: - # (1) 4 layers of 3*(3+1)=12 elements each, 48 in total - self.ll = torch.nn.ModuleList( - torch.nn.Linear(hidden_dim, - hidden_dim) for i in range(n_layers)) - # (2) the following adds 4+1=5 elements - self.classifier = torch.nn.Linear(4, 1) - # total 48+5=53 (uneven as desired) elements - self.cross_entropy_loss = torch.nn.CrossEntropyLoss() - - def forward(self, x, y): - hidden = x - for l in self.ll: - hidden = l(hidden) - return self.cross_entropy_loss(hidden, y) - - hidden_dim = 3 # do not change - - world_size = dist.get_world_size() - # we want at least 2x layers as there are gpus to trigger round_robin_fp16_groups reshuffle in zero2 - n_layers = world_size * 2 - model = MyModel(hidden_dim=hidden_dim, n_layers=n_layers) - - model, _, _, _ = deepspeed.initialize(config=config_dict, - model=model, - model_parameters=model.parameters()) - data_loader = random_dataloader(model=model, - total_samples=16, - hidden_dim=hidden_dim, - device=model.device) - - for i, batch in enumerate(data_loader): - loss = model(batch[0], batch[1]) - model.backward(loss) - model.step() - - model.save_checkpoint(tmpdir) - - # make sure all sides saved it - dist.barrier() - - if zero_stage == 3: - with deepspeed.zero.GatheredParameters(list( - model.module.parameters(recurse=True)), - modifier_rank=None): - pass # this forces gathering the model - - #dump_state_dict(model) - - orig_state_dict = {} - for name, param in model.module.named_parameters(): - orig_state_dict[name] = param.detach().cpu() - - if dist.get_rank() == 0: - fp32_model = load_state_dict_from_zero_checkpoint(model.module, tmpdir) - #dump_state_dict(fp32_model) - - fp32_state_dict = fp32_model.state_dict() - for name in 
orig_state_dict.keys(): - # float() workaround for torch<1.6 - assert torch.allclose(orig_state_dict[name].float(), - fp32_state_dict[name].float()) - - _test_zero_to_fp32() - - -@pytest.mark.parametrize('zero_stage', [2, 3]) -def test_zero_to_fp32_2_param_groups(tmpdir, zero_stage): - - # TODO: - # - need to test with multiple param groups - - # force all params to be partitioned by forcing threshold=0 - config_dict = { - "train_micro_batch_size_per_gpu": 2, - "gradient_accumulation_steps": 2, - "steps_per_print": 1, - "zero_allow_untested_optimizer": 1, - "zero_optimization": { - "stage": zero_stage, - "stage3_param_persistence_threshold": 0 - }, - "optimizer": { - "type": "Adam", - "params": { - "lr": 1e-3 - } - }, - "fp16": { - "enabled": True, - "initial_scale_power": 8 - } - } - - @distributed_test(world_size=[2]) - def _test_zero_to_fp32(): - class MyModel(torch.nn.Module): - def __init__(self, hidden_dim, n_layers): - super().__init__() - self.ll = torch.nn.ModuleList( - torch.nn.Linear(hidden_dim, - hidden_dim) for i in range(n_layers)) - self.cross_entropy_loss = torch.nn.CrossEntropyLoss() - - def forward(self, x, y): - hidden = x - for l in self.ll: - hidden = l(hidden) - return self.cross_entropy_loss(hidden, y) - - hidden_dim = 3 - - world_size = dist.get_world_size() - n_layers = world_size * 2 - model = MyModel(hidden_dim=hidden_dim, n_layers=n_layers) - - optim_groups = [ - { - "params": [l.weight for l in model.ll], - "weight_decay": 0.01, - }, - { - "params": [l.bias for l in model.ll], - "weight_decay": 0.0 - }, - ] - optim = torch.optim.SGD(optim_groups, lr=0.1) - - model, _, _, _ = deepspeed.initialize(model=model, - model_parameters=model.parameters(), - optimizer=optim, - config=config_dict - ) - data_loader = random_dataloader(model=model, - total_samples=16, - hidden_dim=hidden_dim, - device=model.device) - - for i, batch in enumerate(data_loader): - loss = model(batch[0], batch[1]) - model.backward(loss) - model.step() - - model.save_checkpoint(tmpdir) - - # make sure all sides saved it - dist.barrier() - - if zero_stage == 3: - with deepspeed.zero.GatheredParameters(list( - model.module.parameters(recurse=True)), - modifier_rank=None): - pass # this forces gathering the model - - #dump_state_dict(model) - - orig_state_dict = {} - for name, param in model.module.named_parameters(): - orig_state_dict[name] = param.detach().cpu() - - if dist.get_rank() == 0: - fp32_model = load_state_dict_from_zero_checkpoint(model.module, tmpdir) - #dump_state_dict(fp32_model) - - fp32_state_dict = fp32_model.state_dict() - for name in orig_state_dict.keys(): - # float() workaround for torch<1.6 - assert torch.allclose(orig_state_dict[name].float(), - fp32_state_dict[name].float()) - - _test_zero_to_fp32() - - -@pytest.mark.parametrize('zero_stage, allgather_bucket_size', [(2, 1000), (2, 1001)]) -def test_incorrect_allgather_bucket_size(tmpdir, zero_stage, allgather_bucket_size): - config_dict = { - "train_micro_batch_size_per_gpu": 2, - "gradient_accumulation_steps": 2, - "steps_per_print": 1, - "zero_optimization": { - "stage": zero_stage, - "allgather_bucket_size": allgather_bucket_size - }, - "optimizer": { - "type": "Adam", - "params": { - "lr": 1e-3 - } - }, - "fp16": { - "enabled": True, - "initial_scale_power": 8 - } - } - - hidden_dim = 4 - - model = SimpleModel(hidden_dim=hidden_dim) - - @distributed_test(world_size=[1]) - def _test_incorrect_allgather_bucket_size(model, hidden_dim): - if allgather_bucket_size % 2 == 0: - model, _, _, _ = 
deepspeed.initialize(config=config_dict, - model=model, - model_parameters=model.parameters()) - else: - with pytest.raises(AssertionError) as assertinfo: - model, _, _, _ = deepspeed.initialize(config=config_dict, - model=model, - model_parameters=model.parameters()) - assert "allgather_bucket_size must be a multiple of nccl_start_alignment_factor" in str( - assertinfo) - - _test_incorrect_allgather_bucket_size(model=model, hidden_dim=hidden_dim) - - -@pytest.mark.parametrize('zero_stage, world_size', [(2, 2), (2, 3), (2, 4)]) -def test_partition_nccl_alignment(tmpdir, zero_stage, world_size): - config_dict = { - "train_micro_batch_size_per_gpu": 2, - "gradient_accumulation_steps": 2, - "steps_per_print": 1, - "zero_optimization": { - "stage": zero_stage - }, - "optimizer": { - "type": "Adam", - "params": { - "lr": 1e-3 - } - }, - "fp16": { - "enabled": True, - "initial_scale_power": 8 - } - } - - hidden_dim = 4 - - model = SimpleModel(hidden_dim=hidden_dim) - - @distributed_test(world_size=world_size) - def _test_partition_nccl_alignment(model, hidden_dim): - model, _, _, _ = deepspeed.initialize(config=config_dict, - model=model, - model_parameters=model.parameters()) - - # get nccl all-gather send buffers alignment factor - nccl_start_alignment_factor = model.optimizer.nccl_start_alignment_factor - - parallel_partitioned_bit16_groups = model.optimizer.parallel_partitioned_bit16_groups if zero_stage == 2 else model.optimizer.parallel_partitioned_fp16_groups - for data_parallel_partitions in parallel_partitioned_bit16_groups: - for partition_id, partitioned_data in enumerate(data_parallel_partitions): - # verify that data partition start locations are 4-byte aligned - assert (partitioned_data.data_ptr() % - (2 * nccl_start_alignment_factor) == 0) - - _test_partition_nccl_alignment(model=model, hidden_dim=hidden_dim) - - -def _ds_initialize_for_param_partitioning_testing(model: Module, - cfg: dict) -> DeepSpeedEngine: - ds_engine, _, _, _ = deepspeed.initialize( - config=cfg, - model=model, - model_parameters=model.parameters() - ) - - return ds_engine - - -def _assert_partition_status(model: Module, - valid_statuses: Set[ZeroParamStatus]) -> None: - for _, param in model.named_parameters(): - assert param.ds_status in valid_statuses, param.ds_summary() - - -def _assert_fully_available(model: Module) -> None: - for _, param in model.named_parameters(): - assert param.ds_status == ZeroParamStatus.AVAILABLE - - -class EltwiseMultiplicationModule(Module): - def __init__(self, weight: Parameter) -> None: - super().__init__() - self.weight = weight - - def forward(self, x: Tensor) -> Tensor: - _assert_fully_available(self) - result = self.weight * x - - return result - - -class EltwiseMultiplicationTestNetwork(Module): - """used for testing purposes""" - def __init__( - self, - weight1: Parameter, - weight2: Parameter, - weight3: Parameter, - ) -> None: - super().__init__() - self.__layer1 = EltwiseMultiplicationModule(weight1) - self.__layer2 = EltwiseMultiplicationModule(weight2) - self.__layer3 = EltwiseMultiplicationModule(weight3) - - self.loss = L1Loss(reduction="none") - - def forward(self, x: Tensor, y: Tensor, prefetching: bool) -> Dict[str, Tensor]: - _assert_partition_status( - self, - { - ZeroParamStatus.NOT_AVAILABLE, - ZeroParamStatus.INFLIGHT, - ZeroParamStatus.AVAILABLE - } if prefetching else {ZeroParamStatus.NOT_AVAILABLE}) - - layerwise_expected_states = { - ZeroParamStatus.INFLIGHT if prefetching else ZeroParamStatus.NOT_AVAILABLE, - ZeroParamStatus.AVAILABLE, - } - - 
_assert_partition_status(self.__layer1, layerwise_expected_states) - hidden1 = self.__layer1(x) - _assert_partition_status(self.__layer1, {ZeroParamStatus.NOT_AVAILABLE}) - - _assert_partition_status(self.__layer2, layerwise_expected_states) - hidden2 = self.__layer2(hidden1) - _assert_partition_status(self.__layer2, {ZeroParamStatus.NOT_AVAILABLE}) - - _assert_partition_status(self.__layer3, layerwise_expected_states) - y_hat = self.__layer3(hidden2) - _assert_partition_status(self.__layer3, - { - ZeroParamStatus.AVAILABLE - if prefetching else ZeroParamStatus.NOT_AVAILABLE - }) - - loss = self.loss(y_hat, y) - - _assert_partition_status( - self, - { - ZeroParamStatus.NOT_AVAILABLE, - ZeroParamStatus.INFLIGHT, - ZeroParamStatus.AVAILABLE - } if prefetching else {ZeroParamStatus.NOT_AVAILABLE}) - - return { - "hidden1": hidden1, - "hidden2": hidden2, - "y_hat": y_hat, - "loss": loss, - } - - -@pytest.mark.parametrize("param_persistence_threshold", [0, 10]) -@pytest.mark.parametrize("fp16_enabled", [True, False]) -@pytest.mark.parametrize("contiguous_gradients", [True, False]) -@pytest.mark.parametrize("offload_optimizer", [True, False]) -@pytest.mark.parametrize("zero_grad", [True, False]) -@pytest.mark.parametrize("iteration", list(range(1))) -def test_zero3_param_partitioning_base( - param_persistence_threshold: int, - fp16_enabled: bool, - contiguous_gradients: bool, - offload_optimizer: bool, - zero_grad: bool, - iteration: int, -) -> None: - @distributed_test(world_size=[2]) - def _test_zero3_param_partitioning(): - if offload_optimizer and not contiguous_gradients: - return - - m = 3 - n = 5 - weights = [Parameter(torch.zeros((m, n), dtype=torch.float32)) for _ in range(3)] - model = EltwiseMultiplicationTestNetwork(*weights) - - cfg = { - "train_micro_batch_size_per_gpu": 1, - "zero_optimization": { - "stage": 3, - "stage3_max_reuse_distance": 0, - "stage3_param_persistence_threshold": param_persistence_threshold, - "contiguous_gradients": contiguous_gradients, - }, - "optimizer": { - "type": "Adam", - "params": { - "lr": 1. 
- } - }, - "fp16": { - "enabled": fp16_enabled, - "loss_scale": 1., - } - } - - if offload_optimizer: - cfg["zero_optimization"]["offload_optimizer"] = { - "device": "cpu", - "pin_memory": True, - } - - ds_engine = _ds_initialize_for_param_partitioning_testing(model, cfg) - for i, weight in enumerate(weights): - weight.ds_tensor.data = torch.full_like(weight.ds_tensor.data, - (i + 1) * (1 + dist.get_rank())) - - def create_tensor(vals, dtype: torch.dtype = None) -> Tensor: - return torch.as_tensor(vals, - dtype=dtype - or (torch.float16 if fp16_enabled else torch.float32), - device=ds_engine.device) - - expected_hidden1 = create_tensor([ - [1, - 1, - 1, - 1, - 1], - [1, - 1, - 1, - 2, - 2], - [2, - 2, - 2, - 2, - 2], - ]) - expected_hidden2 = create_tensor([ - [2, - 2, - 2, - 2, - 2], - [2, - 2, - 2, - 8, - 8], - [8, - 8, - 8, - 8, - 8], - ]) - expected_yhat = create_tensor([[6, - 6, - 6, - 6, - 6], - [6, - 6, - 6, - 48, - 48], - [48, - 48, - 48, - 48, - 48]]) - expected_loss = create_tensor([ - [5, - 5, - 5, - 5, - 5], - [5, - 5, - 5, - 47, - 47], - [47, - 47, - 47, - 47, - 47], - ]) - - for train_iter in range(3): - activations = ds_engine( - x=torch.ones((m, - n), - dtype=torch.float16 if fp16_enabled else torch.float32, - device=ds_engine.device), - y=torch.ones((m, - n), - dtype=torch.float16 if fp16_enabled else torch.float32, - device=ds_engine.device), - prefetching=train_iter > 0, - ) - assert torch.allclose(activations["hidden1"], expected_hidden1) - assert torch.allclose(activations["hidden2"], expected_hidden2) - assert torch.allclose(activations["y_hat"], expected_yhat) - assert torch.allclose(activations["loss"], expected_loss) - - ds_engine.backward(activations["loss"].sum()) - - # check the gradients - grad_partitions = ds_engine.optimizer.get_fp32_grad_partitions() - assert set(grad_partitions.keys()) == {0}, f"should have one parameter group but got {len(grad_partitions)}" - assert set(grad_partitions[0].keys()) == {0, 1, 2} - dloss_wrt_layer1 = grad_partitions[0][0] - dloss_wrt_layer2 = grad_partitions[0][1] - dloss_wrt_layer3 = grad_partitions[0][2] - - assert dloss_wrt_layer1.dtype == torch.float - assert dloss_wrt_layer2.dtype == torch.float - assert dloss_wrt_layer3.dtype == torch.float - - # layer1 = [..., 1, 2, ...] - # layer2 = [..., 2, 4, ...] - # layer3 = [..., 3, 6, ...] - # dloss_wrt_layer3 = hidden2 - # dloss_wrt_layer2 = layer3 * hidden1 - # dloss_wrt_layer1 = layer3 * layer2 * x - - grad_multiplier = 1 if zero_grad else (train_iter + 1) - if dist.get_rank() == 0: - assert torch.allclose( - dloss_wrt_layer3.cuda(), - grad_multiplier * create_tensor([2] * 8, - torch.float)) - assert torch.allclose( - dloss_wrt_layer2.cuda(), - grad_multiplier * create_tensor([3 * 1] * 8, - torch.float)) - assert torch.allclose( - dloss_wrt_layer1.cuda(), - grad_multiplier * create_tensor([3 * 2 * 1] * 8, - torch.float)) - elif dist.get_rank() == 1: - # parameters dont split evenly across ranks so rank 1 has a zero-padded - # partition - assert torch.allclose( - dloss_wrt_layer3.cuda(), - grad_multiplier * create_tensor(([8] * 7) + [0], - torch.float)) - assert torch.allclose( - dloss_wrt_layer2.cuda(), - grad_multiplier * create_tensor(([6 * 2] * 7) + [0], - torch.float)) - assert torch.allclose( - dloss_wrt_layer1.cuda(), - grad_multiplier * create_tensor(([6 * 4 * 1] * 7) + [0], - torch.float)) - else: - raise RuntimeError("test has world size of two") - - if zero_grad: - ds_engine.optimizer.zero_grad() - - # TODO. 
add testing for this - for now we just call it to make sure it - # doesn't throw - ds_engine.optimizer.step() - # taking an optimizer step invalidates all parameters, make sure everything - # has been partitioned afterwards - _assert_partition_status(ds_engine, {ZeroParamStatus.NOT_AVAILABLE}) - assert not math.isclose(ds_engine.optimizer._global_grad_norm, 0.0) - - _test_zero3_param_partitioning() - - -@pytest.mark.parametrize("world_sz", [1, 2, 4]) -@pytest.mark.parametrize("param_sz", [8100]) -@pytest.mark.parametrize("init_context_manager", [True, False]) -def test_zero3_param_partitioning_large_param(world_sz: int, - param_sz: int, - init_context_manager: bool) -> None: - class LargeParamModel(Module): - def __init__(self): - super().__init__() - self.param = Parameter(torch.zeros((param_sz, ), dtype=torch.float32)) - - # only do weight initialization on root rank to - # make sure we are broadcasting correctly from rank 0 - if dist.get_rank() == 0: - partition_sz = math.ceil(self.param.numel() / dist.get_world_size()) - offset = 0 - for rank in range(dist.get_world_size()): - with torch.no_grad(): - self.param[offset:offset + partition_sz].fill_(rank) - offset += partition_sz - - def forward(self, x: Tensor) -> Tensor: - return x * self.param - - @distributed_test(world_size=[world_sz]) - def _distributed_test(): - ds_config = { - "train_micro_batch_size_per_gpu": 1, - "zero_optimization": { - "stage": 3, - "stage3_max_reuse_distance": 0, - "contiguous_gradients": True, - "overlap_comm": True, - }, - "optimizer": { - "type": "Adam", - "params": { - "lr": 1. - } - }, - "fp16": { - "enabled": True, - "loss_scale": 1., - } - } - with deepspeed.zero.Init(mem_efficient_linear=False, - enabled=init_context_manager): - model = LargeParamModel() - ds_engine = _ds_initialize_for_param_partitioning_testing(model, ds_config) - - for train_iter in range(3): # test multiple iterations to cover prefetching - activation: Tensor = ds_engine( - torch.ones(param_sz, - dtype=torch.float16, - device=ds_engine.device)) - - partition_sz = math.ceil(param_sz / world_sz) - for rank_idx, start_idx in enumerate(range(0, param_sz, partition_sz)): - activation_from_partition = activation[start_idx:start_idx + - partition_sz] - assert torch.allclose( - activation_from_partition, - torch.full_like(activation_from_partition, - rank_idx)) - - ds_engine.backward(activation.sum()) - ds_engine.allreduce_gradients() - - avgd_gradients = ds_engine.optimizer.averaged_gradients - assert set(avgd_gradients.keys()) == {0}, "should only have one parameter group" - weight_gradient, = avgd_gradients[0] - expected_weight_gradient = (train_iter + 1) * torch.full_like( - weight_gradient, - 1) - - assert torch.allclose(weight_gradient, expected_weight_gradient) - - _distributed_test() - - -@pytest.mark.parametrize("world_sz", [1, 2, 4]) -@pytest.mark.parametrize("param_sz", [100, 1_000, 10_000]) -@pytest.mark.parametrize("n_layers", [100, 1_000]) -@pytest.mark.parametrize("init_context_manager", [True, False]) -def test_zero3_param_partitioning_many_params(world_sz: int, - param_sz: int, - n_layers: int, - init_context_manager: bool) -> None: - class ManyParamModel(Module): - def __init__(self) -> None: - super().__init__() - - self.modulelist = ModuleList( - EltwiseMultiplicationModule( - weight=Parameter(torch.empty((param_sz, - ), - dtype=torch.float32))) - for _ in range(n_layers)) - - for layer_num, module in enumerate(self.modulelist): - with deepspeed.zero.GatheredParameters(module.weight, modifier_rank=0): - param: 
Parameter = module.weight - partition_sz = math.ceil(param.numel() / dist.get_world_size()) - offset = 0 - for rank in range(dist.get_world_size()): - with torch.no_grad(): - param[offset:offset + partition_sz].fill_(2 * layer_num * - rank) - offset += partition_sz - - def forward(self, x: Tensor) -> Tensor: - activations = [] - - for module in self.modulelist: - x = module(x) - activations.append(x) - - return activations - - @distributed_test(world_size=[world_sz]) - def _distributed_test(): - ds_cfg = { - "train_micro_batch_size_per_gpu": 1, - "zero_optimization": { - "stage": 3, - "stage3_max_reuse_distance": 0, - "contiguous_gradients": True, - "overlap_comm": True, - }, - "optimizer": { - "type": "Adam", - "params": { - "lr": 1. - } - }, - "fp16": { - "enabled": True, - "loss_scale": 1., - } - } - - with deepspeed.zero.Init(config=ds_cfg, - mem_efficient_linear=False, - enabled=init_context_manager): - model = ManyParamModel() - - ds_engine = _ds_initialize_for_param_partitioning_testing(model, ds_cfg) - - for _ in range(3): # test multiple iterations to cover prefetching - activations: List[Tensor] = ds_engine( - torch.ones((param_sz, - ), - dtype=torch.float16, - device=ds_engine.device)) - assert len(activations) == n_layers - - partition_sz = math.ceil(param_sz / world_sz) - expected_activations = torch.empty(param_sz, - dtype=torch.float16, - device=ds_engine.device) - for start_idx in range(0, param_sz, partition_sz): - expected_activations[start_idx:start_idx + - partition_sz] = dist.get_rank() - - for layer_num, activation in enumerate(activations): - expected_activations *= 2 * layer_num - assert torch.allclose(activation, expected_activations) - - # TODO. finish writing this test - ds_engine.backward(activations[-1].sum()) - - avgd_gradients = ds_engine.optimizer.averaged_gradients - assert set(avgd_gradients.keys()) == {0}, "should only have one parameter group" - weight_gradients: List[Tensor] = avgd_gradients[0] - - for layer_num, activation in enumerate(weight_gradients): - pass - - _distributed_test() - - -@pytest.mark.parametrize("world_sz", [1, 2, 4]) -def test_zero3_init_for_parent_weight_initialization(world_sz): - class ModelWhereParentInitializesChildWeights(Module): - def __init__(self) -> None: - super().__init__() - - self.linear = Linear(12, 1) - - self.apply(self.__init_weights) - - def __init_weights(self, module): - if isinstance(module, Linear): - with torch.no_grad(): - module.weight.fill_(1 + dist.get_rank()) - - @distributed_test(world_size=[world_sz]) - def _distributed_test(): - ds_cfg = { - "train_micro_batch_size_per_gpu": 1, - "zero_optimization": { - "stage": 3, - "stage3_max_reuse_distance": 0, - "contiguous_gradients": True, - "overlap_comm": True, - }, - "optimizer": { - "type": "Adam", - "params": { - "lr": 1. 
- } - }, - "fp16": { - "enabled": True, - "loss_scale": 1., - } - } - - with deepspeed.zero.Init(config=ds_cfg, - mem_efficient_linear=False, - enabled=True): - model = ModelWhereParentInitializesChildWeights() - - assert model.linear.weight.ds_tensor.numel() == math.ceil(12 / world_sz) - assert torch.allclose(model.linear.weight.ds_tensor, - torch.full_like(model.linear.weight.ds_tensor, - 1)) - - _distributed_test() - - -@pytest.mark.skip( - reason="depends on upgraded pytorch and nccl that isn't always available") -@pytest.mark.parametrize("param_persistence_threshold", [0, 10]) -@pytest.mark.parametrize("contiguous_gradients", [True, False]) -@pytest.mark.parametrize("offload_optimizer", [True, False]) -@pytest.mark.parametrize("zero_grad", [True]) -@pytest.mark.parametrize("iteration", list(range(1))) -def test_zero3_param_partitioning_base_bf16( - param_persistence_threshold: int, - contiguous_gradients: bool, - offload_optimizer: bool, - zero_grad: bool, - iteration: int, -) -> None: - @distributed_test(world_size=[2]) - def _test_zero3_param_partitioning(): - if offload_optimizer and not contiguous_gradients: - return - - m = 3 - n = 5 - weights = [Parameter(torch.zeros((m, n), dtype=torch.float32)) for _ in range(3)] - model = EltwiseMultiplicationTestNetwork(*weights) - - cfg = { - "train_micro_batch_size_per_gpu": 1, - "zero_optimization": { - "stage": 3, - "stage3_max_reuse_distance": 0, - "stage3_param_persistence_threshold": param_persistence_threshold, - "contiguous_gradients": contiguous_gradients, - }, - "optimizer": { - "type": "Adam", - "params": { - "lr": 1. - } - }, - "bf16": { - "enabled": True, - "loss_scale": 1., - } - } - - if offload_optimizer: - cfg["zero_optimization"]["offload_optimizer"] = { - "device": "cpu", - "pin_memory": True, - } - - ds_engine = _ds_initialize_for_param_partitioning_testing(model, cfg) - for i, weight in enumerate(weights): - weight.ds_tensor.data = torch.full_like(weight.ds_tensor.data, - (i + 1) * (1 + dist.get_rank())) - - def create_tensor(vals): - return torch.as_tensor(vals, dtype=torch.bfloat16, device=ds_engine.device) - - expected_hidden1 = create_tensor([ - [1, - 1, - 1, - 1, - 1], - [1, - 1, - 1, - 2, - 2], - [2, - 2, - 2, - 2, - 2], - ]) - expected_hidden2 = create_tensor([ - [2, - 2, - 2, - 2, - 2], - [2, - 2, - 2, - 8, - 8], - [8, - 8, - 8, - 8, - 8], - ]) - expected_yhat = create_tensor([[6, - 6, - 6, - 6, - 6], - [6, - 6, - 6, - 48, - 48], - [48, - 48, - 48, - 48, - 48]]) - expected_loss = create_tensor([ - [5, - 5, - 5, - 5, - 5], - [5, - 5, - 5, - 47, - 47], - [47, - 47, - 47, - 47, - 47], - ]) - - for train_iter in range(3): - _assert_partition_status(ds_engine, {ZeroParamStatus.NOT_AVAILABLE}) - activations = ds_engine( - x=torch.ones((m, - n), - dtype=torch.bfloat16, - device=ds_engine.device), - y=torch.ones((m, - n), - dtype=torch.bfloat16, - device=ds_engine.device), - prefetching=train_iter > 0, - ) - assert torch.allclose(activations["hidden1"], expected_hidden1) - assert torch.allclose(activations["hidden2"], expected_hidden2) - assert torch.allclose(activations["y_hat"], expected_yhat) - assert torch.allclose(activations["loss"], expected_loss) - - ds_engine.backward(activations["loss"].sum()) - _assert_partition_status(ds_engine, {ZeroParamStatus.NOT_AVAILABLE}) - - # check the gradients - grad_partitions = ds_engine.optimizer.get_fp32_grad_partitions() - assert set(grad_partitions.keys()) == {0}, f"should have one parameter group but got {len(grad_partitions)}" - assert set(grad_partitions[0].keys()) == {0, 
1, 2} - dloss_wrt_layer1 = grad_partitions[0][0] - dloss_wrt_layer2 = grad_partitions[0][1] - dloss_wrt_layer3 = grad_partitions[0][2] - - # layer1 = [..., 1, 2, ...] - # layer2 = [..., 2, 4, ...] - # layer3 = [..., 3, 6, ...] - # dloss_wrt_layer3 = hidden2 - # dloss_wrt_layer2 = layer3 * hidden1 - # dloss_wrt_layer1 = layer3 * layer2 * x - - expected_grad_dtype = torch.float32 if offload_optimizer else torch.bfloat16 - - grad_multiplier = 1 if zero_grad else (train_iter + 1) - if dist.get_rank() == 0: - assert torch.allclose( - dloss_wrt_layer3.cuda(), - grad_multiplier * create_tensor([2] * 8).to(expected_grad_dtype)) - assert torch.allclose( - dloss_wrt_layer2.cuda(), - grad_multiplier * create_tensor([3 * 1] * 8).to(expected_grad_dtype)) - assert torch.allclose( - dloss_wrt_layer1.cuda(), - grad_multiplier * - create_tensor([3 * 2 * 1] * 8).to(expected_grad_dtype)) - elif dist.get_rank() == 1: - # parameters dont split evenly across ranks so rank 1 has a zero-padded - # partition - assert torch.allclose( - dloss_wrt_layer3.cuda(), - grad_multiplier * - create_tensor(([8] * 7) + [0]).to(expected_grad_dtype)) - assert torch.allclose( - dloss_wrt_layer2.cuda(), - grad_multiplier * - create_tensor(([6 * 2] * 7) + [0]).to(expected_grad_dtype)) - assert torch.allclose( - dloss_wrt_layer1.cuda(), - grad_multiplier * - create_tensor(([6 * 4 * 1] * 7) + [0]).to(expected_grad_dtype)) - else: - raise RuntimeError("test has world size of two") - - if zero_grad: - ds_engine.optimizer.zero_grad() - - # TODO. add testing for this - for now we just call it to make sure it - # doesn't throw - ds_engine.optimizer.step() - _assert_partition_status(ds_engine, {ZeroParamStatus.NOT_AVAILABLE}) - - _test_zero3_param_partitioning() - - -def test_zero_offload_stage1(): - config_dict = { - "train_batch_size": 4, - "gradient_accumulation_steps": 2, - "steps_per_print": 1, - "optimizer": { - "type": "Adam", - "params": { - "lr": 1e-4 - } - }, - "fp16": { - "enabled": True - }, - "zero_optimization": { - "stage": 1, - "offload_optimizer": { - "device": "cpu" - } - } - } - - hidden_dim = 10 - model = SimpleModel(hidden_dim) - - @distributed_test(world_size=[2]) - def _go(model, hidden_dim): - model, _, _, _ = deepspeed.initialize(model=model, - model_parameters=model.parameters(), - config=config_dict) - data_loader = random_dataloader(model=model, - total_samples=50, - hidden_dim=hidden_dim, - device=model.device) - torch.distributed.barrier() - for n, batch in enumerate(data_loader): - loss = model(batch[0], batch[1]) - model.backward(loss) - model.step() - - _go(model=model, hidden_dim=hidden_dim) - - -@pytest.mark.parametrize('return_type', [tuple, list, dict]) -def test_z3_dict_fwd(return_type): - config_dict = { - "train_batch_size": 4, - "steps_per_print": 1, - "optimizer": { - "type": "Adam", - "params": { - "lr": 1e-4 - } - }, - "fp16": { - "enabled": True - }, - "zero_optimization": { - "stage": 3 - } - } - hidden_dim = 10 - - class MyModel(torch.nn.Module): - def __init__(self, hidden_dim): - super(MyModel, self).__init__() - self.l1 = torch.nn.Linear(hidden_dim, hidden_dim) - self.cel = torch.nn.CrossEntropyLoss() - - def forward(self, x, y): - x = self.l1(x) - loss = self.cel(x, y) - if return_type == dict: - val = {'a': x, 'loss': loss, 'b': 1, 'c': None} - elif return_type == list: - val = [x, loss] - elif return_type == tuple: - val = (x, loss) - else: - raise NotImplementedError - return val - - @distributed_test(world_size=[1]) - def _go(hidden_dim): - with deepspeed.zero.Init(): - model = 
MyModel(hidden_dim) - - model, _, _, _ = deepspeed.initialize(model=model, - model_parameters=model.parameters(), - config=config_dict) - data_loader = random_dataloader(model=model, - total_samples=50, - hidden_dim=hidden_dim, - device=model.device) - torch.distributed.barrier() - for n, batch in enumerate(data_loader): - loss = model(batch[0], batch[1]) - if return_type == dict: - loss = loss['loss'] - else: - loss = loss[1] - model.backward(loss) - model.step() - - _go(hidden_dim) diff --git a/tests/unit/test_zero_context.py b/tests/unit/test_zero_context.py deleted file mode 100644 index 66521e075ce14fd056825d6ea5208738dfe68ba6..0000000000000000000000000000000000000000 --- a/tests/unit/test_zero_context.py +++ /dev/null @@ -1,361 +0,0 @@ -import os -from types import SimpleNamespace - -import torch -import pytest - -import deepspeed -from deepspeed.runtime.zero.partition_parameters import ZeroParamStatus, partitioned_param_data_shape - -from .common import distributed_test, get_master_port - - -def setup_serial_env(): - # Setup for a serial run - os.environ['MASTER_ADDR'] = '127.0.0.1' - os.environ['MASTER_PORT'] = get_master_port() - os.environ['LOCAL_RANK'] = '0' - os.environ['RANK'] = '0' - os.environ['WORLD_SIZE'] = '1' - - -def test_scattered_init_dist(): - setup_serial_env() - assert not torch.distributed.is_initialized() - with deepspeed.zero.Init(): - assert torch.distributed.is_initialized() - - -@distributed_test(world_size=2) -def test_scatter_gather(): - with deepspeed.zero.Init(): - l = torch.nn.Linear(6, 3) - assert l.weight.ds_status == ZeroParamStatus.NOT_AVAILABLE - assert l.weight.shape == torch.Size(partitioned_param_data_shape) - - # Ensure there is no impact outside the context - l2 = torch.nn.Linear(6, 3) - assert not hasattr(l2.weight, 'ds_status') - assert l2.weight.numel() == l2.in_features * l2.out_features - - with deepspeed.zero.GatheredParameters(l.weight): - assert l.weight.ds_status == ZeroParamStatus.AVAILABLE - assert l.weight.numel() == l.in_features * l.out_features - - -@distributed_test(world_size=2) -def test_gather_update(): - with deepspeed.zero.Init(): - l = torch.nn.Linear(4, 2) - assert l.weight.ds_status == ZeroParamStatus.NOT_AVAILABLE - - # Gather and make a change - with deepspeed.zero.GatheredParameters(l.weight, modifier_rank=1): - assert l.weight.ds_status == ZeroParamStatus.AVAILABLE - if torch.distributed.get_rank() == 1: - with torch.no_grad(): - l.weight.zero_() - - # should now be scattered again - - # Now gather again and ensure the change is global - with deepspeed.zero.GatheredParameters(l.weight): - # all ranks compare - assert torch.equal(l.weight, torch.zeros_like(l.weight)) - - -config = { - "train_batch_size": 1, - "steps_per_print": 1, - "optimizer": { - "type": "Adam", - "params": { - "lr": 0.00015 - } - }, - "fp16": { - "enabled": True, - "loss_scale": 138. 
- }, - "zero_optimization": { - "stage": 3, - "stage3_param_persistence_threshold": 1, - } -} - - -def test_ext_param_getattr(): - setup_serial_env() - - class ExtLinear(torch.nn.Module): - def __init__(self, dim=16): - super().__init__() - self.dim = dim - self.linear1 = torch.nn.Linear(dim, dim) - self.linear2 = torch.nn.Linear(dim, dim) - - def forward(self, input): - A = self.linear1(input) - B = self.linear2(A) - - # external use of self.linear1.weight - C = torch.nn.functional.linear(B, self.linear1.weight) - return C.sum() - - net = ExtLinear() - - args = SimpleNamespace(local_rank=0) - engine, optim, _, _ = deepspeed.initialize(args=args, - model=net, - model_parameters=net.parameters(), - config=config) - - with deepspeed.zero.GatheredParameters(net.linear1.weight): - assert net.linear1.weight.numel() == net.dim**2 - - input = torch.rand(net.dim).to(engine.device).half() - loss = engine(input) - engine.backward(loss) - engine.step() - - -def test_scatter_halftype(): - setup_serial_env() - - with deepspeed.zero.Init(): - l = torch.nn.Linear(10, 10) - assert l.weight.ds_tensor.dtype == torch.float16 - - y = torch.LongTensor([3, 3]) - assert y.dtype == torch.long - - -class DanglingBias(torch.nn.Linear): - def forward(self, *inputs): - out = super().forward(*inputs) - # return the bias to trigger a dangling external param - return out, self.bias - - -class DataClass: - """Just wraps data in an object. """ - def __init__(self, out=None, bias=None): - self.out = out - self.bias = bias - - -class DanglingBiasClass(DanglingBias): - def forward(self, *inputs): - out, bias = super().forward(*inputs) - return DataClass(out=out, bias=bias) - - -class DanglingAttention(torch.nn.Linear): - def __init__(self, dim=16, return_obj=False): - super().__init__(dim, dim) - self.dim = dim - self.return_obj = return_obj - if return_obj: - self.d_linear = DanglingBiasClass(dim, dim) - else: - self.d_linear = DanglingBias(dim, dim) - - def forward(self, input): - out = super().forward(input) - if self.return_obj: - out_obj = self.d_linear(out) - assert out_obj.bias.ds_status == ZeroParamStatus.AVAILABLE - # forward the external param - return out_obj.out, out_obj.bias - else: - out, bias = self.d_linear(out) - assert bias.ds_status == ZeroParamStatus.AVAILABLE - return out, bias - - -class ModelContainer(torch.nn.Module): - def __init__(self, dim=16, return_obj=False): - super().__init__() - self.dim = dim - self.linear1 = torch.nn.Linear(dim, dim) - self.dangler = DanglingAttention(dim, return_obj=return_obj) - - def forward(self, input): - act1 = self.linear1(input) - # bias is actually dangler.d_linear1.bias - act2, bias = self.dangler(act1) - assert bias.ds_status == ZeroParamStatus.AVAILABLE - return (act2 + bias).sum() - - -class DanglingExt(torch.nn.Module): - def __init__(self, dim=16): - super().__init__() - self.dim = dim - self.container = ModelContainer(dim) - - def forward(self, input): - out = self.container(input) - - # Make sure it's at the right level of the stack - assert len(self._external_params) == 0 - assert len(self.container._external_params) == 1 - assert len(self.container.dangler._external_params) == 0 - return out - - -def test_ext_param_return(): - setup_serial_env() - - net = DanglingExt() - - args = SimpleNamespace(local_rank=0) - engine, optim, _, _ = deepspeed.initialize(args=args, - model=net, - model_parameters=net.parameters(), - config=config) - - for _ in range(5): - input = torch.rand(net.dim).to(engine.device).half() - loss = engine(input) - engine.backward(loss) - 
engine.step() - - -@pytest.mark.skip('WIP') -def test_ext_param_returnobj(): - setup_serial_env() - print() - - net = ModelContainer(return_obj=True) - - args = SimpleNamespace(local_rank=0) - engine, optim, _, _ = deepspeed.initialize(args=args, - model=net, - model_parameters=net.parameters(), - config=config) - - for _ in range(5): - input = torch.rand(net.dim).to(engine.device).half() - loss = engine(input) - assert len(net._external_params) == 1 - assert len(net.dangler._external_params) == 0 - engine.backward(loss) - engine.step() - - -class ModelContainerVariableOutputType(ModelContainer): - def __init__(self, dim=16, output_type=dict): - super().__init__() - self.output_type = output_type - self.dim = dim - self.linear1 = torch.nn.Linear(dim, dim) - - def forward(self, input): - act1 = self.linear1(input) - if self.output_type is dict: - return {'loss': act1.sum()} - if self.output_type is torch.tensor: - return act1.sum() - - -@pytest.mark.parametrize('output_type', [torch.tensor, dict, None]) -def test_stage_3_output_type(output_type): - setup_serial_env() - print() - - net = ModelContainerVariableOutputType(output_type=output_type) - - args = SimpleNamespace(local_rank=0) - engine, optim, _, _ = deepspeed.initialize(args=args, - model=net, - model_parameters=net.parameters(), - config=config) - - for _ in range(1): - input = torch.rand(net.dim).to(engine.device).half() - loss = engine(input) - if loss is not None: - if isinstance(loss, dict): - loss = loss['loss'] - engine.backward(loss) - engine.step() - - -# Test that no sub-class or super-class is missed -class ConvX(torch.nn.Conv1d): - def __init__(self, *args): - super().__init__(*args) - # This would not be partitioned before bugfix 5ca8167 - self.param_in = torch.nn.Parameter(torch.FloatTensor(5).uniform_()) - - def forward(self, x): - return x - - -class ConvNet(torch.nn.Module): - def __init__(self): - super().__init__() - self.conv1 = ConvX(1, 3, 4) - self.param = torch.nn.Parameter(torch.FloatTensor(5).uniform_()) - - def forward(self, x): - return x - - -def test_subclass_param(): - setup_serial_env() - with deepspeed.zero.Init(config=config): - model = ConvNet() - - assert model.param.ds_status == ZeroParamStatus.NOT_AVAILABLE - assert model.conv1.param_in.ds_status == ZeroParamStatus.NOT_AVAILABLE - - -# test that sub-classes get params that aren't prematurely partitioned and thus requiring gathering -# fixed by https://github.com/microsoft/DeepSpeed/pull/1202 -class GrandPa(torch.nn.Module): - def __init__(self, *args): - super().__init__(*args) - self.param_grandpa = torch.nn.Parameter(torch.ones(5)) - self.param_grandpa.data = (self.param_grandpa.data + - 1).data # test param is not yet partitioned - - -class Pa(GrandPa): - def __init__(self, *args): - super().__init__(*args) - self.param_pa = torch.nn.Parameter(torch.ones(5)) - self.param_pa.data = (self.param_pa.data + - 1).data # test param is not yet partitioned - self.param_grandpa.data = (self.param_grandpa.data + - 1).data # test param is not yet partitioned - - -class Son(Pa): - def __init__(self): - super().__init__() - self.param = torch.nn.Parameter(torch.ones(5)) - self.param.data = (self.param.data + 1).data # test param is not yet partitioned - self.param_pa.data = (self.param_pa.data + - 1).data # test param is not yet partitioned - self.param_grandpa.data = (self.param_grandpa.data + - 1).data # test param is not yet partitioned - - -def test_subclass_param_init(): - setup_serial_env() - with deepspeed.zero.Init(config=config): - model = 
Son().cpu() - - # test that all params have been partitioned - assert model.param_grandpa.ds_status == ZeroParamStatus.NOT_AVAILABLE - assert model.param_pa.ds_status == ZeroParamStatus.NOT_AVAILABLE - assert model.param.ds_status == ZeroParamStatus.NOT_AVAILABLE - - # test that the weights manipulation during each __init__ worked in all w/o needing gathering - ones = torch.ones(5).half().cuda() - with deepspeed.zero.GatheredParameters(list(model.parameters(recurse=False))): - assert torch.equal(model.param, ones + 1) - assert torch.equal(model.param_pa, ones + 2) - assert torch.equal(model.param_grandpa, ones + 3) diff --git a/tests/unit/test_zero_tiled.py b/tests/unit/test_zero_tiled.py deleted file mode 100644 index a8b63b11d32af63db5a0962831c14944934c3d48..0000000000000000000000000000000000000000 --- a/tests/unit/test_zero_tiled.py +++ /dev/null @@ -1,169 +0,0 @@ -import copy - -import torch -import deepspeed -from deepspeed.runtime.zero.tiling import TiledLinear, TiledLinearReturnBias - -import pytest - - -@pytest.mark.parametrize('in_splits,out_splits', [(1, 1), (2, 2), (5, 5), (32, 32)]) -def test_tiled_init(in_splits, out_splits): - in_f = 32 - out_f = 40 - base = torch.nn.Linear(in_f, out_f, bias=True) - l = TiledLinear(in_f, - out_f, - bias=True, - init_linear=copy.deepcopy(base), - out_splits=out_splits, - in_splits=in_splits) - - for out_id in range(out_splits): - for in_id in range(in_splits): - local_l = l.linears[out_id][in_id] - assert isinstance(local_l, torch.nn.Linear) - - rstart = l.out_parts[out_id] - rstop = l.out_parts[out_id + 1] - cstart = l.in_parts[in_id] - cstop = l.in_parts[in_id + 1] - - local_out = rstop - rstart - local_in = cstop - cstart - assert local_l.weight.size()[1] == local_in, f'local[{out_id}][{in_id}].size {local_l.weight.size()}' - assert local_l.weight.size()[0] == local_out - - test = base.weight[rstart:rstop, cstart:cstop] - - assert local_l.weight.size() == test.size() - assert torch.equal(local_l.weight.data, test.data) - - if in_id == in_splits - 1: - assert local_l.bias is not None - assert local_l.bias.size()[0] == local_out - else: - assert local_l.bias is None - - -@pytest.mark.parametrize('in_splits,out_splits', [(0, 0), (33, 33)]) -def test_tiled_baddim(in_splits, out_splits): - dim = 32 - with pytest.raises(RuntimeError): - l = TiledLinear(dim, dim, out_splits=out_splits, in_splits=in_splits) - - -@pytest.mark.parametrize('bias', [False, True]) -@pytest.mark.parametrize('in_splits,out_splits', [(1, 1), (2, 2)]) -@pytest.mark.parametrize('in_f,out_f', [(32, 32), (23, 29), (29, 23)]) -def test_tiled_forward(in_splits, out_splits, bias, in_f, out_f): - base = torch.nn.Linear(in_f, out_f, bias=bias) - test = TiledLinear(in_f, - out_f, - bias=bias, - init_linear=copy.deepcopy(base), - out_splits=out_splits, - in_splits=in_splits) - - inp = torch.rand(in_f) - - base_out = base(copy.deepcopy(inp)) - test_out = test(copy.deepcopy(inp)) - - assert torch.allclose(base_out, test_out, rtol=1e-4) - - -@pytest.mark.parametrize('bias', [False, True]) -@pytest.mark.parametrize('in_splits,out_splits', [(1, 1), (2, 2)]) -@pytest.mark.parametrize('in_f,out_f', [(32, 32), (23, 29), (29, 23)]) -def test_tiled_backward(in_splits, out_splits, bias, in_f, out_f): - base = torch.nn.Linear(in_f, out_f, bias=bias) - test = TiledLinear(in_f, - out_f, - bias=bias, - init_linear=copy.deepcopy(base), - out_splits=out_splits, - in_splits=in_splits) - - inp = torch.rand(in_f) - - base_out = base(copy.deepcopy(inp)) - test_out = test(copy.deepcopy(inp)) - assert 
torch.allclose(base_out, test_out, rtol=1e-4) - - base_out.sum().backward() - test_out.sum().backward() - - # compare grads - for row in range(out_splits): - rstart = test.out_parts[row] - rstop = test.out_parts[row + 1] - - for col in range(in_splits): - cstart = test.in_parts[col] - cstop = test.in_parts[col + 1] - - local = test.linears[row][col] - base_grad = base.weight.grad[rstart:rstop, cstart:cstop] - assert torch.allclose(base_grad, local.weight.grad, rtol=1e-4) - - if local.bias is not None: - base_grad = base.bias.grad[rstart:rstop] - assert torch.allclose(base_grad, local.bias.grad, rtol=1e-4) - - -class LinearWrapper(torch.nn.Linear): - """Returns its own bias to simulate Megatron-LM's behavior. - - Megatron-LM optionally delays the bias addition to fuse with a proceeding kernel. - """ - def forward(self, input): - out = super().forward(input) - return out, self.bias - - -@pytest.mark.parametrize('bias', [False, True]) -@pytest.mark.parametrize('in_splits,out_splits', [(1, 1), (2, 2)]) -@pytest.mark.parametrize('in_f,out_f', [(32, 32), (23, 29), (29, 23)]) -def test_tiled_returnbias_backward(in_splits, out_splits, bias, in_f, out_f): - base = LinearWrapper(in_f, out_f, bias=bias) - test = TiledLinearReturnBias(in_f, - out_f, - bias=bias, - linear_cls=LinearWrapper, - init_linear=copy.deepcopy(base), - out_splits=out_splits, - in_splits=in_splits) - - inp = torch.rand(in_f) - - base_out_t, base_out_b = base(copy.deepcopy(inp)) - test_out_t, test_out_b = test(copy.deepcopy(inp)) - assert torch.allclose(base_out_t, test_out_t, rtol=1e-4) - if base_out_b is None: - assert test_out_b is None - base_out_b = torch.zeros_like(base_out_t) - test_out_b = torch.zeros_like(test_out_t) - else: - assert test_out_b is not None - assert torch.allclose(base_out_b, test_out_b, rtol=1e-4) - - (base_out_t + base_out_b).sum().backward() - (test_out_t + test_out_b).sum().backward() - - # compare grads - for row in range(out_splits): - rstart = test.out_parts[row] - rstop = test.out_parts[row + 1] - - for col in range(in_splits): - cstart = test.in_parts[col] - cstop = test.in_parts[col + 1] - - local = test.linears[row][col] - base_grad = base.weight.grad[rstart:rstop, cstart:cstop] - assert torch.allclose(base_grad, local.weight.grad, rtol=1e-4) - - if local.bias is not None: - base_grad = base.bias.grad[rstart:rstop] - assert torch.allclose(base_grad, local.bias.grad, rtol=1e-4) diff --git a/tests/unit/util.py b/tests/unit/util.py index 2face75846d7b32df09af1d0cf15abd79d33a787..b339a08056a2800252be43fbde845566d6c34206 100644 --- a/tests/unit/util.py +++ b/tests/unit/util.py @@ -1,9 +1,35 @@ -'''Copyright The Microsoft DeepSpeed Team''' +# Copyright (c) Microsoft Corporation. 
+# SPDX-License-Identifier: Apache-2.0 +# DeepSpeed Team + +import pytest import torch +import deepspeed from deepspeed.git_version_info import torch_info +def skip_on_arch(min_arch=7): + if deepspeed.accelerator.get_accelerator().device_name() == 'cuda': + if torch.cuda.get_device_capability()[0] < min_arch: #ignore-cuda + pytest.skip(f"needs higher compute capability than {min_arch}") + else: + assert deepspeed.accelerator.get_accelerator().device_name() == 'xpu' + return + + +def skip_on_cuda(valid_cuda): + split_version = lambda x: map(int, x.split('.')[:2]) + if deepspeed.accelerator.get_accelerator().device_name() == 'cuda': + CUDA_MAJOR, CUDA_MINOR = split_version(torch_info['cuda_version']) + CUDA_VERSION = (CUDA_MAJOR * 10) + CUDA_MINOR + if valid_cuda.count(CUDA_VERSION) == 0: + pytest.skip(f"requires cuda versions {valid_cuda}") + else: + assert deepspeed.accelerator.get_accelerator().device_name() == 'xpu' + return + + def required_torch_version(): TORCH_MAJOR = int(torch.__version__.split('.')[0]) TORCH_MINOR = int(torch.__version__.split('.')[1]) @@ -26,10 +52,8 @@ def bf16_required_version_check(accelerator_check=True): else: accelerator_pass = True - if (TORCH_MAJOR > 1 or - (TORCH_MAJOR == 1 and TORCH_MINOR >= 10)) and (CUDA_MAJOR >= 11) and ( - NCCL_MAJOR > 2 or - (NCCL_MAJOR == 2 and NCCL_MINOR >= 10)) and accelerator_pass: + if (TORCH_MAJOR > 1 or (TORCH_MAJOR == 1 and TORCH_MINOR >= 10)) and (CUDA_MAJOR >= 11) and ( + NCCL_MAJOR > 2 or (NCCL_MAJOR == 2 and NCCL_MINOR >= 10)) and accelerator_pass: return True else: return False diff --git a/tests/unit/utils/test_get_optim_files.py b/tests/unit/utils/test_get_optim_files.py index ccbd9626d6e85ebbb6e5507f010531d8ec68d556..4b84ebf10040ec9e61ee93c3c359bb2f7c4640ac 100644 --- a/tests/unit/utils/test_get_optim_files.py +++ b/tests/unit/utils/test_get_optim_files.py @@ -1,4 +1,7 @@ -'''Copyright The Microsoft DeepSpeed Team''' +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team import os import pytest diff --git a/tests/unit/utils/test_groups.py b/tests/unit/utils/test_groups.py index 06b391e2e3014997f0e8d5442849f9f5a7aabd34..d8f12be4f3c6f5e8f32127bbbb4983c5190adc37 100644 --- a/tests/unit/utils/test_groups.py +++ b/tests/unit/utils/test_groups.py @@ -1,4 +1,7 @@ -'''Copyright The Microsoft DeepSpeed Team''' +# Copyright (c) Microsoft Corporation. 
+# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team from deepspeed.utils.groups import _get_expert_parallel_ranks @@ -14,42 +17,22 @@ def test_get_expert_parallel_ranks(): expert_parallel_group = [0,2,4,6], [8,10,12,14] [1,3,5,7], [9,11,13,15] expert_data_parallel_group = [0,8],[2,10],[4,12],[6,14], [1,9],[3,11],[5,13],[7,15] """ - expert_parallel_groups, expert_data_parallel_groups = _get_expert_parallel_ranks( - world_size=16, model_parallel_size_=2, expert_parallel_size_=4 - ) + expert_parallel_groups, expert_data_parallel_groups = _get_expert_parallel_ranks(world_size=16, + model_parallel_size_=2, + expert_parallel_size_=4) assert expert_parallel_groups == [ - [0, - 2, - 4, - 6], - [8, - 10, - 12, - 14], - [1, - 3, - 5, - 7], - [9, - 11, - 13, - 15], + [0, 2, 4, 6], + [8, 10, 12, 14], + [1, 3, 5, 7], + [9, 11, 13, 15], ] assert expert_data_parallel_groups == [ - [0, - 8], - [2, - 10], - [4, - 12], - [6, - 14], - [1, - 9], - [3, - 11], - [5, - 13], - [7, - 15], + [0, 8], + [2, 10], + [4, 12], + [6, 14], + [1, 9], + [3, 11], + [5, 13], + [7, 15], ] diff --git a/tests/unit/utils/test_init_on_device.py b/tests/unit/utils/test_init_on_device.py index 25d102fd05a7a281a58bebac448d785cd87e0541..5d84e9be855af3d3bf38631565264bb7a4621d5e 100644 --- a/tests/unit/utils/test_init_on_device.py +++ b/tests/unit/utils/test_init_on_device.py @@ -1,4 +1,7 @@ -'''Copyright The Microsoft DeepSpeed Team''' +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team import torch import pytest @@ -14,8 +17,7 @@ class TestOnDevice(DistributedTest): world_size = 1 def test_on_device(self, device): - if device == "meta" and pkg_version.parse( - torch.__version__) < pkg_version.parse("1.10"): + if device == "meta" and pkg_version.parse(torch.__version__) < pkg_version.parse("1.10"): pytest.skip("meta tensors only became stable after torch 1.10") with OnDevice(dtype=torch.half, device=device): diff --git a/version.txt b/version.txt index 100435be135a32ae8974fe4dd281c4d3a9d62e02..2003b639c40025a4216b7b765e800b872a9052cd 100644 --- a/version.txt +++ b/version.txt @@ -1 +1 @@ -0.8.2 +0.9.2
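
A note on the reformatted test_groups.py hunk above: test_get_expert_parallel_ranks pins down how 16 ranks are partitioned when model_parallel_size_=2 and expert_parallel_size_=4. As a rough illustration only — sketch_expert_parallel_ranks below is a hypothetical helper, not DeepSpeed's _get_expert_parallel_ranks — the following minimal sketch reproduces exactly the groupings that test asserts:

def sketch_expert_parallel_ranks(world_size, model_parallel_size, expert_parallel_size):
    # Hypothetical sketch: reproduces the groupings asserted in
    # test_get_expert_parallel_ranks above, not the library's implementation.
    expert_parallel_groups = []
    expert_data_parallel_groups = []
    for mp_rank in range(model_parallel_size):
        # Ranks that share a model-parallel rank form one data-parallel set.
        dp_ranks = list(range(mp_rank, world_size, model_parallel_size))
        # Carve that set into contiguous expert-parallel groups.
        ep_groups = [
            dp_ranks[i:i + expert_parallel_size]
            for i in range(0, len(dp_ranks), expert_parallel_size)
        ]
        expert_parallel_groups.extend(ep_groups)
        # Ranks holding the same expert shard replicate it across data parallelism.
        for pos in range(expert_parallel_size):
            expert_data_parallel_groups.append([g[pos] for g in ep_groups])
    return expert_parallel_groups, expert_data_parallel_groups


ep, edp = sketch_expert_parallel_ranks(16, 2, 4)
assert ep == [[0, 2, 4, 6], [8, 10, 12, 14], [1, 3, 5, 7], [9, 11, 13, 15]]
assert edp == [[0, 8], [2, 10], [4, 12], [6, 14], [1, 9], [3, 11], [5, 13], [7, 15]]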