Commit eadbbe09 authored by 401qingkong

push rocm deepspeed v0.3.13

parent ab5534fc
"""
Copyright 2020 The Microsoft DeepSpeed Team
"""
import torch
from .transformer import TransformerBuilder
class StochasticTransformerBuilder(TransformerBuilder):
BUILD_VAR = "DS_BUILD_STOCHASTIC_TRANSFORMER"
NAME = "stochastic_transformer"
def __init__(self):
super().__init__(name=self.NAME)
def absolute_name(self):
return f'deepspeed.ops.transformer.{self.NAME}_op'
def nvcc_args(self):
args = super().nvcc_args()
args.append('-D__STOCHASTIC_MODE__')
return args
"""
Copyright 2020 The Microsoft DeepSpeed Team
"""
import torch
from .builder import CUDAOpBuilder
class TransformerBuilder(CUDAOpBuilder):
BUILD_VAR = "DS_BUILD_TRANSFORMER"
NAME = "transformer"
def __init__(self, name=None):
name = self.NAME if name is None else name
super().__init__(name=name)
def absolute_name(self):
return f'deepspeed.ops.transformer.{self.NAME}_op'
def sources(self):
return [
'csrc/transformer/ds_transformer_cuda.cpp',
'csrc/transformer/cublas_wrappers.cu',
'csrc/transformer/transform_kernels.cu',
'csrc/transformer/gelu_kernels.cu',
'csrc/transformer/dropout_kernels.cu',
'csrc/transformer/normalize_kernels.cu',
'csrc/transformer/softmax_kernels.cu',
'csrc/transformer/general_kernels.cu'
]
def include_paths(self):
return ['csrc/includes']
def nvcc_args(self):
args = [
'-O3',
'--use_fast_math',
'-std=c++14',
'-U__CUDA_NO_HALF_OPERATORS__',
'-U__CUDA_NO_HALF_CONVERSIONS__',
'-U__CUDA_NO_HALF2_OPERATORS__'
]
return args + self.compute_capability_args()
def cxx_args(self):
return ['-O3', '-std=c++14', '-g', '-Wno-reorder']
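# Usage sketch: these builder classes are normally consumed through DeepSpeed's
# JIT mechanism rather than instantiated directly by end users. Assuming the
# upstream OpBuilder.load() helper is available, the transformer kernels could
# be compiled and loaded lazily on first use, e.g.:
#
#     from deepspeed.ops.op_builder import TransformerBuilder
#     transformer_cuda = TransformerBuilder().load()  # JIT-builds sources() with nvcc_args()/cxx_args()
#
# Setting the BUILD_VAR above (DS_BUILD_TRANSFORMER=1) pre-builds the op at
# install time instead.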
"""
Copyright 2020 The Microsoft DeepSpeed Team
"""
from .builder import OpBuilder
class UtilsBuilder(OpBuilder):
BUILD_VAR = "DS_BUILD_UTILS"
NAME = "utils"
def __init__(self):
super().__init__(name=self.NAME)
def absolute_name(self):
return f'deepspeed.ops.{self.NAME}_op'
def sources(self):
return ['csrc/utils/flatten_unflatten.cpp']
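# Usage sketch: the utils op exposes the flatten/unflatten helpers used to pack
# a list of tensors into one contiguous buffer for communication. Assuming the
# upstream OpBuilder.load() JIT helper, a typical pattern might look like:
#
#     from deepspeed.ops.op_builder import UtilsBuilder
#     util_ops = UtilsBuilder().load()
#     flat = util_ops.flatten(tensor_list)           # one contiguous buffer
#     views = util_ops.unflatten(flat, tensor_list)  # views shaped like the originals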
'''
Copyright 2019 The Microsoft DeepSpeed Team
'''
from mpi4py import MPI
import numpy as np
import cupy
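# my_igather: nonblocking gather to `root`. The root rank posts an Irecv from
# every other rank into recbuf[idx] and copies its own contribution in place,
# while non-root ranks Isend their buffer to the root. The outstanding MPI
# requests are returned so the caller can Waitall on them later.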
def my_igather(rank, size, comm, sendbuf, recbuf, root):
req = []
if rank == root:
for idx in range(size):
if idx != rank:
req.append(comm.Irecv(recbuf[idx], source=idx))
else:
recbuf[rank] = sendbuf
else:
req.append(comm.Isend(sendbuf, dest=root))
return req
def gather_cuda(rank,
world_size,
comm,
cupy_sign_list_packed,
cupy_recvbuf_sign,
cupy_worker_scale,
cupy_recvbuf_scale):
# We do in-place operations on cupy buffers so we do not return any buffers
requests = []
for idx in range(world_size):
req_sign = my_igather(rank,
world_size,
comm,
cupy_sign_list_packed[idx],
cupy_recvbuf_sign,
root=idx)
requests += req_sign
for idx in range(world_size):
req_scale = my_igather(rank,
world_size,
comm,
cupy_worker_scale,
cupy_recvbuf_scale,
root=idx)
requests += req_scale
MPI.Request.Waitall(requests)
def gather_host(rank,
world_size,
comm,
cupy_sign_list_packed,
cupy_recvbuf_sign,
cupy_worker_scale,
cupy_recvbuf_scale):
# In-place operations are not possible for newly created cupy arrays
# so we need to return the new buffers
numpy_recvbuf_sign = np.zeros([world_size,
cupy_sign_list_packed[rank].size],
dtype=cupy_sign_list_packed[0].dtype)
numpy_recvbuf_scale = np.zeros([world_size, 1], dtype=cupy_worker_scale.dtype)
# 1. convert from cupy to numpy
numpy_sign_list_packed = cupy_sign_list_packed
for idx in range(world_size):
numpy_sign_list_packed[idx] = cupy.asnumpy(cupy_sign_list_packed[idx])
numpy_worker_scale = cupy.asnumpy(cupy_worker_scale)
numpy_recvbuf_scale = cupy.asnumpy(cupy_recvbuf_scale)
cupy.cuda.get_current_stream().synchronize()
# 2. use numpy buffers for communication
requests = []
for idx in range(world_size):
req_sign = my_igather(rank,
world_size,
comm,
numpy_sign_list_packed[idx],
numpy_recvbuf_sign,
root=idx)
requests += req_sign
for idx in range(world_size):
req_scale = my_igather(rank,
world_size,
comm,
numpy_worker_scale,
numpy_recvbuf_scale,
root=idx)
requests += req_scale
MPI.Request.Waitall(requests)
# 3. Convert back from numpy to cupy
cupy_recvbuf_sign = cupy.asarray(numpy_recvbuf_sign)
for idx in range(world_size):
cupy_sign_list_packed[idx] = cupy.asarray(numpy_sign_list_packed[idx])
cupy_worker_scale = cupy.asarray(numpy_worker_scale)
cupy_recvbuf_scale = cupy.asarray(numpy_recvbuf_scale)
cupy.cuda.get_current_stream().synchronize()
return cupy_sign_list_packed, cupy_recvbuf_sign, cupy_worker_scale, cupy_recvbuf_scale
def allgather_cuda(comm,
cupy_server_sign_packed,
cupy_recvbuf_sign_server,
cupy_server_scale,
cupy_recvbuf_scale_server):
comm.Allgather(cupy_server_sign_packed, cupy_recvbuf_sign_server)
comm.Allgather(cupy_server_scale, cupy_recvbuf_scale_server)
def allgather_host(comm,
cupy_server_sign_packed,
cupy_recvbuf_sign_server,
cupy_server_scale,
cupy_recvbuf_scale_server):
# 1. Convert cupy to numpy
numpy_recvbuf_sign_server = np.zeros([comm.Get_size(),
cupy_server_sign_packed.size],
dtype=cupy_server_sign_packed.dtype)
numpy_recvbuf_scale_server = np.zeros([comm.Get_size(),
1],
dtype=cupy_server_scale.dtype)
numpy_server_sign_packed = cupy.asnumpy(cupy_server_sign_packed)
numpy_recvbuf_sign_server = cupy.asnumpy(cupy_recvbuf_sign_server)
numpy_server_scale = cupy.asnumpy(cupy_server_scale)
numpy_recvbuf_scale_server = cupy.asnumpy(cupy_recvbuf_scale_server)
cupy.cuda.get_current_stream().synchronize()
# 2. Communicate numpy buffers
comm.Allgather(numpy_server_sign_packed, numpy_recvbuf_sign_server)
comm.Allgather(numpy_server_scale, numpy_recvbuf_scale_server)
comm.Barrier()
# 3. Convert numpy back to cupy
cupy_server_sign_packed = cupy.asarray(numpy_server_sign_packed)
cupy_recvbuf_sign_server = cupy.asarray(numpy_recvbuf_sign_server)
cupy_server_scale = cupy.asarray(numpy_server_scale)
cupy_recvbuf_scale_server = cupy.asarray(numpy_recvbuf_scale_server)
cupy.cuda.get_current_stream().synchronize()
return cupy_server_sign_packed, cupy_recvbuf_sign_server, cupy_server_scale, cupy_recvbuf_scale_server
......@@ -3,13 +3,10 @@ Copyright 2019 The Microsoft DeepSpeed Team
'''
import os
import stat
import torch
import warnings
import hashlib
import torch.distributed as dist
from collections import OrderedDict
from shutil import copyfile
from torch.nn.modules import Module
from torch.distributed.distributed_c10d import _get_global_rank
......@@ -388,9 +385,6 @@ class DeepSpeedEngine(Module):
def zero_param_persistence_threshold(self):
return self._config.zero_config.param_persistence_threshold
def zero_gather_fp16_weights_on_model_save(self):
return self._config.zero_config.gather_fp16_weights_on_model_save
def fp16_enabled(self):
return self._config.fp16_enabled
......@@ -681,12 +675,8 @@ class DeepSpeedEngine(Module):
from deepspeed.ops.lamb import FusedLamb
optimizer = FusedLamb(model_parameters, **optimizer_parameters)
elif self.optimizer_name() == ONEBIT_ADAM_OPTIMIZER:
from deepspeed.runtime.fp16.onebit.adam import OnebitAdam
from deepspeed.runtime.fp16.onebit_adam import OnebitAdam
optimizer = OnebitAdam(model_parameters, self, **optimizer_parameters)
if not self.fp16_enabled():
logger.warning(
f'Currently the convergence of 1-bit Adam is only verified under FP16'
)
else:
torch_optimizer = getattr(torch.optim, self.optimizer_name())
optimizer = torch_optimizer(model_parameters, **optimizer_parameters)
......@@ -955,7 +945,7 @@ class DeepSpeedEngine(Module):
Arguments:
loss: Torch tensor on which to execute backward propagation
allreduce_gradients: is deprecated, ignored, and will soon be removed
allreduce_gradients: If this is False, then gradient averaging will be skipped. Default is True.
"""
if not allreduce_gradients:
......@@ -1358,7 +1348,7 @@ class DeepSpeedEngine(Module):
zero_ckpt_name = os.path.join(
checkpoints_path,
str(tag),
filename + '_mp_rank_{:02d}'.format(mp_rank) + '_optim_states.pt')
filename + '_mp_rank_{:02d}'.format(mp_rank) + 'optim_states.pt')
return zero_ckpt_name
def _get_zero_ckpt_name(self, checkpoints_path, tag):
......@@ -1535,20 +1525,13 @@ class DeepSpeedEngine(Module):
mp_rank=mp_rank,
dp_world_size=self.loaded_checkpoint_dp_world_size)
invalid_zero_ckpt_paths = []
for i, ckpt_name in enumerate(zero_ckpt_names):
for ckpt_name in zero_ckpt_names:
if not os.path.exists(ckpt_name):
# transparently handle the old file pattern for optim_states
if 'optim_states.pt' in ckpt_name:
ckpt_name_try = ckpt_name.replace("_optim_states.pt",
"optim_states.pt")
if os.path.exists(ckpt_name_try):
zero_ckpt_names[i] = ckpt_name_try
continue
invalid_zero_ckpt_paths.append(ckpt_name)
if len(invalid_zero_ckpt_paths) > 0:
logger.warn(
f"The following zero checkpoints paths are missing: {invalid_zero_ckpt_paths}"
f"Client provided zero checkpoint load paths: {invalid_zero_ckpt_paths} does not exist"
)
return None
......@@ -1690,125 +1673,8 @@ class DeepSpeedEngine(Module):
torch.save(state, save_path)
self._curr_save_path = None
def _get_param_shapes(self):
param_shapes = OrderedDict()
for name, param in self.module.named_parameters():
param_shapes[name] = param.ds_shape if hasattr(param,
"ds_shape") else param.shape
# print(f"saving param {name} {param_shapes[name]}")
return param_shapes
def _copy_recovery_script(self, save_path):
base_dir = os.path.dirname(os.path.dirname(__file__))
script = "zero_to_fp32.py"
src = os.path.join(base_dir, "utils", script)
dst = os.path.join(save_path, script)
logger.info(f"creating recovery script {dst}")
copyfile(src, dst)
# make executable
os.chmod(dst, os.stat(dst).st_mode | stat.S_IEXEC)
def _save_zero_checkpoint(self, save_path, tag):
zero_checkpoint_name = self._get_zero_ckpt_name(save_path, tag)
zero_sd = dict(
optimizer_state_dict=self.optimizer.state_dict(),
param_shapes=self._get_param_shapes(),
)
zero_sd = {'optimizer_state_dict': self.optimizer.state_dict()}
torch.save(zero_sd, zero_checkpoint_name)
self._copy_recovery_script(save_path)
logger.info('zero checkpoint saved {}'.format(zero_checkpoint_name))
def _zero3_consolidated_fp16_state_dict(self):
"""
Get a full non-partitioned state_dict with fp16 weights on cpu.
This is similar to nn.Module.state_dict (modelled after _save_to_state_dict), but:
1. consolidates the weights from different partitions on gpu0
2. works on one layer at a time to require as little gpu0 memory as possible, by
moving the already consolidated weights to cpu
3. takes care to keep the shared params shared when gradually copying the params to cpu
Returns:
a consolidated fp16 ``state_dict`` on cpu on rank 0, ``None`` on other ranks
"""
import deepspeed
if not self.zero_optimization_partition_weights():
raise ValueError("this function requires ZeRO-3 mode")
state_dict = OrderedDict() if torch.distributed.get_rank() == 0 else None
shared_weights = {}
def get_layer_state_dict(module, prefix=""):
# gather one layer at a time to be memory-efficient
with deepspeed.zero.GatheredParameters(list(
module.parameters(recurse=False))):
if torch.distributed.get_rank() == 0:
for name, param in module.named_parameters(recurse=False):
if param is None:
continue
key = prefix + name
# for shared weights we want to make sure not to unshare them when copying to cpu
data_ptr_id = param.storage().data_ptr()
if data_ptr_id in shared_weights:
# shared weights
# print(f"`{key}` is shared with `{shared_weights[data_ptr_id]}`")
state_dict[key] = state_dict[shared_weights[data_ptr_id]]
else:
state_dict[key] = param.detach().cpu()
shared_weights[data_ptr_id] = key
#print(f"param {name} {param.shape}")
#print(f"param {key} {param.shape} {state_dict[key].storage().data_ptr()}")
# now buffers - not sure if need to take care of potentially shared weights here
for name, buf in module.named_buffers(recurse=False):
if buf is not None and name not in module._non_persistent_buffers_set:
state_dict[prefix + name] = buf.detach().cpu()
for name, child in module.named_children():
if child is not None:
get_layer_state_dict(child, prefix + name + ".")
see_memory_usage("before get_layer_state_dict", force=False)
get_layer_state_dict(self.module, prefix="")
see_memory_usage("after get_layer_state_dict", force=False)
return state_dict
def save_fp16_model(self, save_dir, save_filename="pytorch_model.bin"):
r"""Save fp16 model weights
This method saves the fp16 model weights at the desired destination.
Arguments:
save_dir: Required. Directory for saving the model
save_filename: Optional. Filename to save to. Defaults to ``pytorch_model.bin``
Important: all processes must call this method and not just the process with rank 0. It is
because the processes need to work in sync to gather the weights. This method will hang
waiting to synchronize with other processes if it's called just for the process with rank 0.
"""
path = os.path.join(save_dir, save_filename)
if self.zero_optimization_partition_weights():
if self.zero_gather_fp16_weights_on_model_save():
# consolidation is expensive in time and memory and therefore isn't a default
state_dict = self._zero3_consolidated_fp16_state_dict()
else:
# the model will be bogus if not consolidated so don't confuse the user by saving it
logger.info(
f"Did not save the model {path} because `stage3_gather_fp16_weights_on_model_save` is False"
)
return
else:
state_dict = self.module.state_dict()
if torch.distributed.get_rank() == 0:
os.makedirs(save_dir, exist_ok=True)
logger.info(f"Saving model weights to {path}")
torch.save(state_dict, path)
'''
Copyright 2020 The Microsoft DeepSpeed Team
'''
import types
import torch
import importlib
import numpy as np
import time
import cupy
from torch.utils.dlpack import to_dlpack
from torch.utils.dlpack import from_dlpack
from deepspeed.utils.logging import logger
from mpi4py import MPI
from deepspeed.runtime.custom_collectives import gather_cuda, gather_host, allgather_cuda, allgather_host
class OnebitAdam(torch.optim.Optimizer):
"""Implements the 1-bit Adam algorithm. Currently GPU-only.
For usage example please see, TODO DeepSpeed Tutorial
It has been proposed in APMSqueeze (https://arxiv.org/abs/2008.11343)
Arguments:
params (iterable): iterable of parameters to optimize or dicts defining
parameter groups.
lr (float, optional): learning rate. (default: 1e-3)
freeze_step (int, optional): Number of steps for warmup (uncompressed)
stage before we start using compressed communication. (default 100000)
betas (Tuple[float, float], optional): coefficients used for computing
running averages of gradient and its square. (default: (0.9, 0.999))
eps (float, optional): term added to the denominator to improve
numerical stability. (default: 1e-8)
weight_decay (float, optional): weight decay (L2 penalty) (default: 0)
max_coeff(float, optional): maximum value of the lamb coefficient (default: 10.0)
min_coeff(float, optional): minimum value of the lamb coefficient (default: 0.01)
amsgrad (boolean, optional): whether to use the AMSGrad variant of this
algorithm from the paper `On the Convergence of Adam and Beyond`_
(default: False) NOT SUPPORTED in 1-bit Adam!
eps_inside_sqrt (boolean, optional): in the 'update parameters' step,
adds eps to the bias-corrected second moment estimate before
evaluating square root instead of adding it to the square root of
second moment estimate as in the original paper. (default: False)
cuda_aware (boolean, required): Set True if the underlying MPI implementation
supports CUDA-Aware communication. (default: False)
.. _Adam\: A Method for Stochastic Optimization:
https://arxiv.org/abs/1412.6980
.. _On the Convergence of Adam and Beyond:
https://openreview.net/forum?id=ryQu7f-RZ
"""
def __init__(self,
params,
deepspeed=None,
lr=1e-3,
freeze_step=100000,
bias_correction=True,
betas=(0.9,
0.999),
eps=1e-8,
eps_inside_sqrt=False,
weight_decay=0.,
max_grad_norm=0.,
amsgrad=False,
cuda_aware=False):
if amsgrad:
raise RuntimeError('1-bit Adam does not support the AMSGrad variant.')
defaults = dict(lr=lr,
bias_correction=bias_correction,
betas=betas,
eps=eps,
weight_decay=weight_decay,
max_grad_norm=max_grad_norm)
super(OnebitAdam, self).__init__(params, defaults)
from mpi4py import MPI
self.eps_mode = 0 if eps_inside_sqrt else 1
self.comm = MPI.COMM_WORLD
self.rank = self.comm.Get_rank()
self.size = self.comm.Get_size()
self.comm_time = 0.0
self.step_time = 0.0
self.ave_step = 1
self.bk_time = 0.0
self.divider = int(self.size * 8 / np.gcd(self.size, 8))
self.deepspeed = deepspeed
self.adam_freeze_key = False
self.initialize = False
self.freeze_step = freeze_step
self.cuda_aware = cuda_aware
def torch2cupy(self, tensor):
return cupy.fromDlpack(to_dlpack(tensor))
def cupy2torch(self, cupy_tensor):
return from_dlpack(cupy_tensor.toDlpack())
def compress_by_chunk(self, cupy_bool_tensor, num_chunks):
packed_sign = cupy.packbits(cupy_bool_tensor)
sign_list_packed = cupy.split(packed_sign, num_chunks)
cupy.cuda.get_current_stream().synchronize()
return sign_list_packed
def Compressed_Allreduce(self,
buffer_m: torch.tensor,
worker_error,
server_error,
rank,
world_size,
comm,
local_rank):
all_start_time = time.time()
original_size = buffer_m.numel()
cupy.cuda.Device(local_rank).use()
if torch.numel(buffer_m) != torch.numel(worker_error):
empty_tensor = torch.zeros(torch.numel(worker_error) - torch.numel(buffer_m),
device=buffer_m.device)
buffer_m = torch.cat([buffer_m, empty_tensor])
buffer_m.add_(worker_error)
worker_scale = torch.norm(buffer_m) / np.sqrt(torch.numel(buffer_m))
sign_buffer_m = buffer_m.sign().add_(1).bool()
sign_buffer_m = sign_buffer_m.float()
sign_buffer_m.add_(-0.5).mul_(2.0)
worker_error.set_((buffer_m - worker_scale * sign_buffer_m))
sign_buffer_m = None
compensated_buffer_m = buffer_m
compensated_buffer_m.sign_()
compensated_buffer_m = compensated_buffer_m.add_(1).bool()
cupy_worker_scale = self.torch2cupy(worker_scale)
cupy_compensated_buffer_m = self.torch2cupy(compensated_buffer_m)
compensated_buffer_m = None
cupy_sign_list_packed = self.compress_by_chunk(cupy_compensated_buffer_m,
world_size)
cupy_compensated_buffer_m = None
cupy_recvbuf_sign = cupy.zeros([world_size,
cupy_sign_list_packed[rank].size],
dtype=cupy_sign_list_packed[0].dtype)
cupy_recvbuf_scale = cupy.zeros([world_size, 1], dtype=cupy_worker_scale.dtype)
# Communication Phase 1
gather_start = time.time()
if self.cuda_aware:
gather_cuda(rank,
world_size,
comm,
cupy_sign_list_packed,
cupy_recvbuf_sign,
cupy_worker_scale,
cupy_recvbuf_scale)
else:
cupy_sign_list_packed, cupy_recvbuf_sign, cupy_worker_scale, cupy_recvbuf_scale = gather_host(rank,
world_size,
comm,
cupy_sign_list_packed,
cupy_recvbuf_sign,
cupy_worker_scale,
cupy_recvbuf_scale)
gather_end = time.time()
cupy_unpacked_sign = (cupy.unpackbits(cupy_recvbuf_sign.flatten())).reshape(
world_size,
-1)
cupy_recvbuf_sign = None
unpacked_sign = self.cupy2torch(cupy_unpacked_sign).float()
cupy_unpacked_sign = None
unpacked_sign = unpacked_sign.add_(-0.5).mul_(2.0)
worker_scale = self.cupy2torch(cupy_recvbuf_scale).mul_(1 / world_size)
compensated_server_m = unpacked_sign.mul_(worker_scale).sum(0)
unpacked_sign = None
compensated_server_m.add_(server_error)
server_scale = torch.norm(compensated_server_m) / np.sqrt(
compensated_server_m.numel())
sign_server_m = compensated_server_m.sign().add_(1).bool()
sign_server_m = sign_server_m.float()
sign_server_m.add_(-0.5).mul_(2.0)
server_error.set_(compensated_server_m - server_scale * sign_server_m)
sign_server_m = None
compensated_server_m.sign_()
compensated_server_m = compensated_server_m.add_(1).bool()
cupy_server_scale = self.torch2cupy(server_scale)
cupy_compensated_server_m = self.torch2cupy(compensated_server_m)
compensated_server_m = None
cupy_server_sign_packed = self.compress_by_chunk(cupy_compensated_server_m, 1)
cupy_recvbuf_sign_server = cupy.zeros(
[world_size,
cupy_server_sign_packed[0].size],
dtype=cupy_sign_list_packed[0].dtype)
cupy_recvbuf_scale_server = cupy.zeros([world_size,
1],
dtype=cupy_worker_scale.dtype)
# Communication Phase 2
if self.cuda_aware:
allgather_cuda(comm,
cupy_server_sign_packed[0],
cupy_recvbuf_sign_server,
cupy_server_scale,
cupy_recvbuf_scale_server)
else:
cupy_server_sign_packed[0], cupy_recvbuf_sign_server, cupy_server_scale, cupy_recvbuf_scale_server = allgather_host(comm,
cupy_server_sign_packed[0],
cupy_recvbuf_sign_server,
cupy_server_scale,
cupy_recvbuf_scale_server)
cupy_server_unpacked_sign = (cupy.unpackbits(
cupy_recvbuf_sign_server.flatten())).reshape(world_size,
-1)
cupy_recvbuf_sign_server = None
server_unpacked_sign = self.cupy2torch(cupy_server_unpacked_sign)
cupy_server_unpacked_sign = None
server_unpacked_sign = server_unpacked_sign.float().add_(-0.5).mul_(2.0)
server_scale = self.cupy2torch(cupy_recvbuf_scale_server)
buffer_m = server_unpacked_sign.mul_(server_scale).flatten()[0:original_size]
return buffer_m
def step(self, closure=None, grads=None):
"""Performs a single optimization step.
Arguments:
closure (callable, optional): A closure that reevaluates the model
and returns the loss.
grads (list of tensors, optional): weight gradient to use for the
optimizer update. If gradients have type torch.half, parameters
are expected to be in type torch.float. (default: None)
output params (list of tensors, optional): A reduced precision copy
of the updated weights written out in addition to the regular
updated weights. Must be of the same type as gradients. (default: None)
scale (float, optional): factor to divide gradient tensor values
by before applying to weights. (default: 1)
"""
loss = None
if closure is not None:
loss = closure()
gather_time = 0
allgather_time = 0
all_time = 0
if self.adam_freeze_key is False:
v_diff_buffer = 0.0
if grads is None:
grads_group = [None] * len(self.param_groups)
# backward compatibility
# assuming a list/generator of parameter means single group
elif isinstance(grads, types.GeneratorType):
grads_group = [grads]
elif type(grads[0]) != list:
grads_group = [grads]
else:
grads_group = grads
for group, grads_this_group in zip(self.param_groups, grads_group):
if grads_this_group is None:
grads_this_group = [None] * len(group['params'])
bias_correction = 1 if group['bias_correction'] else 0
for p, grad in zip(group['params'], grads_this_group):
if p.grad is None and grad is None:
continue
if grad is None:
grad = p.grad.data
if grad.is_sparse:
raise RuntimeError(
'FusedAdam does not support sparse gradients, please consider SparseAdam instead'
)
state = self.state[p]
# State initialization
if len(state) == 0:
state['step'] = 0
# Exponential moving average of gradient values
state['exp_avg'] = torch.zeros_like(p.data)
# Exponential moving average of squared gradient values
state['exp_avg_sq'] = torch.zeros_like(p.data)
state['tensor_size'] = torch.numel(p.data)
state['corrected_tensor_size'] = state['tensor_size']
if state['tensor_size'] % (self.size * self.divider) != 0:
state['corrected_tensor_size'] += ((self.size * self.divider) -
(state['tensor_size'] %
(self.size * self.divider)))
state['server_chunk_size'] = state[
'corrected_tensor_size'] // self.size
if not self.initialize or (self.adam_freeze_key
and 'worker_error' not in state.keys()):
torch.cuda.empty_cache()
state['worker_error'] = torch.zeros(state['corrected_tensor_size'],
device=p.device)
state['server_error'] = torch.zeros(state['server_chunk_size'],
device=p.device)
torch.cuda.empty_cache()
self.adam_freeze_key = True
if not self.initialize and torch.distributed.get_rank() == 0:
print("Cupy Buffers Initialized Successfully.")
exp_avg, exp_avg_sq = state['exp_avg'], state['exp_avg_sq']
beta1, beta2 = group['betas']
state['step'] += 1
if self.adam_freeze_key is False:
exp_avg.mul_(beta1).add_(1 - beta1, grad)
exp_avg_sq.mul_(beta2).addcmul_(1 - beta2, grad, grad)
grad = None
if self.initialize:
update = exp_avg / (exp_avg_sq.sqrt() + group['eps'])
else:
if 'non_freeze' in group.keys() and group['non_freeze'] is True:
dist.all_reduce(grad)
grad.mul_(1 / dist.get_world_size())
exp_avg.mul_(beta1).add_(1 - beta1, grad)
exp_avg_sq.mul_(beta2).addcmul_(1 - beta2, grad, grad)
grad = None
else:
if self.initialize is True:
exp_avg.mul_(beta1).add_(1 - beta1, grad)
grad = None
if self.size > 1:
exp_avg.set_(
self.Compressed_Allreduce(exp_avg,
state['worker_error'],
state['server_error'],
self.rank,
self.size,
self.comm,
self.deepspeed.local_rank))
if self.initialize:
update = exp_avg / (exp_avg_sq.sqrt() + group['eps'])
if self.initialize:
if group['weight_decay'] > 0.0:
update += group['weight_decay'] * p.data
with torch.no_grad():
p.add_(-group['lr'] * update)
if not self.initialize:
print('Pop out errors', flush=True)
state.pop('worker_error')
state.pop('server_error')
if not self.initialize:
self.adam_freeze_key = False
self.initialize = True
print(
f"Finished the initialization step at rank {torch.distributed.get_rank()}"
)
return loss
if self.adam_freeze_key is False:
if state['step'] >= self.freeze_step:
self.adam_freeze_key = True
self.deepspeed.enable_backward_allreduce = False
return loss
......@@ -8,7 +8,6 @@ Helper functions and classes from multiple sources.
import os
import psutil
import gc
from math import ceil
from math import floor
from bisect import bisect_left, bisect_right
......@@ -552,9 +551,6 @@ def see_memory_usage(message, force=False):
if torch.distributed.is_initialized() and not torch.distributed.get_rank() == 0:
return
# python doesn't do real-time garbage collection so do it explicitly to get the correct RAM reports
gc.collect()
# Print message except when distributed but not rank 0
logger.info(message)
logger.info(
......@@ -568,10 +564,6 @@ def see_memory_usage(message, force=False):
logger.info(
f'CPU Virtual Memory: used = {used_GB} GB, percent = {vm_stats.percent}%')
# get the peak memory to report correct data, so reset the counter for the next call
if hasattr(torch.cuda, "reset_peak_memory_stats"): # pytorch 1.4+
torch.cuda.reset_peak_memory_stats()
def call_to_str(base, *args, **kwargs):
"""Construct a string representation of a call.
......
......@@ -34,7 +34,6 @@ class DeepSpeedZeroConfig(DeepSpeedConfigObject):
self.param_persistence_threshold = None
self.max_live_parameters = None
self.max_reuse_distance = None
self.gather_fp16_weights_on_model_save = None
#Stage3 Specific Parameters
self.prefetch_bucket_size = None
......@@ -151,8 +150,3 @@ class DeepSpeedZeroConfig(DeepSpeedConfigObject):
zero_config_dict,
ZERO_OPTIMIZATION_PARAM_PERSISTENCE_THRESHOLD,
ZERO_OPTIMIZATION_PARAM_PERSISTENCE_THRESHOLD_DEFAULT)
self.gather_fp16_weights_on_model_save = get_scalar_param(
zero_config_dict,
ZERO_OPTIMIZATION_GATHER_FP16_WEIGHTS_ON_MODEL_SAVE,
ZERO_OPTIMIZATION_GATHER_FP16_WEIGHTS_ON_MODEL_SAVE_DEFAULT)
......@@ -99,10 +99,6 @@ ZERO_OPTIMIZATION_PREFETCH_BUCKET_SIZE_DEFAULT = 50000000
ZERO_OPTIMIZATION_PARAM_PERSISTENCE_THRESHOLD = 'stage3_param_persistence_threshold'
ZERO_OPTIMIZATION_PARAM_PERSISTENCE_THRESHOLD_DEFAULT = 100000
# gathers params for saving a model - inefficient but is required in certain situations
ZERO_OPTIMIZATION_GATHER_FP16_WEIGHTS_ON_MODEL_SAVE = 'stage3_gather_fp16_weights_on_model_save'
ZERO_OPTIMIZATION_GATHER_FP16_WEIGHTS_ON_MODEL_SAVE_DEFAULT = False
ZERO_OPTIMIZATION_DEFAULT = {
ZERO_OPTIMIZATION_STAGE:
ZERO_OPTIMIZATION_STAGE_DEFAULT,
......@@ -137,7 +133,5 @@ ZERO_OPTIMIZATION_DEFAULT = {
ZERO_OPTIMIZATION_PREFETCH_BUCKET_SIZE:
ZERO_OPTIMIZATION_PREFETCH_BUCKET_SIZE_DEFAULT,
ZERO_OPTIMIZATION_PARAM_PERSISTENCE_THRESHOLD:
ZERO_OPTIMIZATION_PARAM_PERSISTENCE_THRESHOLD_DEFAULT,
ZERO_OPTIMIZATION_GATHER_FP16_WEIGHTS_ON_MODEL_SAVE:
ZERO_OPTIMIZATION_GATHER_FP16_WEIGHTS_ON_MODEL_SAVE_DEFAULT
ZERO_OPTIMIZATION_PARAM_PERSISTENCE_THRESHOLD_DEFAULT
}
......@@ -279,9 +279,6 @@ class Init(InsertPostInitMethodToModuleSubClasses):
For example, if a node has 1TB of memory and 8 GPUs, we could fit a trillion
parameter model with 4 nodes and 32 GPUs.
Important: If the fp16 weights of the model can't fit into the memory of a
single GPU, this feature must be used.
.. note::
Initializes ``torch.distributed`` if it has not already been done so.
See :meth:`deepseed.init_distributed` for more information.
......
......@@ -883,12 +883,8 @@ class FP16_DeepSpeedZeroOptimizer(object):
for p in params:
if is_model_parallel_parameter(p) or (self.model_parallel_rank == 0):
param_id = self.get_param_id(p)
# as some model have trainable parameters but skipped in training,
# their backward hooks in self.create_reduce_and_remove_grad_hooks() will not run,
# so they have no norm_for_param_grads
if param_id in self.norm_for_param_grads:
param_norm = self.norm_for_param_grads[param_id]
total_norm += param_norm.item()**2
param_norm = self.norm_for_param_grads[param_id]
total_norm += param_norm.item()**2
# Sum across all model parallel GPUs.
total_norm_cuda = torch.cuda.FloatTensor([float(total_norm)])
......
......@@ -68,7 +68,7 @@ GEM
jekyll-theme-time-machine (= 0.1.1)
jekyll-titles-from-headings (= 0.5.3)
jemoji (= 0.12.0)
kramdown (= 2.3.1)
kramdown (= 2.3.0)
kramdown-parser-gfm (= 1.1.0)
liquid (= 4.0.3)
mercenary (~> 0.3)
......@@ -196,7 +196,7 @@ GEM
gemoji (~> 3.0)
html-pipeline (~> 2.2)
jekyll (>= 3.0, < 5.0)
kramdown (2.3.1)
kramdown (2.3.0)
rexml
kramdown-parser-gfm (1.1.0)
kramdown (~> 2.0)
......
......@@ -60,7 +60,7 @@ The Adam optimizer also supports the following two params keys/values in additio
| torch\_adam | Use torch's implementation of adam instead of our fused adam implementation | false |
| adam\_w\_mode | Apply L2 regularization (also known as AdamW) | true |
Another example of ***optimizer*** with 1-bit Adam
Another example of ***optimizer*** with 1-bit Adam specific parameters is as follows.
```json
"optimizer": {
......@@ -74,20 +74,11 @@ The Adam optimizer also supports the following two params keys/values in additio
"eps": 1e-8,
"weight_decay": 3e-7,
"freeze_step": 400,
"cuda_aware": false,
"comm_backend_name": "nccl"
"cuda_aware": true
}
}
```
The 1-bit Adam optimizer supports the following three params keys/values in addition to the standard Adam (learn more in our [tutorial](/tutorials/onebit-adam/)):
| "params" key | Description | Default |
| ------------- | --------------------------------------------------------------------------- | ------- |
| freeze\_step | Number of warm up steps before 1-bit compression gets applied to the communication | 100000 |
| cuda\_aware | To indicate that the underlying MPI library supports CUDA-Aware communication | false |
| comm\_backend\_name | To indicate which backend implementation to use | "nccl" |
### Scheduler Parameters
***scheduler***: [dictionary]
......
......@@ -10,8 +10,6 @@ benefit all your large model training jobs.
If you don't already have an Azure account please see more details here: [https://azure.microsoft.com/](https://azure.microsoft.com/).
To use DeepSpeed on [Azure ML](https://azure.microsoft.com/en-us/services/machine-learning/), please take a look at easy-to-use examples for Transformers and CIFAR training from [AzureML Examples GitHub](https://github.com/Azure/azureml-examples/tree/main/workflows/train/deepspeed).
To help with launching Azure instances we suggest using the [Azure
CLI](https://docs.microsoft.com/en-us/cli/azure/?view=azure-cli-latest). We have created
several helper scripts to get you quickly started using DeepSpeed with Azure.
......
......@@ -9,7 +9,6 @@ date: 2020-05-15
* Installing is as simple as `pip install deepspeed`, [see more details](/tutorials/advanced-install/).
* Please see our [Azure tutorial](/tutorials/azure/) to get started with DeepSpeed on Azure!
* To get started with DeepSpeed on AzureML, please see the [AzureML Examples GitHub](https://github.com/Azure/azureml-examples/tree/main/workflows/train/deepspeed)
* If you're not on Azure, we recommend using our docker image via `docker pull deepspeed/deepspeed:latest` which contains a pre-installed version of DeepSpeed and all the necessary dependencies.
## Writing DeepSpeed Models
......@@ -187,8 +186,8 @@ slots available.
The following command launches a PyTorch training job across all available nodes and GPUs
specified in `myhostfile`:
```bash
deepspeed --hostfile=myhostfile <client_entry.py> <client args> \
--deepspeed --deepspeed_config ds_config.json
deepspeed <client_entry.py> <client args> \
--deepspeed --deepspeed_config ds_config.json --hostfile=myhostfile
```
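For reference, a hostfile simply lists one host per line together with the number of GPU slots it exposes; a minimal `myhostfile` might look like the following (hostnames here are placeholders):
```
worker-1 slots=4
worker-2 slots=4
```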
Alternatively, DeepSpeed allows you to restrict distributed training of your model to a
......@@ -265,10 +264,3 @@ not detected or passed in then DeepSpeed will query the number of GPUs on the
local machine to discover the number of slots available. The `--include` and
`--exclude` arguments work as normal, but the user should specify 'localhost'
as the hostname.
Also note that `CUDA_VISIBLE_DEVICES` can't be used with DeepSpeed to control
which devices should be used. For example, to use only gpu1 of the current
node, do:
```bash
deepspeed --include localhost:1 ...
```
---
title: "1-bit Adam: Up to 5x less communication volume and up to 3.4x faster training"
title: "1-bit Adam: Up to 5x less communication volume and up to 2x faster training"
---
**Note:**
This tutorial was updated on 03/04/2021 to reflect 1-bit Adam v2. Changes include: 1) an NCCL-based implementation which provides better performance and usability compared to the MPI-based implementation; 2) support for momentum masks for parameters with constant zero gradients during training; 3) bug fixes. See details below.
{: .notice--info}
**Watch out!**
1) The NCCL-based implementation requires PyTorch >= 1.8 (and NCCL >= 2.8.3 when you have 64 or more GPUs). See details below. 2) Although 1-bit Adam is compatible with both FP16 and FP32, we have currently only verified convergence under mixed precision/FP16 training. 3) Currently 1-bit Adam is not compatible with pipeline parallelism. 4) Frequent checkpoint loading could hurt 1-bit Adam's convergence. See details below.
{: .notice--warning}
In this tutorial, we introduce the 1-bit Adam optimizer in DeepSpeed. 1-bit Adam can improve model training speed on communication-constrained clusters, especially for communication-intensive large models, by reducing the overall communication volume by up to 5x. A detailed description of the 1-bit Adam algorithm, its implementation in DeepSpeed, and a performance evaluation are available in our [blog post](https://www.deepspeed.ai/news/2020/09/08/onebit-adam-blog-post.html). We also have a [paper](https://arxiv.org/abs/2102.02888) which provides the most complete details, including the algorithm, system implementation, theoretical analysis, and more evaluations.
To illustrate the benefits and usage of 1-bit Adam optimizer in DeepSpeed, we use the following two training tasks as examples:
......@@ -21,7 +13,7 @@ For more details on these tasks, please refer to the tutorial posts on [BingBert
## 1. Overview
### 1.1 Pre-requisites for installing DeepSpeed
### Pre-requisites for installing DeepSpeed
If you don't already have a copy of the DeepSpeed repository, please clone it
now and check out the DeepSpeedExamples submodule that contains the BingBertSQuAD and BERT Pre-training examples.
......@@ -33,19 +25,9 @@ git submodule update --init --recursive
cd DeepSpeedExamples/
```
### 1.2 Pre-requisites for 1-bit Adam
#### 1.2.1 (New in v2) NCCL-based implementation
In 1-bit Adam v2, we introduce a new system implementation for compressed communication using the NCCL backend of PyTorch distributed. This significantly improves usability thanks to NCCL’s integration with PyTorch distributed. The performance of our new NCCL-based implementation is also better than our earlier MPI-based implementation for Ethernet-based systems and on par for InfiniBand-based systems. Thus we highly recommend this implementation.
**Watch out!**
This NCCL-based implementation requires PyTorch >= 1.8. It also requires NCCL >= 2.8.3 when you have 64 or more GPUs, to avoid certain NCCL runtime bugs. Currently (2021/03/16) NCCL 2.8.3 is not officially supported by PyTorch. The solution we used is to hack in NCCL 2.8.3 via `LD_PRELOAD`: 1) Install NCCL 2.8.3. This works for us on a CUDA 11 system: `apt-get install -y libnccl2=2.8.3-1+cuda11.0 libnccl-dev=2.8.3-1+cuda11.0`. 2) Set `LD_PRELOAD` to the library path. This works for us: `LD_PRELOAD=/usr/lib/x86_64-linux-gnu/libnccl.so.2.8.3`. To confirm `LD_PRELOAD` is working, check the version reported in the NCCL logs when `NCCL_DEBUG=INFO` is set; it should say: NCCL version 2.8.3+cuda11.0.
{: .notice--warning}
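Putting the steps from this note together, the workaround might look as follows (the package versions and library path are the ones quoted above and will differ between systems):
```shell
# 1) install NCCL 2.8.3 (CUDA 11 example from the note above)
apt-get install -y libnccl2=2.8.3-1+cuda11.0 libnccl-dev=2.8.3-1+cuda11.0

# 2) preload the newer library and let NCCL print its version for confirmation
LD_PRELOAD=/usr/lib/x86_64-linux-gnu/libnccl.so.2.8.3 NCCL_DEBUG=INFO \
    deepspeed script.py
# the NCCL logs should report: NCCL version 2.8.3+cuda11.0
```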
### Pre-requisites for 1-bit Adam
#### 1.2.2 MPI-based implementation
For this implementation, we rely on Message Passing Interface (MPI) for advanced communication primitives.
1-bit Adam uses advanced communication schemes that are not yet supported by PyTorch distributed and NCCL. We rely on Message Passing Interface (MPI) for these advanced communication primitives.
We package the necessary dependencies in the DeepSpeed docker images. However, if you are using a different build system, please install MPI and mpi4py on your system. To install the prerequisites run:
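As an illustrative sketch only (the exact packages depend on your platform; prefer your cluster's own MPI, such as MVAPICH2-GDR, where available), on a Debian-based system this could be:
```shell
# install an MPI implementation plus headers, then the Python bindings
sudo apt-get install -y openmpi-bin libopenmpi-dev
pip install mpi4py
```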
......@@ -61,32 +43,31 @@ An example launch command for 1-bit Adam using the `deepspeed` launcher is as fo
deepspeed --launcher=[mvapich|openmpi] script.py
```
Please note that for MPI-based implementation of 1-bit Adam, the `--launcher=[mvapich|openmpi]` flag is required when using the `deepspeed` launcher.
Please note that because 1-bit Adam uses the MPI backend to communicate during the compression stage, the `--launcher=[mvapich|openmpi]` flag is required when using the `deepspeed` launcher.
Alternatively, the standard mpirun launcher can also be used as follows:
```shell
mpirun -np [#processes] -ppn [#GPUs on each node] -hostfile [hostfile] [MPI flags] python [training_script.py]
mpirun -np [#processes] -ppn [#GPUs on each node] -hostfile [hostfile] [MPI flags] bash [training_script.sh]
```
### 1.3 1-bit Algorithm
### 1-bit Algorithm
A detailed description of the 1-bit algorithm can be found in our [blog post](https://www.deepspeed.ai/news/2020/09/09/onebit-adam-blog-post.html) and our [paper](https://arxiv.org/abs/2102.02888).
A detailed description of the 1-bit algorithm can be found in our [blog post](https://www.deepspeed.ai/news/2020/09/09/onebit-adam-blog-post.html).
### 1.4 Configuration of 1-bit Adam
### Configuration of 1-bit Adam
The 1-bit Adam feature can be used by setting the optimizer configuration options as follows. An example json config file is shown below.
```json
{
"train_batch_size": 4096,
"train_micro_batch_size_per_gpu": 16,
"train_micro_batch_size_per_gpu": 64,
"optimizer": {
"type": "OneBitAdam",
"params": {
"lr": 4e-4,
"freeze_step": 23000,
"cuda_aware": false,
"comm_backend_name": "nccl"
"lr": 2e-4,
"freeze_step": 400,
"cuda_aware": true
}
},
"fp16": {
......@@ -94,20 +75,12 @@ The 1-bit Adam feature can be used by setting the optimizer configuration option
}
}
```
Please note three new parameters `freeze_step`, `cuda_aware`, and `comm_backend_name` that have been added to support the 1-bit Adam feature.
`freeze_step` is the number of warm-up steps before 1-bit compression gets applied to the communication. In order to determine the number of warm-up steps, one strategy is to set it to 15-25% of the total training steps for a given model (this is related to Adam's variance/second-moment term; see the detailed analysis in our [paper](https://arxiv.org/abs/2102.02888)). If this provides the desired outcome, one can try to extract more performance by reducing the steps systematically. In the future, we plan to introduce a threshold that can automatically search for and decide the number of warm-up steps for different models. The examples below have been tuned for the number of warm-up steps. The `freeze_step` parameter has already been set to the best number we found in the corresponding run scripts.
Please note two new parameters `freeze_step` and `cuda_aware` that have been added to support the 1-bit Adam feature.
`cuda_aware` is used for the MPI-based implementation to indicate that the underlying MPI library supports CUDA-Aware communication. This feature is only supported on systems with InfiniBand interconnect and a CUDA-Aware MPI library like [MVAPICH2-GDR](http://mvapich.cse.ohio-state.edu/userguide/gdr/) or OpenMPI built with CUDA-Aware support. Setting `cuda_aware` to False will allow training on Ethernet-based systems. However, the communication will happen using sender- as well as receiver-side memory copies between CPU and GPU buffers before and after communication.
`cuda_aware` is used to indicate that the underlying MPI library supports CUDA-Aware communication.
This feature is only supported on systems with InfiniBand interconnect and a CUDA-Aware MPI library like [MVAPICH2-GDR](http://mvapich.cse.ohio-state.edu/userguide/gdr/) or OpenMPI built with CUDA-Aware support. Setting `cuda_aware` to False will allow training on Ethernet based systems. However, the communication will happen using sender as well as receiver side memory copies between CPU and GPU buffers before and after communication.
(New in v2) `comm_backend_name` is used to indicate which backend implementation to use. You can choose between NCCL and MPI-based implementations by setting `comm_backend_name` to "nccl" and "mpi". When using NCCL-based implementation, there is no need to set `cuda_aware`.
#### 1.4.1 (New in v2) Momentum masks for parameters with constant zero gradients
Because 1-bit compression cannot represent exact zero, the compression error would keep accumulating in the momentum if a parameter has constant zero gradients during training. For example, for BERT pre-training with sequence length 128, `bert.embeddings.position_embeddings.weight` has constant zeros in its gradient and momentum for rows 129 to 512, because it only learns up to sequence length 128 while the model supports up to sequence length 512. Thus in 1-bit Adam v2 we added support for a momentum mask so users can specify those params that have constant exact zeros in their gradients. See the [example script](https://github.com/microsoft/DeepSpeedExamples/blob/master/bing_bert/deepspeed_train.py) for how to configure this momentum mask. One thing to note is that we don't use the momentum mask saved in checkpoints, since this mask could change during training (e.g., BERT seqlen 128 and 512 require different masks). So you have to provide this mask every time in your training script.
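The idea can be sketched as follows. This is a hypothetical illustration: the mask construction and the `exp_avg_mask` parameter-group key are assumptions here, and the linked example script is the authoritative reference for the exact wiring.
```python
import torch

def position_embedding_momentum_mask(pos_emb_weight, trained_seq_len=128):
    """Keep momentum only for the rows that actually receive gradients."""
    mask = torch.zeros_like(pos_emb_weight.data)
    mask[:trained_seq_len, :] = 1.0   # rows >= trained_seq_len stay exactly zero
    return mask

# Hypothetical usage: attach the mask to the parameter group that holds
# bert.embeddings.position_embeddings.weight before constructing the optimizer.
# param_group['exp_avg_mask'] = position_embedding_momentum_mask(pos_emb.weight)
```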
**Watch out!**
1-bit Adam relies on a compression error compensation mechanism to maintain the convergence speed during the compression stage. When loading checkpoints, we actually reset the compression errors for 3 reasons: 1) The worker and server errors at each GPU are distinct, so in the current implementation only rank 0's errors are saved in the checkpoint, and thus we have to reset the errors. If we wanted to save them correctly, we would need O(num_gpu*model_size) memory to gather all the errors, which is a very large memory requirement. It's possible to save them in a distributed way, but it would make checkpoint saving/loading much more complicated. 2) Even if we were able to save the compression errors correctly, you would need exactly the same number of GPUs in order to load them correctly. 3) We verified on BERT pre-training that occasionally resetting the compression error at checkpoint loading does not affect convergence. However, please avoid frequent checkpoint loading, which could break the error compensation mechanism and thus affect convergence.
{: .notice--warning}
`freeze_step` is the number of warm-up steps before 1-bit compression gets applied to the communication. In order to determine the number of warm-up steps, one strategy is to set it to 15-25% of the total training steps for a given model. If this provides the desired outcome, one can try to extract more performance by reducing the steps systematically. In the future, we plan to introduce a threshold that can automatically search for and decide the number of warm-up steps for different models. The examples below have been tuned for the number of warm-up steps. The `freeze_step` parameter has already been set to the best number we found in the corresponding run scripts.
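As a rough worked example of the 15-25% rule (the numbers are illustrative; the tuned values shipped in the run scripts take precedence):
```python
# a pre-training job planned for 115,000 total optimizer steps
total_training_steps = 115_000
freeze_step = int(0.20 * total_training_steps)  # 20% warm-up
print(freeze_step)  # 23000 -- the value used for BERT-large later in this tutorial
```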
## 2. BingBertSQuAD Fine-tuning with 1-bit Adam
......@@ -120,13 +93,9 @@ Because 1-bit compression cannot represent exact zero, the compression error wou
You can also use a pre-trained BERT model checkpoint from either DeepSpeed, [HuggingFace](https://github.com/huggingface/transformers), or [TensorFlow](https://github.com/google-research/bert#pre-trained-models) to run the fine-tuning.
**Note:** For details about loading checkpoint, argument parsing, initialization, forward pass, backward pass, weight update and evaluation, please refer to the [BingBertSQuAD Fine-tuning](/tutorials/bert-finetuning/) tutorial.
### 2.1 Running BingBertSQuAD with DeepSpeed and 1-bit Adam
We provide example scripts under [DeepSpeedExamples/BingBertSquad/1-bit_adam/](https://github.com/microsoft/DeepSpeedExamples/tree/master/BingBertSquad/1-bit_adam). There are 3 sets of scripts, corresponding to the NCCL-based implementation, the MPI-based implementation on Ethernet systems, and the MPI-based implementation on InfiniBand systems. For the MPI-based implementation, we provide example scripts for launching with either deepspeed or mpirun.
<!-- The main part of training is done in `nvidia_run_squad_deepspeed.py`, which has
The main part of training is done in `nvidia_run_squad_deepspeed.py`, which has
already been modified to use DeepSpeed. The `run_squad_deepspeed.sh` script
helps to invoke training and setup several different hyperparameters relevant
to the training process.
......@@ -163,7 +132,7 @@ For example, in order to use 32 GPUs (4GPUs/node, 8 nodes in total), with the su
```shell
mpirun -np 32 -ppn 4 -hostfile hosts -env MV2_USE_CUDA=1 -env MV2_SUPPORT_DL=1 -env MV2_ENABLE_AFFINITY=0 -env MV2_SMP_USE_CMA=0 bash run_squad_mpi_onebitadam.sh
``` -->
```
### 2.2 Configuration for BingBertSQuAD with DeepSpeed and 1-bit Adam enabled
......@@ -179,16 +148,18 @@ Table 1 shows the fine-tuning configuration we used in our experiments.
| ------------------------------ | ---------------------|
| Total batch size | 96 |
| Train micro batch size per GPU | 3 |
| Optimizer | **"OnebitAdam"** |
| Optimizer | **OnebitAdam** |
| Learning rate | 3e-5 |
| Sequence-length | 384 |
| Weight-decay | 0.0 |
| Epoch count | 2 |
| **freeze_step** | 400 |
| **comm_backend_name** | "nccl" |
| **cuda_aware** | True |
Table 1. Fine-tuning configuration
**Note:** For more details about loading checkpoint, argument parsing, initialization, forward pass, backward pass, weight update and evaluation, please refer to the [BingBertSQuAD Fine-tuning](/tutorials/bert-finetuning/) tutorial.
### 2.3 Performance Results for BingBertSQuAD Fine-tuning
***Accuracy:***
......@@ -203,24 +174,19 @@ We fixed the learning rate to 3e-5. The table below shows the F1 and the EM scor
***Training Speed and Scalability:***
<!-- 1-bit Adam enables up to 2.7x overall speedup in training speed for SQuAD fine-tuning. This is made possible by up to 6.2x faster throughput during the compressed stage of the algorithm as shown in Figure 1.
1-bit Adam enables up to 2.7x overall speedup in training speed for SQuAD fine-tuning. This is made possible by up to 6.2x faster throughput during the compressed stage of the algorithm as shown in Figure 1.
![SQuAD Finetuning](/assets/images/squad-scaling.png){: .align-center}
Figure 1: Scalability of 1-bit Adam for SQuAD Finetuning on V100 GPUs with batch size of 3/GPU. -->
Performance results for SQuAD fine-tuning can be found in our [blog post](https://www.deepspeed.ai/news/2020/09/09/onebit-adam-blog-post.html) and our [paper](https://arxiv.org/abs/2102.02888).
Figure 1: Scalability of 1-bit Adam for SQuAD Finetuning on V100 GPUs with batch size of 3/GPU.
## 3. BERT Pre-training with 1-bit Adam
For data downloading and pre-processing, please refer to the [BERT Pre-training](/tutorials/bert-pretraining/) tutorial.
For data downloading and pre-processing, please refer to the [BERT Pre-training](/tutorials/bert-pretraining/) post.
### 3.1 Running Pre-training with DeepSpeed and 1-bit Adam
We provide example scripts under [DeepSpeedExamples/bing_bert/1-bit_adam/](https://github.com/microsoft/DeepSpeedExamples/tree/master/bing_bert/1-bit_adam). There are 3 sets of scripts, corresponding to the NCCL-based implementation, the MPI-based implementation on Ethernet systems, and the MPI-based implementation on InfiniBand systems. For the MPI-based implementation, we provide example scripts for launching with either deepspeed or mpirun.
<!-- The main part of training is done in `deepspeed_train.py`, which has
The main part of training is done in `deepspeed_train.py`, which has
already been modified to use DeepSpeed. The `ds_train_bert_onebit_bsz4k_seq128.sh` and `ds_train_bert_bsz64k_seq128.sh`
are the shell scripts that help to invoke training and setup several different hyperparameters relevant
to the training process.
......@@ -252,11 +218,11 @@ mpirun -np [#processes] -ppn [#GPUs on each node] -hostfile [hostfile] [MPI flag
For example, in order to use 32 GPUs (4GPUs/node, 8 nodes in total), with the support of InfiniBand, you can use MVAPICH2 as the launcher and run the following command:
```shell
mpirun -np 32 -ppn 4 -hostfile hosts -env MV2_USE_CUDA=1 -env MV2_SUPPORT_DL=1 -env MV2_ENABLE_AFFINITY=0 -env MV2_SMP_USE_CMA=0 bash ds_train_bert_onebit_bsz4k_seq128.sh
``` -->
```
### 3.2 Configuration for BERT Pre-training with DeepSpeed and 1-bit Adam enabled
The `deepspeed_bsz4k_onebit_config_seq128_*.json` file gives the user the ability to specify DeepSpeed
The `deepspeed_bsz4k_onebit_config_seq128.json` file gives the user the ability to specify DeepSpeed
options in terms of batch size, micro batch size, optimizer, learning rate, and other parameters.
Below is the DeepSpeed configuration file for running BERT-large pre-training with sequence length of 128 using the 1-bit Adam optimizer.
......@@ -274,7 +240,7 @@ Below is the DeepSpeed configuration file for running BERT-large pre-training wi
"weight_decay": 0.01,
"bias_correction": false,
"freeze_step": 23000,
"comm_backend_name": "nccl"
"cuda_aware": true
}
},
"gradient_clipping": 1.0,
......@@ -285,8 +251,8 @@ Below is the DeepSpeed configuration file for running BERT-large pre-training wi
}
}
```
The above file is for BERT-large. For BERT-base training (sequence length 128), the suggested `freeze_step` is 16000. For sequence-length-512 pre-training, we suggest using a `freeze_step` of 1500 for both BERT-base and BERT-large. And make sure to set the `comm_backend_name` and `cuda_aware` correctly as described above.
The above file is for BERT-large, but for BERT-base training (sequence length 128) the suggested `freeze_step` will need to be changed to 16000. For the rest of the pre-training using sequence length 512, we suggest using a `freeze_step` of 1500. And make sure to set `cuda_aware` correctly as described above.
### 3.3 Performance Results for BERT Pre-training
Performance results for BERT pre-training can be found in our [blog post](https://www.deepspeed.ai/news/2020/09/09/onebit-adam-blog-post.html) and our [paper](https://arxiv.org/abs/2102.02888).
Performance results for BERT pre-training can be found in our detailed [blog post](https://www.deepspeed.ai/news/2020/09/09/onebit-adam-blog-post.html).
......@@ -276,9 +276,9 @@ For example, a machine with 16 GPUs must have as much local CPU memory as 16 tim
DeepSpeed provides a `LayerSpec` class that delays the construction of
modules until the model layers have been partitioned across workers.
Then each worker will allocate only the layers it's assigned to. So, comparing to the
example from the previous paragraph, using `LayerSpec` a machine with 16 GPUs will need to
allocate a total of 1x model size on its CPU memory and not 16x.
Then each worker will allocate only the layers it's assigned to. So, continuing the
example from the previous paragraph, a machine with 16 GPUs will need to allocate a
total of 1x model size on its CPU, compared to 16x in the LayerSpec example.
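As a minimal sketch of this pattern (the layer sizes and stage count are illustrative; class names follow the `deepspeed.pipe` API, and the module is meant to be constructed under a distributed launch):
```python
import torch.nn as nn
from deepspeed.pipe import LayerSpec, PipelineModule

# Each LayerSpec only records the class and its constructor arguments; the
# module is instantiated later, on the worker that owns its pipeline stage.
layer_specs = [
    LayerSpec(nn.Linear, 1024, 4096),
    LayerSpec(nn.ReLU, inplace=True),
    LayerSpec(nn.Linear, 4096, 1024),
]
model = PipelineModule(layers=layer_specs, num_stages=2)
```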
Here is an example of the abbreviated AlexNet model, but expressed only
with `LayerSpec`s. Note that the syntax is almost unchanged: `nn.ReLU(inplace=True)`
......
......@@ -17,4 +17,4 @@ FusedLamb (GPU)
OneBitAdam (GPU)
----------------------------
.. autoclass:: deepspeed.runtime.fp16.onebit.adam.OneBitAdam
.. autoclass:: deepspeed.runtime.fp16.OneBitAdam
......@@ -4,8 +4,6 @@ toc: true
toc_label: "Contents"
---
<b>03/2021: DeepSpeed is hiring! Come join us: [SDE 2](https://careers.microsoft.com/us/en/job/1013160/Software-Engineer-2), [Sr. SDE](https://careers.microsoft.com/us/en/job/1017151/Senior-Software-Engineer), [Sr. Researcher](https://careers.microsoft.com/us/en/job/1016440/Senior-Researcher)</b>
DeepSpeed is a deep learning optimization library that makes distributed training easy,
efficient, and effective.
......@@ -30,11 +28,7 @@ initiative to enable next-generation AI capabilities at scale, where you can fin
information [here](https://innovation.microsoft.com/en-us/exploring-ai-at-scale).
# What's New?
* [2021/04/02] [[DeepSpeed on AzureML] Transformers and CIFAR examples are now available on AzureML GitHub](https://github.com/Azure/azureml-examples/tree/main/workflows/train/deepspeed)
* [2021/03/30] [[PyTorch Lightning Blog] Accessible Multi-Billion Parameter Model Training with PyTorch Lightning + DeepSpeed](https://medium.com/pytorch-lightning/accessible-multi-billion-parameter-model-training-with-pytorch-lightning-deepspeed-c9333ac3bb59)
* [2021/03/16] [1-bit Adam v2: NCCL-based implementation and more](https://www.deepspeed.ai/tutorials/onebit-adam/)
* [2021/03/08] [ZeRO-3 Offload: Scale your models to trillion parameters without code changes while leveraging both CPUs & GPUs](https://www.deepspeed.ai/news/2021/03/07/zero3-offload.html)
* [2021/01/19] [[🤗Hugging Face Blog] Fit More and Train Faster With ZeRO via DeepSpeed and FairScale](https://huggingface.co/blog/zero-deepspeed-fairscale)
* [2020/11/12] [Simplified install, JIT compiled ops, PyPI releases, and reduced dependencies](#installation)
* [2020/11/10] [Efficient and robust compressed training through progressive layer dropping](https://www.deepspeed.ai/news/2020/10/28/progressive-layer-dropping-news.html)
* [2020/09/10] [DeepSpeed v0.3: Extreme-scale model training for everyone]({{ site.press_release_v3 }})
......
include *.txt README.md
recursive-include requirements *.txt
recursive-include deepspeed *.cpp *.h *.cu *.tr *.cuh *.cc
recursive-include csrc *.cpp *.h *.cu *.tr *.cuh *.cc