Commit 67ea635f authored by aiss's avatar aiss
Browse files

push dsv0.8.2 version

parent 1b2721ad
Pipeline #201 failed with stages
in 0 seconds
import copy
'''Copyright The Microsoft DeepSpeed Team'''
import torch
import deepspeed
def quantize_transformer_layer(orig_layer_impl, model, megatron=False, preln=False):
......
'''
Copyright 2022 The Microsoft DeepSpeed Team
'''
from abc import ABC, abstractmethod
from deepspeed.utils.types import ActivationFuncType
import torch
from deepspeed.accelerator import get_accelerator
# Canonical ordering of the DeepSpeed inference transformer parameter names;
# checkpoint-loading code indexes into this tuple positionally.
transformer_param_names = (
    'attn_qkvw',
    'attn_qkvb',
    'attn_ow',
    'attn_ob',
    'inter_w',
    'inter_b',
    'output_w',
    'output_b',
    'attn_nw',
    'attn_nb',
    'norm_w',
    'norm_b',
)
class DSPolicy(ABC):
    """Abstract base class for DeepSpeed injection policies.

    A policy knows how to pull the relevant parameters out of a specific
    third-party layer implementation so DeepSpeed can replace it.
    """

    # The original (pre-replacement) layer class this policy targets, if any.
    _orig_layer_class = None

    def __init__(self):
        # Policies must opt in to CUDA-graph capture explicitly.
        self.cuda_graph_supported = False

    @abstractmethod
    def attention(self):
        """Return the attention qkv and dense parameters.

        weight: (3*hidden, hidden) and (hidden, hidden)
        bias:   (3*hidden) and (hidden)
        """
        raise NotImplementedError
class TransformerPolicy(DSPolicy):
    """Base policy for mapping transformer-style layers onto DeepSpeed
    inference modules. Subclasses implement the parameter accessors below."""

    # a static class variable containing the HuggingFace model configuration.
    # see e.g., transformers.models.opt.configuration_opt.OPTConfig
    hf_model_config = None

    def __init__(
            self,
            inference=True,
            linear_layer=True,
            scale_attention=True,
            megatron_v2=False,
            use_mup=False,
            # the type of activation function used in MLP
            mlp_act_func_type=ActivationFuncType.GELU,
            # applies layer norm before attention if `pre_attn_norm` is set to True
            pre_attn_norm=True,
            # this flag shows whether or not using prefix in loading the checkpoint
            use_load_prefix=False,
            # whether or not the qkv is stored in the split-format
            split_qkv=True):
        super().__init__()
        # Redundant with DSPolicy.__init__, kept so the attribute is visible here.
        self.cuda_graph_supported = False
        self.inference = inference
        self.linear_layer = linear_layer
        self.scale_attention = scale_attention
        self.is_megatron_v2 = megatron_v2
        self.use_mup = use_mup
        self.mlp_act_func_type = mlp_act_func_type
        self.pre_attn_norm = pre_attn_norm
        self.use_load_prefix = use_load_prefix
        self.split_qkv = split_qkv

    @abstractmethod
    def attention(self):
        """Return the attention qkv and dense parameters.

        weight: (3*hidden, hidden) and (hidden, hidden)
        bias:   (3*hidden) and (hidden)
        """
        raise NotImplementedError

    @abstractmethod
    def get_hidden_heads(self):
        """Return the hidden size and number of attention heads."""
        raise NotImplementedError

    @abstractmethod
    def mlp(self):
        """Return the MLP intermediate and output parameters.

        weight: (intermediate, hidden) and (hidden, intermediate)
        bias:   (intermediate) and (hidden)
        """
        raise NotImplementedError

    @abstractmethod
    def layernorm(self):
        """Return the LayerNorms used in the transformer layer:
        post-attention and pre/post layer norm, gamma/beta of shape (hidden)."""
        raise NotImplementedError
# TODO (lekurile): This function exists in base container as well, consolidate as some point
def transpose(data):
    """Transpose a 2-D tensor *in place*: the transposed values are written
    back into ``data``'s own storage, and a view with swapped dims is returned.

    Callers rely on the in-place mutation (memory reuse for large weights),
    so this must not allocate a fresh result tensor.
    """
    with torch.no_grad():
        data = data.contiguous()
        # reshape(-1) of the non-contiguous transpose view materializes a copy,
        # which is then written back over the original storage.
        flattened = data.transpose(-1, -2).reshape(-1)
        data.reshape(-1).copy_(flattened)
        flattened = None
    return data.reshape(data.shape[-1], data.shape[-2])
# TODO (lekurile): This function exists in megatron feature container as well, consolidate as some point
def _transpose(x, heads=1, mp_replace=None):
heads = heads // mp_replace.mp_size
outer_dim = -1
attention_head_size = x.shape[outer_dim] // heads
new_x_shape = x.size()[:outer_dim] + (heads, attention_head_size)
x_1 = x.view(*new_x_shape)
(q, k, v) = torch.split(x_1, (x_1.shape[-1] // 3), dim=-1)
if len(q.shape) > 2:
new_shape = (q.shape[0], ) + (-1, )
return torch.cat((q.reshape(new_shape),
k.reshape(new_shape),
v.reshape(new_shape)),
dim=outer_dim).reshape(x.shape)
else:
return torch.cat((q.reshape(-1),
k.reshape(-1),
v.reshape(-1)),
dim=-1).reshape(x.shape)
# This checks if the parameter exists in the checkpoint file and maybe copies it into the corresponding destination tensor.
# Note that not all parameters are saved in one checkpoint, that's why we always need to check if they exist!
def maybe_copy(module,
               sd,
               weight_quantizer,
               mp_replace,
               dst_name,
               src_name,
               qkv=False,
               megatron_v2=False,
               split_qkv=False,
               heads=1):
    """If ``src_name`` exists in the state dict ``sd``, copy it (with optional
    tensor-parallel slicing, quantization and Megatron-v2 head reordering)
    into ``module.<dst_name>``; otherwise do nothing.

    qkv:         True when the tensor is a fused query/key/value parameter.
    megatron_v2: True when the checkpoint uses Megatron-v2 head-interleaved layout.
    split_qkv:   True when qkv is stored split, so qkv_copy slicing is used.
    """
    if src_name in sd:
        dst = getattr(module, dst_name)
        tmp = sd[src_name]
        if len(dst.shape) == 1:  # 1-D tensor: a bias / layernorm vector
            if split_qkv:
                dst = mp_replace.qkv_copy(dst, tmp)
            else:
                dst = mp_replace.copy(dst, tmp)
            if qkv and megatron_v2:
                # Megatron-v2 interleaves heads inside the fused qkv bias;
                # reorder it into the layout the inference kernels expect.
                dst = torch.nn.parameter.Parameter(
                    _transpose(dst,
                               heads=heads,
                               mp_replace=mp_replace).contiguous())
        else:  # 2-D weight
            if split_qkv:
                # For int8 the raw tensor is quantized directly; otherwise it is
                # transposed first (the kernels want [input, output] layout).
                dst = mp_replace.qkv_copy(dst, weight_quantizer.quantize(tmp if weight_quantizer.q_int8 else \
                    (transpose(tmp).contiguous())), int8=weight_quantizer.q_int8)
            else:
                if qkv and megatron_v2:
                    # Un-interleave heads; transpose(tmp) mutates tmp in place.
                    tmp = _transpose(transpose(tmp),
                                     heads=heads,
                                     mp_replace=mp_replace).contiguous()
                    if weight_quantizer.q_int8:
                        # Undo the extra transpose so the int8 path quantizes
                        # the tensor in its original orientation.
                        tmp = transpose(tmp)
                dst = mp_replace.copy(dst, weight_quantizer.quantize(tmp if weight_quantizer.q_int8 else \
                    transpose(tmp)), int8=weight_quantizer.q_int8)
        setattr(module, dst_name, dst)
# Extending the maybe_copy function for when the q, k, and v are in separate parameters!
def maybe_copy_qkv(module,
                   sd,
                   weight_quantizer,
                   mp_replace,
                   dst_name,
                   src_names,
                   split_qkv=False):
    """Fuse separate q/k/v checkpoint tensors (named by ``src_names``) into a
    single qkv parameter and copy it into ``module.<dst_name>``.

    Presence of the first source name is used as the existence check for all
    three, matching the single-tensor ``maybe_copy`` behavior.
    """
    if src_names[0] not in sd:
        return
    q = sd[src_names[0]]
    k = sd[src_names[1]]
    v = sd[src_names[2]]
    qkv_data = torch.cat((q, k, v), dim=0)
    dst = getattr(module, dst_name)
    if len(dst.shape) == 1:
        # Bias path: no quantization, only tensor-parallel slicing.
        if split_qkv:
            dst = mp_replace.qkv_copy(dst, qkv_data.contiguous())
        else:
            dst = mp_replace.copy(dst, qkv_data)
    else:
        # Weight path: int8 weights are quantized as-is on the accelerator;
        # otherwise the fused tensor is transposed to [input, output] first.
        if weight_quantizer.q_int8:
            prepared = qkv_data.to(get_accelerator().device_name())
        else:
            prepared = (transpose(qkv_data)).contiguous() if split_qkv else transpose(qkv_data)
        quantized = weight_quantizer.quantize(prepared)
        if split_qkv:
            dst = mp_replace.qkv_copy(dst, quantized, int8=weight_quantizer.q_int8)
        else:
            dst = mp_replace.copy(dst, quantized, int8=weight_quantizer.q_int8)
    setattr(module, dst_name, dst)
import copy
'''Copyright The Microsoft DeepSpeed Team'''
import os
import torch
import tqdm
import deepspeed
import deepspeed.ops.transformer as transformer_inference
from .replace_policy import HFBertLayerPolicy, HFGPT2LayerPolicy, HFGPTJLayerPolicy
from .replace_policy import replace_policies
from ..constants import INFERENCE_GENERIC_MODE, INFERENCE_SPECIALIZED_MODE
from ..runtime.weight_quantizer import WeightQuantization
from deepspeed.ops.transformer.inference.diffusers_attention import DeepSpeedDiffusersAttention
from deepspeed.ops.transformer.inference.diffusers_transformer_block import DeepSpeedDiffusersTransformerBlock
from deepspeed.ops.transformer.inference.diffusers_2d_transformer import Diffusers2DTransformerConfig
from deepspeed.accelerator import get_accelerator
from .replace_policy import HFGPT2LayerPolicy
from .replace_policy import replace_policies, generic_policies
from deepspeed import comm as dist
from torch import nn
from .layers import LinearAllreduce, LinearLayer
from .load_checkpoint import load_model_with_checkpoint
import time
class LinearAllreduce(nn.Module):
    """Linear layer for row-parallel weights: the partial matmul result is
    all-reduced across ``mp_group`` before the (replicated) bias is added.

    ``weight`` is expected in [input, output] orientation (no transpose in
    forward).
    """

    def __init__(self, weight, bias=None, mp_group=None):
        super().__init__()
        self.weight = weight
        self.bias = bias
        self.mp_group = mp_group

    def forward(self, input):
        result = torch.matmul(input, self.weight)
        if self.mp_group is not None:
            # Sum partial results from all model-parallel ranks in place.
            torch.distributed.all_reduce(result, group=self.mp_group)
        if self.bias is not None:
            # Bias is added after the reduction so it is applied exactly once.
            result += self.bias
        return result
class LinearLayer(nn.Module):
    """Minimal linear layer holding pre-sliced tensors.

    Unlike ``torch.nn.Linear``, ``weight`` is used as-is in
    ``input @ weight`` (no transpose), so it must be [input, output] shaped.
    """

    def __init__(self, weight, bias=None):
        super().__init__()
        self.weight = weight
        self.bias = bias

    def forward(self, input):
        result = torch.matmul(input, self.weight)
        if self.bias is not None:
            result += self.bias
        return result
from .utils import policy_to_ds_container
class ReplaceWithTensorSlicing:
    """Copies checkpoint tensors into (possibly tensor-parallel) destination
    parameters, slicing along the appropriate dimension for this rank.

    NOTE(review): this span of the file contained merged old/new diff lines
    (duplicate ``__init__``/``qkv_copy``/``copy`` headers and a stray
    ``@@ ...`` hunk marker) that made the class a syntax error. This is the
    reconstructed post-commit (int8-aware) version built from the "new"
    lines of the diff.
    """

    def __init__(self, mp_group=None, mp_size=1, out_dim=1, in_dim=0):
        # Rank inside the model-parallel group selects which slice this
        # process keeps; rank 0 when not running model-parallel.
        if mp_group is not None:
            self.gpu_index = dist.get_rank(group=mp_group)
        else:
            self.gpu_index = 0
        self.out_dim = out_dim
        self.in_dim = in_dim
        self.mp_size = mp_size

    def merge_assert(self, dim1, dim2):
        # Slicing can only shrink a dimension; merging checkpoint shards must
        # be done through deepspeed's load_checkpoint path instead.
        assert dim1 > dim2, \
            'Merging tensors is not allowed here! Please use deepspeed load_checkpoint\
            for merging your checkpoints before replacing the transformer layer with\
            inference-kernels'

    def qkv_copy(self, dst, src, int8=False):
        """Copy a fused QKV tensor into ``dst``, slicing each of the q/k/v
        thirds separately so the per-rank slice stays q|k|v ordered.

        int8: int8 weights are stored transposed, so the "output" axis flips.
        """
        if src is None:
            return src
        src_shape = src.shape
        dst_shape = dst.shape

        outer_dim = 0 if int8 else -1
        inner_dim = -1 if int8 else 0

        src_split = torch.split(src.data, src.shape[outer_dim] // 3, dim=outer_dim)
        if (len(src_shape) == 2 and len(dst_shape) == 2):
            # 2-D weight tensor.
            if src_shape[outer_dim] == dst_shape[self.out_dim]:
                # No slicing needed; straight copy into dst's storage.
                dst = dst.reshape(-1).data.copy_(src.data.reshape(-1)).reshape(
                    src.shape)
                dst = torch.nn.parameter.Parameter(dst, requires_grad=False)
                if hasattr(src, 'scale'):
                    dst.scale = src.scale
                return dst
            if self.out_dim == 1:
                self.merge_assert(src_shape[outer_dim], dst_shape[self.out_dim])
                qkv_size = dst_shape[self.out_dim] // 3
                qkv_split = [
                    torch.split(src_s,
                                qkv_size,
                                dim=outer_dim) for src_s in src_split
                ]
                # Re-fuse the i-th slice of q, k and v for each rank.
                weight_split = [
                    torch.cat([qkv_s[i] for qkv_s in qkv_split],
                              axis=outer_dim) for i in range(len(qkv_split[0]))
                ]
                dst = dst.reshape(-1).data.copy_(
                    weight_split[self.gpu_index].contiguous().reshape(-1)).reshape(
                        weight_split[self.gpu_index].shape)
            else:
                dst.data.copy_(src_split[self.gpu_index].to(
                    get_accelerator().current_device_name()).contiguous())
        else:
            # 1-D bias tensor.
            if src_shape[0] == dst_shape[0]:
                return torch.nn.parameter.Parameter(src)
            if self.out_dim == 1:
                qkv_size = dst_shape[0] // 3
                qkv_split = [torch.split(src_s, qkv_size, dim=0) for src_s in src_split]
                bias_split = [
                    torch.cat([qkv_s[i] for qkv_s in qkv_split],
                              axis=0) for i in range(len(qkv_split[0]))
                ]
                dst.data.copy_(bias_split[self.gpu_index].contiguous())
            else:
                dst.data.copy_(src_split[self.gpu_index].contiguous())

        dst = torch.nn.parameter.Parameter(dst, requires_grad=False)
        # Propagate the quantization scale when the source carries one.
        if hasattr(src, 'scale'):
            dst.scale = src.scale
        return dst

    def copy(self, dst, src, int8=False):
        """Copy ``src`` into ``dst``, slicing along in/out dimension when the
        shapes differ (tensor-parallel partitioning)."""
        if src is None:
            return src
        assert not dst.data.is_meta  # the torch.Tensor.copy_ method used below will silently fail on meta tensors
        outer_dim = 0 if int8 else 1
        inner_dim = 1 if int8 else 0
        src_shape = src.shape
        dst_shape = dst.shape
        if (len(src_shape) == 2 and len(dst_shape) == 2):
            if src_shape[inner_dim] == dst_shape[
                    self.in_dim] and src_shape[outer_dim] == dst_shape[self.out_dim]:
                # Shapes match: copy straight through dst's storage.
                dst = dst.reshape(-1).data.copy_(src.data.reshape(-1)).reshape(
                    src.shape)
            else:
                # Exactly one dimension is partitioned; slice it for this rank.
                if src_shape[inner_dim] != dst_shape[self.in_dim]:
                    self.merge_assert(src_shape[inner_dim], dst_shape[self.in_dim])
                    weight_split = torch.split(
                        src,
                        dst_shape[self.in_dim],
                        dim=inner_dim)[self.gpu_index].contiguous()
                else:
                    self.merge_assert(src_shape[outer_dim], dst_shape[self.out_dim])
                    weight_split = torch.split(
                        src.data,
                        dst_shape[self.out_dim],
                        dim=outer_dim)[self.gpu_index].contiguous()
                dst = dst.reshape(-1).data.copy_(weight_split.reshape(-1)).reshape(
                    weight_split.shape)
        else:
            # 1-D bias tensor.
            if src_shape[0] == dst_shape[0]:
                dst.data.copy_(src)
            else:
                bias_split = torch.split(src.data,
                                         dst_shape[-1])[self.gpu_index].contiguous()
                dst.data.copy_(bias_split)
        dst = torch.nn.parameter.Parameter(dst, requires_grad=False)
        if hasattr(src, 'scale'):
            dst.scale = src.scale
        return dst
def get_transformer_name(replaced_module):
    """Return the dotted attribute path (e.g. ``"transformer.h"``) of the
    first supported sub-model's ``ModuleList`` of transformer layers, or an
    empty string when no supported model is found."""
    from .containers import supported_models
    from torch.nn import ModuleList

    prefix = ''
    for outer_name, outer_child in replaced_module.named_children():
        if outer_child.__class__ not in supported_models:
            continue
        prefix = outer_name + '.'
        for inner_name, inner_child in outer_child.named_children():
            if inner_child.__class__ is ModuleList:
                prefix += inner_name
                break
        break
    return prefix
class GroupQuantizer:
    """Group-wise symmetric int8 quantizer for inference weights.

    Each of ``num_groups`` row-groups gets its own scale derived from the
    group's absolute max; extra scales for the two halves along
    ``parallel_dim`` are appended for the tensor-parallel kernels.
    """

    def __init__(self, q_int8=True, group_size=1, num_bits=8, num_groups=0):
        self.group_size = group_size
        self.num_bits = num_bits
        self.q_int8 = q_int8
        self.num_groups = num_groups

    def quantize(self, inputs, qkv=True, count=1, parallel_dim=0):
        # Pass-through when int8 is disabled or the tensor is not quantized:
        # return the input wrapped as a frozen Parameter with a dummy scale.
        if not self.q_int8 or not qkv:
            inputs = torch.nn.Parameter(inputs, requires_grad=False)
            inputs.scale = torch.empty(1)
            return inputs

        q_range = 2**self.num_bits
        if self.num_groups > 0:
            num_groups = self.num_groups
        else:
            num_groups = inputs.shape[0] // self.group_size
        inputs = inputs.to(get_accelerator().current_device_name())

        # Per-group symmetric scale from the group's absolute extremum.
        grouped = inputs.reshape(num_groups, -1).contiguous()
        g_min = torch.min(grouped, dim=1, keepdim=True)[0].float()
        g_max = torch.max(grouped, dim=1, keepdim=True)[0].float()
        scale = torch.max(g_min.abs(), g_max.abs()) * 2.0 / (q_range)
        quantized = (grouped / scale).round().clamp(-q_range // 2, q_range // 2 - 1)
        out = torch.nn.Parameter(quantized.reshape(inputs.shape).to(torch.int8).contiguous(),
                                 requires_grad=False)

        # Additional scales for each half of the tensor along parallel_dim,
        # computed from the original (unquantized) values.
        halves = inputs.split(inputs.shape[parallel_dim] // 2, dim=parallel_dim)
        half_scales = []
        for i in range(2):
            half_flat = halves[i].reshape(num_groups, -1).contiguous()
            h_min = torch.min(half_flat, dim=1, keepdim=True)[0].float()
            h_max = torch.max(half_flat, dim=1, keepdim=True)[0].float()
            half_scales.append((torch.max(h_min.abs(),
                                          h_max.abs()) * 2.0 /
                                (q_range)).squeeze().unsqueeze(0))
        out.scale = torch.cat([scale.squeeze().unsqueeze(0)] + half_scales,
                              dim=0).reshape(num_groups, -1).contiguous()
        return out
def _module_match(module):
    """Return an instantiated generic policy that matches ``module``, or
    ``None`` when no policy in ``generic_policies`` claims it."""
    for policy_cls in generic_policies:
        candidate = policy_cls()
        if candidate.match(module):
            return candidate
    return None
def generic_injection(module, fp16=False, enable_cuda_graph=True):
    """Inject DeepSpeed inference kernels into a diffusers-style pipeline.

    Replaces CrossAttention / BasicTransformerBlock children with DeepSpeed
    equivalents and wraps the pipeline's text encoder in a CUDA-graph-capable
    DSClipEncoder. ``module`` is presumably a diffusers pipeline object (not a
    plain ``torch.nn.Module``) — TODO confirm against callers.
    """

    def replace_attn(child, policy):
        # The policy returns either 5 items (fused qkv weight) or 7 items
        # (separate q/k/v weights) plus dense weight/bias and dims.
        policy_attn = policy.attention(child)
        if policy_attn is None:
            return child
        if len(policy_attn) == 5:
            qkvw, attn_ow, attn_ob, hidden_size, heads = policy_attn
        else:
            qw, kw, vw, attn_ow, attn_ob, hidden_size, heads = policy_attn
        config = transformer_inference.DeepSpeedInferenceConfig(
            hidden_size=hidden_size,
            heads=heads,
            fp16=fp16,
            triangular_masking=False,
            max_out_tokens=4096,
        )
        attn_module = DeepSpeedDiffusersAttention(config)

        def transpose(data):
            # In-place transpose through the tensor's own storage, then view
            # with swapped dims.
            data = data.contiguous()
            data.reshape(-1).copy_(data.transpose(-1, -2).contiguous().reshape(-1))
            data = data.reshape(data.shape[-1], data.shape[-2])
            # NOTE(review): the result of `.to(...)` is discarded here — this
            # line is a no-op for CPU tensors; likely intended
            # `data = data.to(...)`. Left unchanged (doc-only edit).
            data.to(get_accelerator().current_device_name())
            return data

        if len(policy_attn) == 5:
            attn_module.attn_qkvw.data = transpose(qkvw.data)
        else:
            # Separate q/k/v: clear the fused slot and fill the split slots.
            attn_module.attn_qkvw = None
            attn_module.attn_qw.data = transpose(qw.data)
            attn_module.attn_kw.data = transpose(kw.data)
            attn_module.attn_vw.data = transpose(vw.data)

        attn_module.attn_qkvb = None
        attn_module.attn_ow.data = transpose(attn_ow.data)
        attn_module.attn_ob.data.copy_(
            attn_ob.data.to(get_accelerator().current_device_name()))
        return attn_module

    def replace_attn_block(child, policy):
        # Wrap a diffusers BasicTransformerBlock with the DeepSpeed version.
        config = Diffusers2DTransformerConfig()
        return DeepSpeedDiffusersTransformerBlock(child, config)

    if isinstance(module, torch.nn.Module):
        # Plain nn.Module: nothing to inject at this level.
        pass
    else:
        # Pipeline-like object (e.g. a StableDiffusion pipeline) — TODO confirm.
        if fp16 is False:
            raise ValueError("Generic injection only supported with FP16")

        try:
            import diffusers
            cross_attention = diffusers.models.attention.CrossAttention
            attention_block = diffusers.models.attention.BasicTransformerBlock
            new_policies = {
                cross_attention: replace_attn,
                attention_block: replace_attn_block,
            }
        except ImportError:
            # diffusers not installed: nothing to replace.
            new_policies = {}

        #replace_transformer_layer(None,
        #                          module.text_encoder,
        #                          training=False,
        #                          replace_with_kernel_inject=True,
        #                          triangular_masking=True,
        #                          max_out_tokens=8192)
        from ..model_implementations.transformers.clip_encoder import DSClipEncoder
        cg_encoder = DSClipEncoder(module.text_encoder,
                                   enable_cuda_graph=enable_cuda_graph)
        setattr(module, 'text_encoder', cg_encoder)
        for name in module.__dict__.keys():
            sub_module = getattr(module, name)
            # Find a generic policy that claims this sub-module (e.g. UNet, VAE).
            policy = _module_match(sub_module)

            if policy is not None:

                def _replace_module(module, policy):
                    # Depth-first: replace grandchildren before checking child.
                    for name, child in module.named_children():
                        _replace_module(child, policy)
                        if child.__class__ in new_policies:
                            replaced_module = new_policies[child.__class__](child,
                                                                            policy)
                            setattr(module, name, replaced_module)

                _replace_module(sub_module, policy)
                new_module = policy.apply(sub_module,
                                          enable_cuda_graph=enable_cuda_graph)
                print(f"**** found and replaced {name} w. {type(new_module)}")
                setattr(module, name, new_module)
# Global handle to the first model-specific container created during layer
# replacement; reused later for generic checkpoint loading.
container_g = None
def replace_transformer_layer(orig_layer_impl,
model,
policy=None,
micro_batch_size=-1,
config=None,
seed=-1,
hidden_size=-1,
num_attention_heads=-1,
mp_size=1,
training_mp_size=1,
mp_group=None,
ep_group=None,
expert_mp_group=None,
preln=True,
fp16=True,
local_rank=-1,
stochastic_mode=True,
training=True,
quantize=False,
quantize_settings=None,
triangular_masking=False,
return_tuple=True,
replace_with_kernel_inject=False,
linear_layer_setting=None,
moe=False,
moe_experts=1,
moe_type='standard'):
checkpoint_dict,
config,
model_config):
""" Replace bert-style transformer layers with DeepSpeed's transformer layer
Arguments:
orig_layer_impl (torch.nn.Module): the original transformer layer implementation to look for,
e.g., transformers.modeling_bert.BertLayer.
model (torch.nn.Module): user's nn.module representing their model
policy: shows the policy for mapping from the orig_layer_impl to transformer parameters when
replace_with_kernel_inject is set, otherwise, it provides the names of two linear layers as
a tuple: (attention_output projection, transformer output projection)
micro_batch_size (int): micro batch size per gpu used during training/eval
config (dict): model config containing hidden size, attention heads, etc.
seed (int): random seed value
max_seq_length (int): max sequence length for training
hidden_size (int): hidden dimension
num_attention_heads (int): number of attention heads
mp_size (int): model_parallelism degree
mp_group : model_parallel group initialized on the modeling side
preln (bool): does the original layer implementation do pre or post layer norm?
fp16 (bool): fp16 or fp32
local_rank (int): GPU rank (optional),
stochastic_mode (bool): whether to use stochastic mode
training (bool): specifying whether kernel-injection is done for training/inference (set to false for inference-mode injection)
quantize_settings (tuple): this setting shows how we can quantize a model for running it through the inference kernels.
It includes (quantization_scales, merge_count, mlp_extra_grouping, quantize_groups).
return_tuple (bool): if set, transformer layer returns a tuple as the output.
Note: this flag needs to be set for huggingface models.
replace_with_kernel_inject (bool): injection_mode, if true, kernels will be add along with configuring
Tensor-Parallelism
linear_layer_setting (tuple of modules) [Optional]: shows which two classes are used for linear layers
and embedding layers
attention_params: (list of strings) [Optional]: shows the parameters in the attention part that needs to
be adjusted based on the model-parallelism
checkpoint_dict: Dictionary for checkpoint passed from the Inference Engine
config: top-level DS Inference config defined in inference/config.py
model_config: HuggingFace model config passed from the inference/engine.py
Returns:
Updated nn.module with replaced transformer layers
"""
# defining globals as internally defined functions inherit these everywhere
fp16 = (config.dtype == torch.float16 or config.dtype == torch.int8)
quantize = (config.dtype == torch.int8)
# todo: Refactor later. In future, let's minimize the style used above and use config.** instead
linear_layer_setting = None
'''
linear_layer_setting (tuple of modules) [Optional]: shows which two classes are used for linear layers and embedding layers
'''
micro_batch_size = -1
seed = -1
local_rank = -1
mp_replace = ReplaceWithTensorSlicing(
mp_group=config.tensor_parallel.tp_group,
mp_size=config.tensor_parallel.tp_size) #, out_dim=0, in_dim=1)
def replace_with_policy(child,
policy_cls,
triangular_masking,
inference=False,
preln=True,
layer_id=0):
preln = False if policy_cls is HFBertLayerPolicy else preln
if policy_cls is HFBertLayerPolicy:
policy = policy_cls(child, inference=inference, preln=preln)
else:
policy = policy_cls(child, inference=inference)
policy = policy_cls(child, inference=inference)
if not policy.cuda_graph_supported:
# policy says cuda graph is not supported raise an error if set
assert not config.enable_cuda_graph, "cuda graph is not supported with this model, please disable"
if inference:
hidden_size, num_attention_heads = policy.get_hidden_heads()
assert num_attention_heads % mp_size == 0,\
"To run the model parallel across the GPUs, the attention_heads require to be divisible by the world_size!" +\
"This is because the attention computation is partitioned evenly among the parallel GPUs."
from deepspeed.moe.layer import MoE
moe = False
if hasattr(child, 'mlp') and isinstance(child.mlp, MoE):
num_experts = child.mlp.num_experts
moe = True
attn_linear_layer, qkvw, qkvb, dense_w, dense_b, scale_attention, megatron_v2 = policy.attention()
if not moe or moe_type == 'standard':
mlp_linear_layer, _h4h_w, _h4h_b, _4hh_w, _4hh_b = policy.mlp()
else:
mlp_linear_layer, _h4h_w, _h4h_b, _4hh_w, _4hh_b, \
_res_h4h_w, _res_h4h_b, _res_4hh_w, _res_4hh_b, _res_coef = policy.mlp(moe_type)
attn_nw, attn_nb, input_nw, input_nb = policy.layerNorm()
if quantize:
if policy_cls is not HFBertLayerPolicy:
qkvw = qkvw.to(torch.int8)
dense_w = dense_w.to(torch.int8)
_h4h_w = [moe_w1.to(torch.int8)
for moe_w1 in _h4h_w] if moe else _h4h_w.to(torch.int8)
_4hh_w = [moe_w1.to(torch.int8)
for moe_w1 in _4hh_w] if moe else _4hh_w.to(torch.int8)
elif fp16:
qkvw = qkvw.half()
dense_w = dense_w.half()
_h4h_w = [moe_w1.half() for moe_w1 in _h4h_w] if moe else _h4h_w.half()
_4hh_w = [moe_w1.half() for moe_w1 in _4hh_w] if moe else _4hh_w.half()
if quantize or fp16:
qkvb = qkvb if qkvb is None else qkvb.half()
dense_b = dense_b if dense_b is None else dense_b.half()
_h4h_b = [moe_b1.half() for moe_b1 in _h4h_b] if moe else _h4h_b.half()
_4hh_b = [moe_b1.half() for moe_b1 in _4hh_b] if moe else _4hh_b.half()
attn_nw = attn_nw if attn_nw is None else attn_nw.half()
attn_nb = attn_nb if attn_nb is None else attn_nb.half()
input_nw = input_nw.half()
input_nb = input_nb.half()
if moe and moe_type == 'residual' and fp16:
_res_h4h_b = _res_h4h_b.half()
_res_4hh_b = _res_4hh_b.half()
_res_h4h_w = _res_h4h_w.half()
_res_4hh_w = _res_4hh_w.half()
_res_coef = _res_coef.half()
mp_replace = ReplaceWithTensorSlicing(mp_group=mp_group)
#expert_mp_replace = ReplaceWithTensorSlicing(mp_group=expert_mp_group)
if inference:
if moe:
ep_world_size = torch.distributed.get_world_size()
local_ep_size = 1 if num_experts < ep_world_size else num_experts // ep_world_size
transformer_config = transformer_inference.DeepSpeedMoEInferenceConfig(
hidden_size=hidden_size,
heads=num_attention_heads,
layer_norm_eps=config.layer_norm_eps if hasattr(
config,
'layer_norm_eps') else 1e-12,
fp16=fp16,
pre_layer_norm=preln,
mp_size=mp_size,
q_int8=quantize,
moe_experts=local_ep_size,
global_experts=num_experts,
mlp_type=moe_type)
else:
rotary_dim = config.rotary_dim if hasattr(config, 'rotary_dim') else child.attention.rotary_ndims \
if hasattr(child, 'attention') and hasattr(child.attention,'rotary_ndims') else -1
transformer_config = transformer_inference.DeepSpeedInferenceConfig(
hidden_size=hidden_size,
heads=num_attention_heads,
layer_norm_eps=config.layer_norm_eps if hasattr(
config,
'layer_norm_eps') else
(config.layer_norm_epsilon
if hasattr(config,
'layer_norm_epsilon') else config.layernorm_epsilon
if hasattr(config,
'layernorm_epsilon') else 1.0e-12),
fp16=fp16,
pre_layer_norm=preln,
mp_size=mp_size,
q_int8=quantize,
return_tuple=(return_tuple or (policy_cls is HFBertLayerPolicy)),
triangular_masking=(policy_cls is not HFBertLayerPolicy),
local_attention=((config.attention_layers[layer_id] == "local")
if hasattr(config,
'attention_layers') else False),
window_size=(config.window_size if hasattr(config,
'window_size') else 1),
rotary_dim=rotary_dim,
mlp_after_attn=(rotary_dim is None or rotary_dim < 0),
training_mp_size=training_mp_size)
if quantize and quantize_settings is not None:
(quantization_scales,
merge_count,
mlp_extra_grouping,
quantize_groups) = quantize_settings
if moe:
new_module = transformer_inference.DeepSpeedMoEInference(
transformer_config,
mp_group=mp_group,
ep_group=None if ep_group is None else ep_group[num_experts],
expert_mp_group=None
if expert_mp_group is None else expert_mp_group[num_experts],
quantize_scales=quantization_scales[layer_id],
quantize_groups=quantize_groups,
merge_count=merge_count,
mlp_extra_grouping=mlp_extra_grouping,
qkv_merging=(policy_cls is HFBertLayerPolicy))
# 1. Create a model-specific container object using the policy object.
_container = policy_to_ds_container(policy=policy,
config=config,
model_config=model_config,
layer_id=layer_id,
child=child)
_container.set_dtype(fp16)
_container.set_moe(moe)
else:
new_module = transformer_inference.DeepSpeedTransformerInference(
transformer_config,
mp_group=mp_group,
quantize_scales=quantization_scales[layer_id],
quantize_groups=quantize_groups,
merge_count=merge_count,
mlp_extra_grouping=mlp_extra_grouping,
qkv_merging=(policy_cls is HFBertLayerPolicy))
if quantize and qkvw.dtype != torch.int8:
quantize_bits = 8
quantizer = WeightQuantization()
if policy_cls is HFBertLayerPolicy:
data_quantized, _ = quantizer.quantize_data(qkvw.data, quantize_bits, quantize_groups * 3)
else:
data_quantized, _ = quantizer.quantize_data(qkvw.data, quantize_bits, quantize_groups)
qkvw.data.copy_(data_quantized)
qkvw.data = qkvw.data.to(torch.int8)
else:
# 2. Set the tensor parallelism config
_container.set_tensor_parallel_config(config.tensor_parallel.tp_size,
config.tensor_parallel.tp_group)
if moe:
new_module = transformer_inference.DeepSpeedMoEInference(
transformer_config,
mp_group=mp_group,
ep_group=None if ep_group is None else ep_group[num_experts],
expert_mp_group=None
if expert_mp_group is None else expert_mp_group[num_experts],
)
# 3. Initialize tensors
_container.initialize_tensors()
else:
new_module = transformer_inference.DeepSpeedTransformerInference(
transformer_config,
mp_group=mp_group,
)
new_module.config.scale_attention = scale_attention
# we want the weights in [input, output] shape
# linear layer is created with [input, output] shape
# transpose it here to reduce inference cost!
def transpose(data):
data.view(-1).copy_(data.transpose(-1, -2).contiguous().view(-1))
data = data.reshape(data.shape[-1], data.shape[-2])
return data
if attn_linear_layer:
qkvw.data = transpose(qkvw.data)
dense_w.data = transpose(dense_w.data)
if megatron_v2:
new_module.config.rotate_half = True
new_module.config.rotate_every_two = False
def _transpose(x):
num_attention_heads_per_partition = transformer_config.heads // transformer_config.mp_size
attention_head_size = x.shape[-1] // num_attention_heads_per_partition
new_x_shape = x.size()[:-1] + (num_attention_heads_per_partition,
attention_head_size)
x_1 = x.view(*new_x_shape)
(q,
k,
v) = torch.split(x_1,
(x_1.shape[-1] // 3),
dim=(x_1.dim() - 1))
if len(q.shape) > 2:
return torch.cat((q.reshape(q.shape[0],
-1),
k.reshape(q.shape[0],
-1),
v.reshape(q.shape[0],
-1)),
dim=-1).reshape(x.shape)
else:
return torch.cat((q.reshape(-1),
k.reshape(-1),
v.reshape(-1)),
dim=-1).reshape(x.shape)
qkvw = torch.nn.Parameter(_transpose(qkvw).contiguous())
qkvb = torch.nn.Parameter(_transpose(qkvb).contiguous())
dense_b = dense_b * (transformer_config.training_mp_size /
transformer_config.mp_size)
_4hh_b = _4hh_b * (transformer_config.training_mp_size /
transformer_config.mp_size)
if mlp_linear_layer:
_h4h_w = [transpose(moe_w1.data)
for moe_w1 in _h4h_w] if moe else transpose(_h4h_w.data)
_4hh_w = [transpose(moe_w1.data)
for moe_w1 in _4hh_w] if moe else transpose(_4hh_w.data)
if moe and moe_type == 'residual':
_res_h4h_w.data = transpose(_res_h4h_w.data)
_res_4hh_w.data = transpose(_res_4hh_w.data)
_res_coef.data = transpose(_res_coef.data)
attn_block = new_module.attention
attn_block.attn_qkvw = mp_replace.qkv_copy(attn_block.attn_qkvw, qkvw)
attn_block.attn_qkvb = mp_replace.qkv_copy(attn_block.attn_qkvb, qkvb)
attn_block.attn_ow = mp_replace.copy(attn_block.attn_ow, dense_w)
attn_block.attn_ob = mp_replace.copy(attn_block.attn_ob, dense_b)
mpl_block = new_module.mlp
if moe:
gpu_index = torch.distributed.get_rank()
gpu_index = 0
for ep_index in range(local_ep_size):
mpl_block[ep_index].inter_w.data = _h4h_w[
gpu_index * local_ep_size + ep_index].to(
torch.cuda.current_device())
mpl_block[ep_index].inter_b.data = _h4h_b[
gpu_index * local_ep_size + ep_index].to(
torch.cuda.current_device())
mpl_block[ep_index].output_w.data = _4hh_w[
gpu_index * local_ep_size + ep_index].to(
torch.cuda.current_device())
mpl_block[ep_index].output_b.data = _4hh_b[
gpu_index * local_ep_size + ep_index].to(
torch.cuda.current_device())
new_module.attn_nw.data = attn_nw.to(torch.cuda.current_device())
new_module.attn_nb.data = attn_nb.to(torch.cuda.current_device())
if moe_type == 'residual':
new_module.res_mlp.inter_w.data = _res_h4h_w.to(
torch.cuda.current_device())
new_module.res_mlp.inter_b.data = _res_h4h_b.to(
torch.cuda.current_device())
new_module.res_mlp.output_w.data = _res_4hh_w.to(
torch.cuda.current_device())
new_module.res_mlp.output_b.data = _res_4hh_b.to(
torch.cuda.current_device())
new_module.res_coef.data = _res_coef.to(torch.cuda.current_device())
else:
mpl_block.inter_w.data = mp_replace.copy(mpl_block.inter_w, _h4h_w)
mpl_block.inter_b.data = mp_replace.copy(mpl_block.inter_b, _h4h_b)
mpl_block.output_w.data = mp_replace.copy(mpl_block.output_w, _4hh_w)
mpl_block.output_b.data = mp_replace.copy(mpl_block.output_b, _4hh_b)
if attn_nw is None:
new_module.mlp.attn_nw = attn_nw
else:
new_module.mlp.attn_nw.data = attn_nw.to(torch.cuda.current_device())
if attn_nb is None:
new_module.mlp.attn_nb = attn_nb
else:
new_module.mlp.attn_nb.data = attn_nb.to(torch.cuda.current_device())
new_module.norm_w.data = input_nw.to(torch.cuda.current_device())
new_module.norm_b.data = input_nb.to(torch.cuda.current_device())
else:
transformer_config = deepspeed.DeepSpeedTransformerConfig(
batch_size=micro_batch_size,
hidden_size=config.hidden_size,
heads=config.num_attention_heads,
attn_dropout_ratio=config.attention_probs_dropout_prob,
hidden_dropout_ratio=config.hidden_dropout_prob,
num_hidden_layers=config.num_hidden_layers,
initializer_range=config.initializer_range,
layer_norm_eps=config.layer_norm_eps if hasattr(
config,
'layer_norm_eps') else 1e-12,
seed=seed,
fp16=fp16,
pre_layer_norm=(False if policy_cls is HFBertLayerPolicy else preln),
return_tuple=return_tuple,
local_rank=local_rank,
stochastic_mode=stochastic_mode,
normalize_invertible=True,
training=training)
new_module = deepspeed.DeepSpeedTransformerLayer(transformer_config)
new_module.attn_qkvw.data = qkvw
new_module.attn_qkvb.data = qkvb
new_module.attn_ow.data = dense_w
new_module.attn_ob.data = dense_b
new_module.attn_nw.data = attn_nw
new_module.attn_nb.data = attn_nb
new_module.norm_w.data = input_nw
new_module.norm_b.data = input_nb
new_module.inter_w.data = _h4h_w
new_module.inter_b.data = _h4h_b
new_module.output_w.data = _4hh_w
new_module.output_b.data = _4hh_b
return new_module
# 4. deal with data types -- needs refactor to use dtype instead of fp16
if fp16:
_container.convert_to_required_dtype(dtype=torch.half)
# 5. Set the quantization config
quantizer = GroupQuantizer(q_int8=quantize)
_container.set_quantization_config(quantize, quantizer)
# 6. create a DS Inference config object
_container.create_ds_model_config()
# 7. use the config and create the module
_container.create_module()
# 8. transpose the weights and bias if needed
_container.transpose()
# 9. deal with tensor parallelism.
_container.apply_tensor_parallelism(mp_replace)
# 10. copy the tensors from the model-specific container to the new module
_container.copy_data_to_new_module()
# 11. set global for generic checkpoint loading
global container_g
if container_g is None:
container_g = _container
return _container.module
def replace_wo_policy(module, all_reduce_linears):
mp_size = config.tensor_parallel.tp_size
mp_group = config.tensor_parallel.tp_group
def _replace(child, name, conv_linear_layer):
mp_replace = ReplaceWithTensorSlicing(mp_group=mp_group)
weight_shape = child.weight.shape
if name in all_reduce_linears:
new_weight = torch.empty(
(child.weight.shape[0]
if conv_linear_layer else child.weight.shape[1] // mp_size,
child.weight.shape[1]
if conv_linear_layer else child.weight.shape[0]),
device=child.weight.device,
dtype=torch.half if fp16 else torch.float)
if not conv_linear_layer:
child.weight.data.view(-1).copy_(
child.weight.data.transpose(-1,
-2).contiguous().view(-1))
child.weight.data = child.weight.data.reshape(
child.weight.data.shape[-1],
child.weight.data.shape[-2])
data = mp_replace.copy(new_weight,
child.weight.data).to(torch.cuda.current_device())
new_weight = torch.empty((
weight_shape[1] if conv_linear_layer else weight_shape[0],
(weight_shape[0] if conv_linear_layer else weight_shape[1]) //
mp_size,
),
device=child.weight.device,
dtype=child.weight.dtype)
if conv_linear_layer:
child.weight.data = child.weight.data.transpose(-1, -2).contiguous()
data = mp_replace.copy(new_weight, child.weight.data)
new_bias = torch.empty((weight_shape[0]),
device=child.weight.device,
dtype=child.weight.dtype)
if child.bias is not None:
new_bias.data.copy_(child.bias.data)
return LinearAllreduce(data, child.bias if child.bias is None else \
child.bias.to(torch.cuda.current_device()), mp_group)
torch.nn.parameter.Parameter(new_bias.to(get_accelerator().current_device_name())), mp_group)
else:
new_weight = torch.empty(
(child.weight.shape[0] //
mp_size if conv_linear_layer else child.weight.shape[1],
child.weight.shape[1]
if conv_linear_layer else child.weight.shape[0] // mp_size),
device=child.weight.device,
dtype=torch.half if fp16 else torch.float)
if not conv_linear_layer:
child.weight.data.view(-1).copy_(
child.weight.data.transpose(-1,
-2).contiguous().view(-1))
child.weight.data = child.weight.data.reshape(
child.weight.data.shape[-1],
child.weight.data.shape[-2])
new_weight = torch.empty((
(weight_shape[1] if conv_linear_layer else weight_shape[0]) //
mp_size,
weight_shape[0] // mp_size if conv_linear_layer else weight_shape[1],
),
device=child.weight.device,
dtype=child.weight.dtype)
if conv_linear_layer:
child.weight.data = child.weight.data.transpose(-1, -2).contiguous()
data = mp_replace.copy(new_weight, child.weight.data)
new_bias = torch.empty((child.weight.shape[1] // mp_size),
new_bias = torch.empty((weight_shape[0] // mp_size),
device=child.weight.device,
dtype=torch.half if fp16 else torch.float)
dtype=child.weight.dtype)
bias_data = None if child.bias is None else mp_replace.copy(
new_bias,
child.bias.data).to(torch.cuda.current_device())
return LinearLayer(data.to(torch.cuda.current_device()), bias_data)
child.bias.data).to(get_accelerator().current_device_name())
return LinearLayer(weight=data.to(
get_accelerator().current_device_name()),
bias=bias_data)
def _slice_embedding(child, name, conv_linear_layer):
mp_replace = ReplaceWithTensorSlicing(mp_group=mp_group)
......@@ -547,7 +454,9 @@ def replace_transformer_layer(orig_layer_impl,
child.weight.shape[1] // mp_size),
device=child.weight.device,
dtype=child.weight.dtype)
data = mp_replace.copy(new_weight, child.weight.data)
data = mp_replace.copy(new_weight,
child.weight.ds_tensor.data if hasattr(child.weight, 'ds_tensor') else \
child.weight.data)
new_embedding = nn.Embedding(child.weight.shape[0],
child.weight.shape[1] // mp_size)
new_embedding.weight.data.copy_(data)
......@@ -562,10 +471,14 @@ def replace_transformer_layer(orig_layer_impl,
child.num_heads = child.num_heads // mp_size
if hasattr(child, 'num_attention_heads'):
child.num_attention_heads = child.num_attention_heads // mp_size
if hasattr(child, 'num_attn_heads'):
child.num_attn_heads = child.num_attn_heads // mp_size
if hasattr(child, 'all_head_size'):
child.all_head_size = child.all_head_size // mp_size
if hasattr(child, 'embed_dim'):
child.embed_dim = child.embed_dim // mp_size
if hasattr(child, 'hidden_size'):
child.hidden_size = child.hidden_size // mp_size
conv_linear_layer = False
if linear_layer_setting is not None:
......@@ -600,32 +513,193 @@ def replace_transformer_layer(orig_layer_impl,
return _replace_module(module)
def replace_fn(child, _policy, layer_id=0):
training = False # todo: refactor this part to go in the config
if training:
# copy relevant state from child -> new module
new_module = replace_with_policy(child,
_policy,
triangular_masking,
preln=preln)
new_module = replace_with_policy(child, _policy, config.triangular_masking)
else:
# copy relevant state from child -> new module
if replace_with_kernel_inject:
if config.replace_with_kernel_inject:
new_module = replace_with_policy(child,
_policy,
triangular_masking,
config.triangular_masking,
inference=True,
preln=(_policy
is not HFBertLayerPolicy),
layer_id=layer_id)
else:
new_module = replace_wo_policy(child, _policy)
return new_module
return replace_module(model=model,
orig_class=orig_layer_impl,
replace_fn=replace_fn,
_replace_policy=policy)
replaced_module = replace_module(model=model,
orig_class=orig_layer_impl,
replace_fn=replace_fn,
_replace_policy=config.injection_policy_tuple)
quantizer = GroupQuantizer(q_int8=quantize)
world_size = dist.get_world_size() if dist.is_initialized() else 1
rank = dist.get_rank() if dist.is_initialized() else 0
if checkpoint_dict is not None:
assert container_g.ckpt_load_enabled, \
f"Meta Tensor checkpoint loading not supported in {container_g.__class__.__name__} container"
start_time = time.time()
checkpoint = checkpoint_dict['checkpoints']
ckpt_list = checkpoint["tp"] if type(checkpoint) is dict else checkpoint
ckpt_type = checkpoint_dict.get('parallelization', 'pp')
ckpt_mp_size = checkpoint_dict.get('tp_size', len(ckpt_list))
ckpt_mp_size = checkpoint_dict.get('mp_size', ckpt_mp_size)
base_dir1 = checkpoint_dict.get('base_dir', config.base_dir)
if ckpt_type == 'pp' and type(checkpoint) is list:
pbar = tqdm.tqdm(total=len(checkpoint),
desc=f"Loading {len(checkpoint)} checkpoint shards")
for i in range(len(checkpoint)):
sd = [
torch.load(os.path.join(base_dir1,
checkpoint[i]),
map_location='cpu')
]
load_model_with_checkpoint(replaced_module,
sd,
mp_replace,
ckpt_type,
ckpt_mp_size,
quantizer,
container=container_g)
pbar.update(1)
else:
import gc
num_checkpoints = len(ckpt_list) // ckpt_mp_size
tp_split_size = (world_size / ckpt_mp_size)
sd_offset = int(rank / tp_split_size)
sd_count = int((rank + max(1, tp_split_size)) / tp_split_size) - sd_offset
pbar = tqdm.tqdm(total=num_checkpoints,
desc=f"Loading {num_checkpoints} checkpoint shards")
for i in range(num_checkpoints):
pbar.update(1)
ckpt_index = i * ckpt_mp_size + sd_offset
ckpt_files = [
os.path.join(base_dir1,
ckpt_list[ckpt_index +
j]) if base_dir1 else ckpt_list[ckpt_index +
j]
for j in range(sd_count)
]
sds = [
torch.load(ckpt_file,
map_location='cpu') for ckpt_file in ckpt_files
]
load_model_with_checkpoint(replaced_module,
sds,
mp_replace,
ckpt_type,
ckpt_mp_size,
quantizer,
int(rank % tp_split_size),
container=container_g)
sds = [None for _ in sds]
gc.collect()
if "non_tp" in checkpoint:
pbar = tqdm.tqdm(
total=len(checkpoint["non_tp"]),
desc=f"Loading {len(checkpoint['non_tp'])} checkpoint shards")
for i in range(len(checkpoint["non_tp"])):
pbar.update(1)
ckpt_file = os.path.join(base_dir1,
checkpoint["non_tp"][i]
) if base_dir1 else checkpoint["non_tp"][i]
sds = [torch.load(ckpt_file, map_location='cpu')]
load_model_with_checkpoint(replaced_module,
sds,
mp_replace,
ckpt_type,
ckpt_mp_size,
quantizer,
int(rank % tp_split_size),
container=container_g)
sds = [None for _ in sds]
gc.collect()
print(f"checkpoint loading time at rank {rank}: {time.time()-start_time} sec")
if config.save_mp_checkpoint_path is not None:
from collections import OrderedDict
import json
num_partitions = 8
if checkpoint_dict is None:
ckpt_name = "ds_model"
try:
from transformers.models.bloom.modeling_bloom import BloomForCausalLM
if isinstance(model, BloomForCausalLM):
ckpt_name = "bloom"
except ImportError:
ckpt_name = "ds_model"
else:
ckpt_name = checkpoint_dict['type']
if dist.is_initialized():
dist.barrier()
transformer_name = get_transformer_name(replaced_module)
non_tp_ckpt_name = f'non-tp.pt'
ckpt_files = [non_tp_ckpt_name]
os.makedirs(config.save_mp_checkpoint_path, exist_ok=True)
if not dist.is_initialized() or dist.get_rank() == 0:
print("Saving tp-sharded checkpoints")
torch.save(
OrderedDict({
k: v
for k,
v in dict(replaced_module.state_dict()).items()
if transformer_name not in k
}),
f'{config.save_mp_checkpoint_path}/{non_tp_ckpt_name}')
ckpt_config = json.dumps({
'type':
ckpt_name,
'base_dir':
f'{config.save_mp_checkpoint_path}',
'checkpoints': {
"non_tp":
ckpt_files,
"tp": [
f'tp_{r:0>2d}_{m:0>2d}.pt' for m in range(num_partitions)
for r in range(world_size)
]
},
'version':
1.0,
'parallelization':
'tp',
'tp_size':
world_size,
'dtype':
'int8' if quantize else ('float16' if fp16 else 'float32')
})
with open(f"{config.save_mp_checkpoint_path}/ds_inference_config.json",
"w") as cfg:
cfg.write(ckpt_config)
rep_sd = replaced_module.state_dict()
for n, p in replaced_module.named_parameters():
if hasattr(p, 'scale'):
rep_sd[n] = [p, p.scale]
keys = list(rep_sd.keys())
partition_size = (len(keys) // num_partitions + 1)
for m in range(num_partitions):
torch.save(
OrderedDict({
k: [rep_sd[k],
rep_sd[k].scale] if hasattr(rep_sd[k],
'scale') else rep_sd[k]
for k in keys[m * partition_size:(m + 1) * partition_size]
if transformer_name in k
}),
f'{config.save_mp_checkpoint_path}/tp_{rank:0>2d}_{m:0>2d}.pt')
return replaced_module
def revert_transformer_layer(orig_layer_impl, model, config, preln=False):
......@@ -751,4 +825,6 @@ def _replace_module(model, policies, layer_id=0):
else:
_, layer_id = _replace_module(child, policies, layer_id=layer_id)
# Add the reset_cache func to the model, so that it can be called in the beginning of text-generation.
model.reset_cache = transformer_inference.DeepSpeedTransformerInference.reset_cache
return model, layer_id
from abc import ABC
import torch
from torch.nn.parameter import Parameter
class DSPolicy(ABC):
    """Base class for DeepSpeed layer-injection policies.

    A policy knows how to pull the attention/MLP/LayerNorm parameters out of
    one specific client transformer-layer implementation so DeepSpeed can
    copy them into its fused kernels. Subclasses override the accessor
    methods below for their particular model family.
    """

    def __init__(self,
                 inference=True,
                 linear_layer=True,
                 scale_attention=True,
                 megatron_v2=False):
        # True when the policy is used for inference-time kernel injection.
        self.inference = inference
        # True for nn.Linear-style weights; False for Conv1D-style layers
        # (see HFGPT2LayerPolicy, which passes linear_layer=False).
        self.linear_layer = linear_layer
        # Whether the kernel should scale the attention scores.
        self.scale_attention = scale_attention
        # Megatron-v2 checkpoints use a different qkv parameter layout.
        self.is_megatron_v2 = megatron_v2

    def attention(self):
        """Returns attention qkv and dense parameters
        weight: (3*hidden, hidden) and (hidden, hidden)
        bias: (3*hidden) and (hidden)
        """
        raise NotImplementedError

    def get_hidden_heads(self):
        """return hidden_size and number of heads"""
        raise NotImplementedError

    def mlp(self):
        """Returns mlp intermediate and output
        weight: (intermediate, hidden) and (hidden, intermediate)
        bias: (intermediate) and (hidden)
        """
        raise NotImplementedError

    def layerNorm(self):
        """Returns LayerNorms used in transformer layer
        Post-Attention and pre/post layer norm
        gamma and beta with shape: (hidden)
        """
        raise NotImplementedError
class HFBertLayerPolicy(DSPolicy):
    """Injection policy for HuggingFace BERT/RoBERTa encoder layers.

    Maps the parameters of ``BertLayer``/``RobertaLayer`` onto the tensors
    expected by the DeepSpeed inference kernels.
    """
    _orig_layer_class = None

    def __init__(self, client_module, inference=False, preln=False):
        super().__init__(inference)
        self.client_module = client_module
        # Pre-LN variants expose the MLP input projection as `dense_act`.
        self.preln = preln
        if HFBertLayerPolicy._orig_layer_class is None:
            try:
                import transformers
                HFBertLayerPolicy._orig_layer_class = [
                    transformers.models.bert.modeling_bert.BertLayer,
                    transformers.models.roberta.modeling_roberta.RobertaLayer
                ]
            except (ImportError, AttributeError):
                # Narrowed from a bare `except:`: only transformers being
                # missing or too old to expose these classes is expected here;
                # anything else should surface. Matches the ImportError style
                # used by the Megatron/GPT-NeoX policies in this file.
                HFBertLayerPolicy._orig_layer_class = None

    def get_hidden_heads(self):
        """Return (hidden_size, num_attention_heads) of the client layer."""
        return self.client_module.attention.self.query.weight.shape[1], \
                self.client_module.attention.self.num_attention_heads

    def attention(self):
        """Fuse q/k/v into single qkv tensors and return the attention
        parameters: (is_linear, qkvw, qkvb, dense_w, dense_b, scale, mv2)."""
        qw = self.client_module.attention.self.query.weight
        qb = self.client_module.attention.self.query.bias
        kw = self.client_module.attention.self.key.weight
        kb = self.client_module.attention.self.key.bias
        vw = self.client_module.attention.self.value.weight
        vb = self.client_module.attention.self.value.bias

        # Concatenate along the output dimension: (3*hidden, hidden)/(3*hidden,)
        qkvw = Parameter(torch.cat((qw, kw, vw), dim=0), requires_grad=False)
        qkvb = Parameter(torch.cat((qb, kb, vb), dim=0), requires_grad=False)

        return self.linear_layer, \
               qkvw, \
               qkvb, \
               self.client_module.attention.output.dense.weight, \
               self.client_module.attention.output.dense.bias, \
               self.scale_attention, \
               self.is_megatron_v2

    def mlp(self):
        """Return (is_linear, inter_w, inter_b, output_w, output_b)."""
        if self.preln:
            intermediate_ff = self.client_module.intermediate.dense_act
        else:
            intermediate_ff = self.client_module.intermediate.dense

        return self.linear_layer, intermediate_ff.weight, intermediate_ff.bias, \
            self.client_module.output.dense.weight, \
            self.client_module.output.dense.bias

    def layerNorm(self):
        """Return (attn_nw, attn_nb, input_nw, input_nb) LayerNorm params;
        pre-LN and post-LN variants store them under different attributes."""
        if self.preln:
            attention_layernorm = self.client_module.PostAttentionLayerNorm
            transformer_layernorm = self.client_module.PreAttentionLayerNorm
        else:
            attention_layernorm = self.client_module.attention.output.LayerNorm
            transformer_layernorm = self.client_module.output.LayerNorm
        return attention_layernorm.weight, \
               attention_layernorm.bias, \
               transformer_layernorm.weight, \
               transformer_layernorm.bias
class HFGPTNEOLayerPolicy(DSPolicy):
    """Injection policy for HuggingFace GPT-Neo transformer blocks."""
    _orig_layer_class = None

    def __init__(self, client_module, inference=True):
        # GPT-Neo does not scale attention scores.
        super().__init__(inference, scale_attention=False)
        self.client_module = client_module
        # Resolve the original layer class once per process (the previous
        # code re-imported on every instantiation, unlike the other policies).
        if HFGPTNEOLayerPolicy._orig_layer_class is None:
            try:
                import transformers
                HFGPTNEOLayerPolicy._orig_layer_class = \
                    transformers.models.gpt_neo.modeling_gpt_neo.GPTNeoBlock
            except (ImportError, AttributeError):
                # Narrowed from a bare `except:`: only a missing/old
                # transformers install is expected here.
                HFGPTNEOLayerPolicy._orig_layer_class = None

    def get_hidden_heads(self):
        """Return (hidden_size, num_heads) of the client layer."""
        return self.client_module.attn.attention.q_proj.weight.shape[1], \
                self.client_module.attn.attention.num_heads

    def attention(self):
        """Fuse q/k/v projection weights; GPT-Neo q/k/v have no bias, so the
        qkv bias slot is None."""
        qw = self.client_module.attn.attention.q_proj.weight
        kw = self.client_module.attn.attention.k_proj.weight
        vw = self.client_module.attn.attention.v_proj.weight

        qkvw = Parameter(torch.cat((qw, kw, vw), dim=0), requires_grad=False)

        return self.linear_layer, \
                qkvw, \
                None, \
                self.client_module.attn.attention.out_proj.weight, \
                self.client_module.attn.attention.out_proj.bias, \
                self.scale_attention, \
                self.is_megatron_v2

    def mlp(self):
        """Return (is_linear, inter_w, inter_b, output_w, output_b)."""
        return self.linear_layer, \
                self.client_module.mlp.c_fc.weight, \
                self.client_module.mlp.c_fc.bias, \
                self.client_module.mlp.c_proj.weight, \
                self.client_module.mlp.c_proj.bias

    def layerNorm(self):
        """Return (attn_nw, attn_nb, input_nw, input_nb): ln_2 precedes the
        MLP, ln_1 precedes attention."""
        return self.client_module.ln_2.weight, \
               self.client_module.ln_2.bias, \
               self.client_module.ln_1.weight, \
               self.client_module.ln_1.bias
class HFGPTJLayerPolicy(DSPolicy):
    """Injection policy for HuggingFace GPT-J transformer blocks."""
    _orig_layer_class = None

    def __init__(self, client_module, inference=True):
        super().__init__(inference, scale_attention=True)
        self.client_module = client_module
        # Resolve the original layer class once per process (the previous
        # code re-imported on every instantiation, unlike the other policies).
        if HFGPTJLayerPolicy._orig_layer_class is None:
            try:
                import transformers
                HFGPTJLayerPolicy._orig_layer_class = \
                    transformers.models.gptj.modeling_gptj.GPTJBlock
            except (ImportError, AttributeError):
                # Narrowed from a bare `except:`: only a missing/old
                # transformers install is expected here.
                HFGPTJLayerPolicy._orig_layer_class = None

    def get_hidden_heads(self):
        """Return (hidden_size, num_attention_heads) of the client layer."""
        return self.client_module.attn.q_proj.weight.shape[1], \
                self.client_module.attn.num_attention_heads

    def attention(self):
        """Fuse q/k/v projection weights; GPT-J q/k/v and out_proj carry no
        biases, so both bias slots are None."""
        qw = self.client_module.attn.q_proj.weight
        kw = self.client_module.attn.k_proj.weight
        vw = self.client_module.attn.v_proj.weight

        qkvw = Parameter(torch.cat((qw, kw, vw), dim=0), requires_grad=False)

        return self.linear_layer, \
                qkvw, \
                None, \
                self.client_module.attn.out_proj.weight, \
                None, \
                self.scale_attention, \
                self.is_megatron_v2

    def mlp(self):
        """Return (is_linear, inter_w, inter_b, output_w, output_b)."""
        return self.linear_layer, \
                self.client_module.mlp.fc_in.weight, \
                self.client_module.mlp.fc_in.bias, \
                self.client_module.mlp.fc_out.weight, \
                self.client_module.mlp.fc_out.bias

    def layerNorm(self):
        """Return (attn_nw, attn_nb, input_nw, input_nb). GPT-J exposes only
        the single ln_1, so the post-attention norm slots are None."""
        return None, \
               None, \
               self.client_module.ln_1.weight, \
               self.client_module.ln_1.bias
class MegatronLayerPolicy(DSPolicy):
    """Injection policy for Megatron-LM ``ParallelTransformerLayer``."""
    _orig_layer_class = None
    # version 0 -> old megatron-lm (attention under `attention`);
    # newer megatron-lm exposes it as `self_attention` instead.
    version = 0
    moe_type = 'standard'

    def __init__(self, client_module, inference=True):
        super().__init__(inference)
        self.client_module = client_module
        # we use megatron version to differentiate between the old and new
        # megatron-lm source code
        if MegatronLayerPolicy._orig_layer_class is None:
            try:
                import megatron
                from megatron.model.transformer import ParallelTransformerLayer
                MegatronLayerPolicy._orig_layer_class = ParallelTransformerLayer
            except ImportError:
                MegatronLayerPolicy._orig_layer_class = None

    def _attention_module(self):
        # Select the attention submodule for the detected megatron version.
        if MegatronLayerPolicy.version == 0:
            return self.client_module.attention
        return self.client_module.self_attention

    def get_hidden_heads(self):
        """Return (hidden_size, num_attention_heads) of the client layer.

        BUGFIX: previously read `self.client_module.attention`
        unconditionally, which breaks on new megatron (version != 0).
        """
        attention = self._attention_module()
        return attention.query_key_value.weight.shape[1], \
                attention.num_attention_heads

    def attention(self):
        """Return (is_linear, qkvw, qkvb, dense_w, dense_b, scale, mv2).

        BUGFIX: the attention submodule was previously only selected inside
        an `if self.inference:` branch, so the training path hit an
        UnboundLocalError at the return statement.
        """
        attention = self._attention_module()
        return self.linear_layer, \
                attention.query_key_value.weight, \
                attention.query_key_value.bias, \
                attention.dense.weight, \
                attention.dense.bias, \
                self.scale_attention, \
                self.is_megatron_v2

    def mlp(self, moe_type='standard'):
        """Return the MLP parameters; for MoE layers, per-expert lists are
        returned (plus the shared/residual MLP and gate for non-standard MoE).
        """
        from deepspeed.moe.utils import has_moe_layers
        moe, _ = has_moe_layers(self.client_module)
        if moe:
            moe_experts = self.client_module.mlp.deepspeed_moe.experts.deepspeed_experts if moe_type == 'standard' else \
                            self.client_module.mlp.moe.deepspeed_moe.experts.deepspeed_experts
            num_experts = len(moe_experts)
            if moe_type == 'standard':
                return self.linear_layer, \
                    [moe_experts[i].dense_h_to_4h.weight for i in range(num_experts)], \
                    [moe_experts[i].dense_h_to_4h.bias for i in range(num_experts)], \
                    [moe_experts[i].dense_4h_to_h.weight for i in range(num_experts)], \
                    [moe_experts[i].dense_4h_to_h.bias for i in range(num_experts)]
            else:
                # Residual-MoE: also return the shared MLP and the gating
                # coefficient used to mix expert and shared outputs.
                return self.linear_layer, \
                    [moe_experts[i].dense_h_to_4h.weight for i in range(num_experts)], \
                    [moe_experts[i].dense_h_to_4h.bias for i in range(num_experts)], \
                    [moe_experts[i].dense_4h_to_h.weight for i in range(num_experts)], \
                    [moe_experts[i].dense_4h_to_h.bias for i in range(num_experts)], \
                    self.client_module.mlp.mlp.dense_h_to_4h.weight, \
                    self.client_module.mlp.mlp.dense_h_to_4h.bias, \
                    self.client_module.mlp.mlp.dense_4h_to_h.weight, \
                    self.client_module.mlp.mlp.dense_4h_to_h.bias, \
                    self.client_module.mlp.coefficient.weight
        else:
            return self.linear_layer, \
                self.client_module.mlp.dense_h_to_4h.weight, \
                self.client_module.mlp.dense_h_to_4h.bias, \
                self.client_module.mlp.dense_4h_to_h.weight, \
                self.client_module.mlp.dense_4h_to_h.bias

    def layerNorm(self):
        """Return (attn_nw, attn_nb, input_nw, input_nb)."""
        return self.client_module.post_attention_layernorm.weight, \
               self.client_module.post_attention_layernorm.bias, \
               self.client_module.input_layernorm.weight, \
               self.client_module.input_layernorm.bias
class HFGPT2LayerPolicy(DSPolicy):
    """Injection policy for HuggingFace GPT-2 transformer blocks."""
    _orig_layer_class = None

    def __init__(self, client_module, inference=True):
        # HuggingFace GPT2 uses convolutional layer instead of linear layer
        super().__init__(inference, linear_layer=False)
        self.client_module = client_module
        # Resolve the original layer class once per process (the previous
        # code re-imported on every instantiation, unlike the other policies).
        if HFGPT2LayerPolicy._orig_layer_class is None:
            try:
                import transformers
                HFGPT2LayerPolicy._orig_layer_class = \
                    transformers.models.gpt2.modeling_gpt2.GPT2Block
            except (ImportError, AttributeError):
                # Narrowed from a bare `except:`: only a missing/old
                # transformers install is expected here.
                HFGPT2LayerPolicy._orig_layer_class = None

    def get_hidden_heads(self):
        """Return (hidden_size, num_heads) of the client layer."""
        return self.client_module.attn.embed_dim, \
                self.client_module.attn.num_heads

    def attention(self):
        """Return (is_linear, qkvw, qkvb, dense_w, dense_b, scale, mv2);
        GPT-2's c_attn already holds the fused qkv projection."""
        return self.linear_layer, \
                self.client_module.attn.c_attn.weight, \
                self.client_module.attn.c_attn.bias, \
                self.client_module.attn.c_proj.weight, \
                self.client_module.attn.c_proj.bias, \
                self.scale_attention, \
                self.is_megatron_v2

    def mlp(self):
        """Return (is_linear, inter_w, inter_b, output_w, output_b)."""
        return self.linear_layer, \
                self.client_module.mlp.c_fc.weight, \
                self.client_module.mlp.c_fc.bias, \
                self.client_module.mlp.c_proj.weight, \
                self.client_module.mlp.c_proj.bias

    def layerNorm(self):
        """Return (attn_nw, attn_nb, input_nw, input_nb): ln_2 precedes the
        MLP, ln_1 precedes attention."""
        return self.client_module.ln_2.weight, \
               self.client_module.ln_2.bias, \
               self.client_module.ln_1.weight, \
               self.client_module.ln_1.bias
class GPTNEOXLayerPolicy(DSPolicy):
    """Injection policy for GPT-NeoX (megatron-based) transformer layers."""
    _orig_layer_class = None
    # version 0 -> attention submodule under `attention`;
    # otherwise it is exposed as `self_attention`.
    version = 0

    def __init__(self, client_module, inference=True, megatron_v2=True):
        super().__init__(inference, megatron_v2=megatron_v2)
        self.client_module = client_module
        if GPTNEOXLayerPolicy._orig_layer_class is None:
            try:
                import megatron
                from megatron.model.transformer import ParallelTransformerLayerPipe
                GPTNEOXLayerPolicy._orig_layer_class = ParallelTransformerLayerPipe
            except ImportError:
                GPTNEOXLayerPolicy._orig_layer_class = None

    def _attention_module(self):
        # Select the attention submodule for the detected megatron version.
        if GPTNEOXLayerPolicy.version == 0:
            return self.client_module.attention
        return self.client_module.self_attention

    def get_hidden_heads(self):
        """Return (hidden_size, num_attention_heads) of the client layer.

        BUGFIX: the original computed the version-aware attention module and
        then ignored it, always reading `self.client_module.attention`, which
        breaks when version != 0.
        """
        attention = self._attention_module()
        return attention.query_key_value.weight.shape[1], \
                attention.num_attention_heads

    def attention(self):
        """Return (is_linear, qkvw, qkvb, dense_w, dense_b, scale, mv2)."""
        attention = self._attention_module()
        return self.linear_layer, \
                attention.query_key_value.weight, \
                attention.query_key_value.bias, \
                attention.dense.weight, \
                attention.dense.bias, \
                self.scale_attention, \
                self.is_megatron_v2

    def mlp(self):
        """Return (is_linear, inter_w, inter_b, output_w, output_b)."""
        return self.linear_layer, \
                self.client_module.mlp.dense_h_to_4h.weight, \
                self.client_module.mlp.dense_h_to_4h.bias, \
                self.client_module.mlp.dense_4h_to_h.weight, \
                self.client_module.mlp.dense_4h_to_h.bias

    def layerNorm(self):
        """Return (attn_nw, attn_nb, input_nw, input_nb)."""
        return self.client_module.post_attention_layernorm.weight, \
               self.client_module.post_attention_layernorm.bias, \
               self.client_module.input_layernorm.weight, \
               self.client_module.input_layernorm.bias
'''
Copyright 2020 The Microsoft DeepSpeed Team
'''
from .containers import HFGPT2LayerPolicy
from .containers import HFBertLayerPolicy
from .containers import BLOOMLayerPolicy
from .containers import HFGPTJLayerPolicy
from .containers import HFGPTNEOLayerPolicy
from .containers import GPTNEOXLayerPolicy
from .containers import HFOPTLayerPolicy
from .containers import MegatronLayerPolicy
from .containers import HFDistilBertLayerPolicy
from .containers import HFCLIPLayerPolicy
from .containers import UNetPolicy
from .containers import VAEPolicy
# transformer-based policies
replace_policies = [
HFBertLayerPolicy,
HFGPTNEOLayerPolicy,
......@@ -376,4 +22,11 @@ replace_policies = [
HFGPTJLayerPolicy,
MegatronLayerPolicy,
HFGPT2LayerPolicy,
BLOOMLayerPolicy,
HFOPTLayerPolicy,
HFCLIPLayerPolicy,
HFDistilBertLayerPolicy
]
# non-transformer-based policies
generic_policies = [UNetPolicy, VAEPolicy]
'''Copyright The Microsoft DeepSpeed Team'''
from deepspeed.utils import log_dist
# helper function to map between DS policies and DS containers
def policy_to_ds_container(**kwargs):
    """Instantiate the DS container matching the policy in ``kwargs``.

    Expects a non-None ``policy`` keyword argument; all kwargs are forwarded
    to the container constructor. Returns the container instance, or None
    (after logging on rank 0) when the policy type has no registered
    container.
    """
    from .containers import HFGPT2LayerPolicy, DS_GPT2Container
    from .containers import HFBertLayerPolicy, DS_BERTContainer
    from .containers import BLOOMLayerPolicy, DS_BloomContainer
    from .containers import HFGPTJLayerPolicy, DS_GPTJContainer
    from .containers import HFGPTNEOLayerPolicy, DS_GPTNEOContainer
    from .containers import GPTNEOXLayerPolicy, DS_GPTNEOXContainer
    from .containers import HFOPTLayerPolicy, DS_OPTContainer
    from .containers import MegatronLayerPolicy, DS_MegatronGPTContainer
    from .containers import HFDistilBertLayerPolicy, DS_DistilBERTContainer

    # Registry of every supported policy class and its container class.
    policy_to_container = {
        HFGPT2LayerPolicy: DS_GPT2Container,
        HFBertLayerPolicy: DS_BERTContainer,
        BLOOMLayerPolicy: DS_BloomContainer,
        HFGPTJLayerPolicy: DS_GPTJContainer,
        HFGPTNEOLayerPolicy: DS_GPTNEOContainer,
        GPTNEOXLayerPolicy: DS_GPTNEOXContainer,
        HFOPTLayerPolicy: DS_OPTContainer,
        MegatronLayerPolicy: DS_MegatronGPTContainer,
        HFDistilBertLayerPolicy: DS_DistilBERTContainer,
    }

    policy = kwargs['policy']
    assert policy is not None, "Policy cannot be None"
    policy_type = type(policy)

    container_cls = policy_to_container.get(policy_type)
    if container_cls is None:
        log_dist(f"Policy type {policy_type} not supported", [0])
        return None
    return container_cls(**kwargs)
'''Copyright The Microsoft DeepSpeed Team'''
......@@ -2,20 +2,35 @@
Copyright 2020 The Microsoft DeepSpeed Team
'''
import torch.nn.init as init
import torch
import torch.distributed as dist
from deepspeed.utils import logger, log_dist
from deepspeed.utils import log_dist
import deepspeed.utils.groups as groups
from deepspeed.utils import groups
from .sharded_moe import MOELayer, TopKGate
from .experts import Experts
import copy
import typing
class MoE(torch.nn.Module):
"""Initialize an MoE layer.
Arguments:
hidden_size (int): the hidden dimension of the model, importantly this is also the input and output dimension.
expert (torch.nn.Module): the torch module that defines the expert (e.g., MLP, torch.linear).
num_experts (int, optional): default=1, the total number of experts per layer.
ep_size (int, optional): default=1, number of ranks in the expert parallel world or group.
k (int, optional): default=1, top-k gating value, only supports k=1 or k=2.
capacity_factor (float, optional): default=1.0, the capacity of the expert at training time.
eval_capacity_factor (float, optional): default=1.0, the capacity of the expert at eval time.
min_capacity (int, optional): default=4, the minimum capacity per expert regardless of the capacity_factor.
use_residual (bool, optional): default=False, make this MoE layer a Residual MoE (https://arxiv.org/abs/2201.05596) layer.
noisy_gate_policy (str, optional): default=None, noisy gate policy, valid options are 'Jitter', 'RSample' or 'None'.
drop_tokens (bool, optional): default=True, whether to drop tokens - (setting to False is equivalent to infinite capacity).
use_rts (bool, optional): default=True, whether to use Random Token Selection.
use_tutel (bool, optional): default=False, whether to use Tutel optimizations (if installed).
enable_expert_tensor_parallelism (bool, optional): default=False, whether to use tensor parallelism for experts
"""
def __init__(self,
hidden_size,
expert,
......@@ -29,37 +44,21 @@ class MoE(torch.nn.Module):
noisy_gate_policy: typing.Optional[str] = None,
drop_tokens: bool = True,
use_rts=True,
use_tutel: bool = False):
"""Initialize an MoE layer.
Arguments:
hidden_size (int): the hidden dimension of the model, importantly this is also the input and output dimension.
expert (torch.nn.Module): the torch module that defines the expert (e.g., MLP, torch.linear).
num_experts (int, optional): default=1, the total number of experts per layer.
ep_size (int, optional): default=1, number of ranks in the expert parallel world or group.
k (int, optional): default=1, top-k gating value, only supports k=1 or k=2.
capacity_factor (float, optional): default=1.0, the capacity of the expert at training time.
eval_capacity_factor (float, optional): default=1.0, the capacity of the expert at eval time.
min_capacity (int, optional): default=4, the minimum capacity per expert regardless of the capacity_factor.
use_residual (bool, optional): default=False, make this MoE layer a Residual MoE (https://arxiv.org/abs/2201.05596) layer.
noisy_gate_policy (str, optional): default=None, noisy gate policy, valid options are 'Jitter', 'RSample' or 'None'.
drop_tokens (bool, optional): default=True, whether to drop tokens - (setting to False is equivalent to infinite capacity).
use_rts (bool, optional): default=True, whether to use Random Token Selection.
use_tutel (bool, optional): default=False, whether to use Tutel optimizations (if installed).
"""
use_tutel: bool = False,
enable_expert_tensor_parallelism: bool = False):
super(MoE, self).__init__()
self.use_residual = use_residual
self.ep_size = min(
ep_size,
num_experts) # the ep size should be less than the number of experts
self.enable_expert_tensor_parallelism = enable_expert_tensor_parallelism
assert num_experts % ep_size == 0, f"Number of experts ({num_experts}) should be divisible by expert parallel size ({ep_size})"
self.ep_size = ep_size
self.expert_group_name = f"ep_size_{self.ep_size}"
self.num_experts = num_experts
self.num_local_experts = 1 if num_experts < ep_size else num_experts // ep_size
self.num_local_experts = num_experts // self.ep_size
log_dist(
f'Creating MoE layer with num_experts: {num_experts} | num_local_experts: {self.num_local_experts} | expert_parallel_size: {ep_size}',
f'Creating MoE layer with num_experts: {num_experts} | num_local_experts: {self.num_local_experts} | expert_parallel_size: {self.ep_size}',
[0])
assert noisy_gate_policy is None or noisy_gate_policy in ['None', 'Jitter', 'RSample'], \
......@@ -94,9 +93,12 @@ class MoE(torch.nn.Module):
print(
f"No existing process group found, creating a new group named: {self.expert_group_name}"
)
if groups.mpu is None:
if (groups.mpu is None) or (not self.enable_expert_tensor_parallelism):
# Condition 1 - no groups.mpu means no tensor parallelism
# Condition 2 - disabling expert tensor parallelism on purpose
groups._create_expert_and_data_parallel(self.ep_size)
else:
# expert tensor parallelism is enabled
groups._create_expert_data_and_model_parallel(self.ep_size,
mpu=groups.mpu)
# Set the group handle for the MOELayer (deepspeed_moe) object
......@@ -126,6 +128,6 @@ class MoE(torch.nn.Module):
if type(output_mlp) is tuple:
output_mlp = output_mlp[0] # Ignore the bias term for now
coef = self.coefficient(hidden_states)
coef = torch.nn.functional.softmax(coef, dim=1)
coef = torch.nn.functional.softmax(coef, dim=-1)
output = output * coef[..., 0:1] + output_mlp * coef[..., 1:]
return output, self.deepspeed_moe.l_aux, self.deepspeed_moe.exp_counts
'''
Copyright 2022 The Microsoft DeepSpeed Team
'''
# The file has been adapted from the following Megatron-LM file:
# https://github.com/NVIDIA/Megatron-LM/blob/main/megatron/mpu/mappings.py
# Git commit hash: 9dc3c42a84aa656f583703cf8b6b4f79f712b796
# We retain the following copyright from the original files:
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import torch
import deepspeed
def _gather_tokens(input_, dim=0):
    """All-gather ``input_`` from every tensor-parallel rank and concatenate
    the gathered shards along dimension ``dim``."""
    mpu = deepspeed.utils.groups.mpu

    input_ = input_.contiguous()
    world_size = mpu.get_tensor_model_parallel_world_size()
    my_rank = mpu.get_tensor_model_parallel_rank()

    # One receive buffer per rank; our own slot simply aliases the input.
    shards = [torch.empty_like(input_) for _ in range(world_size)]
    shards[my_rank] = input_
    deepspeed.comm.all_gather(shards,
                              input_,
                              group=mpu.get_tensor_model_parallel_group())

    # torch.cat already produces a contiguous tensor.
    return torch.cat(shards, dim=dim).contiguous()
def _drop_tokens(input_, dim=0):
    """Return this rank's equal-size slice of ``input_`` along ``dim``."""
    mpu = deepspeed.utils.groups.mpu
    num_parts = mpu.get_tensor_model_parallel_world_size()
    part_idx = mpu.get_tensor_model_parallel_rank()
    dim_size = input_.shape[dim]
    assert dim_size % num_parts == 0, f"input dimension {dim} ({dim_size}) is not divisible by tensor parallel world size ({num_parts})"
    part_len = dim_size // num_parts
    # Zero-copy view of our chunk.
    return torch.narrow(input_, dim, part_idx * part_len, part_len)
class _GatherTokens(torch.autograd.Function):
    """All gather tokens among the tensor parallel ranks"""

    @staticmethod
    def symbolic(graph, input_, dim):
        # ONNX-export tracing path: represent the op as a plain gather.
        return _gather_tokens(input_, dim)

    @staticmethod
    def forward(ctx, input_, dim):
        # Remember the concat dimension so backward can slice along it.
        ctx.dim = dim
        return _gather_tokens(input_, dim)

    @staticmethod
    def backward(ctx, grad_output):
        # The gradient of an all-gather is this rank's slice of the incoming
        # grad; `dim` is a non-tensor input, so its gradient slot is None.
        return _drop_tokens(grad_output, ctx.dim), None
class _DropTokens(torch.autograd.Function):
    "Divide tokens equally among the tensor parallel ranks"

    @staticmethod
    def symbolic(graph, input_, dim):
        # ONNX-export tracing path: represent the op as a plain slice.
        return _drop_tokens(input_, dim)

    @staticmethod
    def forward(ctx, input_, dim):
        # Remember the slicing dimension so backward can gather along it.
        ctx.dim = dim
        return _drop_tokens(input_, dim)

    @staticmethod
    def backward(ctx, input_):
        # The gradient of taking a slice is all-gathering the incoming grads;
        # `dim` is a non-tensor input, so its gradient slot is None.
        return _gather_tokens(input_, ctx.dim), None
def gather_tokens(input_, dim=0):
    """Autograd-aware all-gather of ``input_`` across tensor-parallel ranks;
    identity when no tensor parallelism is active."""
    mpu = deepspeed.utils.groups.mpu
    if mpu is not None and mpu.get_tensor_model_parallel_world_size() != 1:
        return _GatherTokens.apply(input_, dim)
    # no tensor parallelism for non-experts
    return input_
def drop_tokens(input_, dim=0):
    """Autograd-aware split of ``input_`` across tensor-parallel ranks;
    identity when no tensor parallelism is active."""
    mpu = deepspeed.utils.groups.mpu
    if mpu is not None and mpu.get_tensor_model_parallel_world_size() != 1:
        return _DropTokens.apply(input_, dim)
    # no tensor parallelism for non-experts
    return input_
......@@ -12,17 +12,16 @@ Copyright 2021 The Microsoft DeepSpeed Team
# This source code is licensed under the BSD license found in the
# LICENSE file in the root directory of this source tree.
from deepspeed.utils.timer import ThroughputTimer, SynchronizedWallClockTimer
from deepspeed.utils import logger, log_dist
from typing import Callable, Dict, TYPE_CHECKING, Any, Optional, Tuple, Union, cast
from deepspeed.utils.timer import SynchronizedWallClockTimer
from deepspeed.utils import logger
from typing import Callable, Dict, TYPE_CHECKING, Any, Optional, Tuple
import time
from time import perf_counter
import torch
from torch import Tensor
import torch.distributed as dist
from torch.nn import Module, ModuleList
from torch.nn import Module
import torch.nn.functional as F
from deepspeed.utils import groups
from .mappings import drop_tokens, gather_tokens
if TYPE_CHECKING:
Base = Module[Tensor]
......@@ -80,12 +79,20 @@ def gumbel_rsample(shape: Tuple, device: torch.device) -> Tensor:
return gumbel(shape)
from deepspeed import comm as dist
# einsum dimensions: (g)roup, (s)equence, (e)xpert, (m)odel, (c)apacity
# See https://arxiv.org/pdf/2006.16668.pdf for details.
# Based on https://github.com/pytorch/pytorch/pull/40762
class _AllToAll(torch.autograd.Function):
@staticmethod
def forward(ctx: Any,
group: dist.ProcessGroup,
input: Tensor) -> Tensor: # type: ignore
def forward(
ctx: Any,
# TODO: replace with DS process group
group: torch.distributed.ProcessGroup,
input: Tensor) -> Tensor: # type: ignore
ctx.group = group
input = input.contiguous()
output = torch.empty_like(input)
......@@ -206,7 +213,7 @@ def top1gating(logits: Tensor,
# if we don't want to drop any tokens
if not drop_tokens:
new_capacity = torch.max(exp_counts).to(logits.device)
dist.all_reduce(new_capacity, op=dist.ReduceOp.MAX, group=dist.group.WORLD)
dist.all_reduce(new_capacity, op=dist.ReduceOp.MAX, group=dist.get_world_group())
capacity = new_capacity
# Compute l_aux
......@@ -424,7 +431,7 @@ class TopKGate(Module):
if self.wall_clock_breakdown:
self.timers('TopKGate').stop()
self.gate_time = self.timers('TopKGate').elapsed(reset=False) * 1000
self.gate_time = self.timers('TopKGate').elapsed(reset=False)
return gate_output
......@@ -466,13 +473,17 @@ class MOELayer(Base):
self.timers = SynchronizedWallClockTimer()
self.wall_clock_breakdown = False
self.use_tutel = use_tutel and TUTEL_INSTALLED
self.use_tutel = use_tutel and TUTEL_INSTALLED and gate.k == 1
if self.use_tutel:
logger.info('Using Tutel optimizations.')
elif use_tutel and not TUTEL_INSTALLED:
logger.warning("Tutel optimization requested but not installed. "
"Proceeding without Tutel.")
elif use_tutel and TUTEL_INSTALLED and gate.k != 1:
logger.warning(
"To enable Tutel optimization, use top-1 instead of top-2 gate. "
"Proceeding without Tutel.")
def _set_ep_group(self, ep_group):
self.ep_group = ep_group
......@@ -511,11 +522,20 @@ class MOELayer(Base):
if self.wall_clock_breakdown:
self.timers('falltoall').start()
if groups._get_expert_model_parallel_world_size() == 1:
# If the non-expert is tensor-parallel, it will create
# duplicate tokens on the tensor-parallel ranks.
# Since our experts are not tensor-parallel, these duplicates
# need to be dropped to ensure correctness.
# this also doubles up as a communication optimization as we are
# reducing the all-to-all communication volume.
dispatched_input = drop_tokens(dispatched_input, dim=1)
dispatched_input = _AllToAll.apply(self.ep_group, dispatched_input)
if self.wall_clock_breakdown:
self.timers('falltoall').stop()
self.time_falltoall = self.timers('falltoall').elapsed(reset=False) * 1000
self.time_falltoall = self.timers('falltoall').elapsed(reset=False)
# Re-shape after all-to-all: ecm -> gecm
dispatched_input = dispatched_input.reshape(self.ep_size,
......@@ -532,13 +552,19 @@ class MOELayer(Base):
if self.wall_clock_breakdown:
self.timers('salltoall').stop()
self.time_salltoall = self.timers('salltoall').elapsed(reset=False) * 1000
self.time_salltoall = self.timers('salltoall').elapsed(reset=False)
# Re-shape back: gecm -> ecm
expert_output = expert_output.reshape(self.ep_size * self.num_local_experts,
-1,
d_model)
if groups._get_expert_model_parallel_world_size() == 1:
# the dropped duplicate tokens need to be gathered on each
# tensor parallel rank again for the tensor-parallel
# non-expert of the next layer.
expert_output = gather_tokens(expert_output, dim=1)
if self.use_tutel:
combined_output = self._tutel_dispatcher.decode(expert_output.view(E * C, M))
else:
......@@ -550,6 +576,6 @@ class MOELayer(Base):
if self.wall_clock_breakdown:
self.timers('moe').stop()
self.time_moe = self.timers('moe').elapsed(reset=False) * 1000
self.time_moe = self.timers('moe').elapsed(reset=False)
return a
'''Copyright The Microsoft DeepSpeed Team'''
from typing import List, Tuple, Dict
import torch
import deepspeed.utils.groups as groups
from .layer import MoE
def has_moe_layers(m):
has_moe = False
num_experts = 0
for _, module in m.named_modules():
if isinstance(module, MoE):
has_moe = True
......@@ -59,8 +61,9 @@ def split_params_grads_into_shared_and_expert_params(
return shared_grads, expert_grads
def split_params_into_different_moe_groups_for_optimizer(
param_groups: Tuple[Dict]) -> Tuple[Dict]:
def split_params_into_different_moe_groups_for_optimizer(param_groups: Tuple[Dict],
max_group_size=178956971
) -> Tuple[Dict]:
"""Split parameters into different MoE groups for optimizer
Args:
......@@ -112,8 +115,32 @@ def split_params_into_different_moe_groups_for_optimizer(
param_group['params'] = new_params
# Flatten the moe groups
for k, v in group_moe.items():
for k1, v1 in v.items():
param_groups.append(v1)
if max_group_size is not None:
for k, v in group_moe.items():
for k1, v1 in v.items():
cur_group = []
all_groups = []
size_of_cur_group = 0
for param in v1['params']:
if size_of_cur_group + param.numel() <= max_group_size:
cur_group.append(param)
size_of_cur_group += param.numel()
else:
all_groups.append(cur_group)
cur_group = [param]
size_of_cur_group = param.numel()
if cur_group:
all_groups.append(cur_group)
for group in all_groups:
new_dict = {}
for key, val in v1.items():
if key != 'params':
new_dict[key] = val
new_dict['params'] = group
param_groups.append(new_dict)
else:
for k, v in group_moe.items():
for k1, v1 in v.items():
param_groups.append(v1)
return tuple(param_groups)
'''Copyright The Microsoft DeepSpeed Team'''
'''Copyright The Microsoft DeepSpeed Team'''
"""
Copyright (c) Microsoft Corporation
Licensed under the MIT license.
"""
from pydantic import root_validator
from deepspeed.runtime.config_utils import DeepSpeedConfigModel
def get_monitor_config(param_dict):
    """Build a DeepSpeedMonitorConfig from the monitor sections of `param_dict`.

    Missing sections default to empty dicts, so each sub-config falls
    back to its own defaults (disabled).
    """
    monitor_sections = ("tensorboard", "wandb", "csv_monitor")
    monitor_dict = {name: param_dict.get(name, {}) for name in monitor_sections}
    return DeepSpeedMonitorConfig(**monitor_dict)
class TensorBoardConfig(DeepSpeedConfigModel):
    """Sets parameters for TensorBoard monitor."""

    # Populated from the "tensorboard" section of the DeepSpeed config
    # (see get_monitor_config) and consumed by TensorBoardMonitor.
    enabled: bool = False
    """ Whether logging to Tensorboard is enabled. Requires `tensorboard` package is installed. """

    output_path: str = ""
    """
    Path to where the Tensorboard logs will be written. If not provided, the
    output path is set under the training script’s launching path.
    """

    job_name: str = "DeepSpeedJobName"
    """ Name for the current job. This will become a new directory inside `output_path`. """
class WandbConfig(DeepSpeedConfigModel):
    """Sets parameters for WandB monitor."""

    # Populated from the "wandb" section of the DeepSpeed config and
    # consumed by WandbMonitor (group/team/project feed wandb.init).
    # NOTE(review): `group` and `team` default to None despite the plain
    # `str` annotation; this relies on pydantic v1 implicitly treating a
    # None default as Optional — consider `Optional[str]` explicitly.
    enabled: bool = False
    """ Whether logging to WandB is enabled. Requires `wandb` package is installed. """

    group: str = None
    """ Name for the WandB group. This can be used to group together runs. """

    team: str = None
    """ Name for the WandB team. """

    project: str = "deepspeed"
    """ Name for the WandB project. """
class CSVConfig(DeepSpeedConfigModel):
    """Sets parameters for CSV monitor."""

    # Populated from the "csv_monitor" section of the DeepSpeed config
    # and consumed by csvMonitor (enabled/output_path/job_name).
    enabled: bool = False
    """ Whether logging to local CSV files is enabled. """

    output_path: str = ""
    """
    Path to where the csv files will be written. If not provided, the output
    path is set under the training script’s launching path.
    """

    job_name: str = "DeepSpeedJobName"
    """ Name for the current job. This will become a new directory inside `output_path`. """
class DeepSpeedMonitorConfig(DeepSpeedConfigModel):
    """Sets parameters for various monitoring methods."""

    tensorboard: TensorBoardConfig = {}
    """ TensorBoard monitor, requires `tensorboard` package is installed. """

    wandb: WandbConfig = {}
    """ WandB monitor, requires `wandb` package is installed. """

    csv_monitor: CSVConfig = {}
    """ Local CSV output of monitoring data. """

    @root_validator
    def check_enabled(cls, values):
        # The aggregate `enabled` flag is on exactly when at least one
        # individual monitor backend is enabled.
        values["enabled"] = any(
            values.get(name).enabled
            for name in ("tensorboard", "wandb", "csv_monitor"))
        return values
'''Copyright The Microsoft DeepSpeed Team'''
from .monitor import Monitor
import os
import deepspeed.comm as dist
class csvMonitor(Monitor):
    """Writes monitoring events to per-metric CSV files (rank 0 only)."""

    def __init__(self, csv_config):
        super().__init__(csv_config)
        # Metric files that already received their header row.
        self.filenames = []
        self.enabled = csv_config.enabled
        self.output_path = csv_config.output_path
        self.job_name = csv_config.job_name
        self.log_dir = self.setup_log_dir()

    def setup_log_dir(self, base=os.path.join(os.path.expanduser("~"), "csv_monitor")):
        """Create (on rank 0) and return the directory to write CSV files to.

        Returns None when disabled or on non-zero ranks; write_events
        guards on the same condition so the None is never dereferenced.
        """
        if self.enabled and dist.get_rank() == 0:
            if self.output_path is not None:
                log_dir = os.path.join(self.output_path, self.job_name)
            # NOTE: This code path currently is never used since the default csv_monitor output_path is an empty string and not None. Saving it in case we want this functionality in the future.
            else:
                if "DLWS_JOB_ID" in os.environ:
                    infra_job_id = os.environ["DLWS_JOB_ID"]
                elif "DLTS_JOB_ID" in os.environ:
                    infra_job_id = os.environ["DLTS_JOB_ID"]
                else:
                    infra_job_id = "unknown-job-id"
                csv_monitor_dir_name = os.path.join(infra_job_id, "logs")
                log_dir = os.path.join(base, csv_monitor_dir_name, self.job_name)
            os.makedirs(log_dir, exist_ok=True)
            return log_dir

    def write_events(self, event_list):
        """Append each event to its own CSV file.

        Each event is a tensorboard-style tuple (log_name: str, value, step: int).
        A header row ['step', <metric>] is written the first time a file is used.
        """
        if self.enabled and dist.get_rank() == 0:
            import csv
            for event in event_list:
                log_name = event[0]
                value = event[1]
                step = event[2]
                # The engine formats log strings separated by '/'; the CSV
                # column header is the last path component (or the whole
                # name when there is no '/').
                header = log_name.split('/')[-1]
                # sanitize common naming conventions into filename
                filename = log_name.replace('/', '_').replace(' ', '_')
                fname = os.path.join(self.log_dir, filename + '.csv')
                # Open file and record event. Insert header if this is the
                # first time writing. newline='' is required by the csv
                # module to avoid blank rows on \r\n platforms.
                with open(fname, 'a+', newline='') as csv_monitor_file:
                    csv_monitor_writer = csv.writer(csv_monitor_file)
                    if filename not in self.filenames:
                        self.filenames.append(filename)
                        csv_monitor_writer.writerow(['step', header])
                    csv_monitor_writer.writerow([step, value])
'''Copyright The Microsoft DeepSpeed Team'''
"""
Support different forms of monitoring such as wandb and tensorboard
"""
from abc import ABC, abstractmethod
import deepspeed.comm as dist
class Monitor(ABC):
    """Abstract base class for monitoring backends.

    Subclasses store their configuration in __init__ and implement
    write_events to record a list of (name, value, step) events.
    """

    @abstractmethod
    def __init__(self, monitor_config):
        self.monitor_config = monitor_config

    @abstractmethod
    def write_events(self, event_list):
        """Record a list of (name, value, step) event tuples."""
        pass
from .wandb import WandbMonitor
from .tensorboard import TensorBoardMonitor
from .csv_monitor import csvMonitor
class MonitorMaster(Monitor):
    """Fans monitoring events out to every enabled backend (rank 0 only)."""

    def __init__(self, monitor_config):
        super().__init__(monitor_config)
        self.tb_monitor = None
        self.wandb_monitor = None
        self.csv_monitor = None
        self.enabled = monitor_config.enabled

        # Backends are only instantiated on the global rank-0 process.
        if dist.get_rank() == 0:
            if monitor_config.tensorboard.enabled:
                self.tb_monitor = TensorBoardMonitor(monitor_config.tensorboard)
            if monitor_config.wandb.enabled:
                self.wandb_monitor = WandbMonitor(monitor_config.wandb)
            if monitor_config.csv_monitor.enabled:
                self.csv_monitor = csvMonitor(monitor_config.csv_monitor)

    def write_events(self, event_list):
        """Forward `event_list` to each active backend; no-op off rank 0."""
        if dist.get_rank() != 0:
            return
        for backend in (self.tb_monitor, self.wandb_monitor, self.csv_monitor):
            if backend is not None:
                backend.write_events(event_list)
'''Copyright The Microsoft DeepSpeed Team'''
from .utils import check_tb_availability
from .monitor import Monitor
import os
import deepspeed.comm as dist
class TensorBoardMonitor(Monitor):
    """Logs monitoring events to TensorBoard summary files (rank 0 only)."""

    def __init__(self, tensorboard_config):
        super().__init__(tensorboard_config)
        check_tb_availability()
        self.summary_writer = None
        self.enabled = tensorboard_config.enabled
        self.output_path = tensorboard_config.output_path
        self.job_name = tensorboard_config.job_name

        if self.enabled and dist.get_rank() == 0:
            self.get_summary_writer()

    def get_summary_writer(self,
                           base=os.path.join(os.path.expanduser("~"),
                                             "tensorboard")):
        """Create (on rank 0) and cache the SummaryWriter; return it (or None)."""
        if self.enabled and dist.get_rank() == 0:
            from torch.utils.tensorboard import SummaryWriter
            if self.output_path is not None:
                log_dir = os.path.join(self.output_path, self.job_name)
            # NOTE: This code path currently is never used since the default output_path is an empty string and not None. Saving it in case we want this functionality in the future.
            else:
                if "DLWS_JOB_ID" in os.environ:
                    infra_job_id = os.environ["DLWS_JOB_ID"]
                elif "DLTS_JOB_ID" in os.environ:
                    infra_job_id = os.environ["DLTS_JOB_ID"]
                else:
                    infra_job_id = "unknown-job-id"
                summary_writer_dir_name = os.path.join(infra_job_id, "logs")
                # Fix: join job_name (matching csvMonitor.setup_log_dir).
                # The previous code joined self.output_path, which is None
                # on this branch and would make os.path.join raise TypeError.
                log_dir = os.path.join(base, summary_writer_dir_name, self.job_name)
            os.makedirs(log_dir, exist_ok=True)
            self.summary_writer = SummaryWriter(log_dir=log_dir)
        return self.summary_writer

    def write_events(self, event_list, flush=True):
        """Add each (tag, value, step) event as a scalar; optionally flush."""
        if self.enabled and self.summary_writer is not None and dist.get_rank() == 0:
            for event in event_list:
                self.summary_writer.add_scalar(*event)
            if flush:
                self.summary_writer.flush()

    def flush(self):
        """Flush pending summaries to disk (rank 0 only)."""
        if self.enabled and self.summary_writer is not None and dist.get_rank() == 0:
            self.summary_writer.flush()
'''Copyright The Microsoft DeepSpeed Team'''
def check_tb_availability():
    """Verify the `tensorboard` package is importable.

    torch.utils.tensorboard fails without it — see
    https://pytorch.org/docs/1.8.0/tensorboard.html. Prints an install
    hint and re-raises the ImportError when the package is missing.
    """
    import importlib
    try:
        importlib.import_module("tensorboard")
    except ImportError:
        print('If you want to use tensorboard logging, please `pip install tensorboard`')
        raise
def check_wandb_availability():
    """Verify the `wandb` package is importable.

    Prints an install hint and re-raises the ImportError when the
    package is missing.
    """
    import importlib
    try:
        importlib.import_module("wandb")
    except ImportError:
        print(
            'If you want to use wandb logging, please `pip install wandb` and follow the instructions at https://docs.wandb.ai/quickstart'
        )
        raise
'''Copyright The Microsoft DeepSpeed Team'''
from .utils import check_wandb_availability
from .monitor import Monitor
import deepspeed.comm as dist
class WandbMonitor(Monitor):
    """Logs monitoring events to Weights & Biases (rank 0 only)."""

    def __init__(self, wandb_config):
        super().__init__(wandb_config)
        check_wandb_availability()
        import wandb

        self.enabled = wandb_config.enabled
        self.group = wandb_config.group
        self.team = wandb_config.team
        self.project = wandb_config.project

        if self.enabled and dist.get_rank() == 0:
            # `entity` is wandb's name for a team/organization.
            wandb.init(project=self.project, group=self.group, entity=self.team)

    def log(self, data, step=None, commit=None, sync=None):
        """Forward a dict of metrics to wandb.log; no-op off rank 0 or when disabled."""
        if not (self.enabled and dist.get_rank() == 0):
            return None
        import wandb
        return wandb.log(data, step=step, commit=commit, sync=sync)

    def write_events(self, event_list):
        """Log each (label, value, step) event tuple individually."""
        if self.enabled and dist.get_rank() == 0:
            for event in event_list:
                self.log({event[0]: event[1]}, step=event[2])
'''Copyright The Microsoft DeepSpeed Team'''
'''Copyright The Microsoft DeepSpeed Team'''
"""
Copyright (c) Microsoft Corporation
Licensed under the MIT license.
"""
from deepspeed.runtime.config_utils import get_scalar_param, DeepSpeedConfigObject
from deepspeed.nebula.constants import *
class DeepSpeedNebulaConfig(DeepSpeedConfigObject):
    """Parses the "nebula" section of the DeepSpeed config dict.

    Nebula persists checkpoints asynchronously; the recognized keys and
    their defaults live in deepspeed/nebula/constants.py.
    """

    def __init__(self, param_dict):
        super(DeepSpeedNebulaConfig, self).__init__()

        # Declare every attribute up front so the object's shape does not
        # depend on _initialize(). Fix: load_path was previously created
        # only inside _initialize(), unlike all of its siblings.
        self.enabled = None
        self.load_path = None
        self.persistent_storage_path = None
        self.persistent_time_interval = None
        self.num_of_version_in_retention = None
        self.enable_nebula_load = None

        # `in` on a dict already tests keys; no need for `.keys()`.
        nebula_dict = param_dict[NEBULA] if NEBULA in param_dict else {}

        self._initialize(nebula_dict)

    def _initialize(self, nebula_dict):
        """Populate attributes from `nebula_dict`, falling back to defaults."""
        self.enabled = get_scalar_param(nebula_dict,
                                        NEBULA_ENABLED,
                                        NEBULA_ENABLED_DEFAULT)
        self.load_path = get_scalar_param(nebula_dict,
                                          NEBULA_LOAD_PATH,
                                          NEBULA_LOAD_PATH_DEFAULT)
        self.enable_nebula_load = get_scalar_param(nebula_dict,
                                                   NEBULA_ENABLE_NEBULA_LOAD,
                                                   NEBULA_ENABLE_NEBULA_LOAD_DEFAULT)
        self.persistent_storage_path = get_scalar_param(
            nebula_dict,
            NEBULA_PERSISTENT_STORAGE_PATH,
            NEBULA_PERSISTENT_STORAGE_PATH_DEFAULT)
        self.persistent_time_interval = get_scalar_param(
            nebula_dict,
            NEBULA_PERSISTENT_TIME_INTERVAL,
            NEBULA_PERSISTENT_TIME_INTERVAL_DEFAULT)
        self.num_of_version_in_retention = get_scalar_param(
            nebula_dict,
            NEBULA_NUM_OF_VERSION_IN_RETENTION,
            NEBULA_NUM_OF_VERSION_IN_RETENTION_DEFAULT)
'''Copyright The Microsoft DeepSpeed Team'''
"""
Copyright (c) Microsoft Corporation
Licensed under the MIT license.
"""
#########################################
# nebula
#########################################
# Nebula. By default, this feature is not enabled.
# Users can configure in ds_config.json as below example:
NEBULA_FORMAT = '''
nebula should be enabled as:
"session_params": {
"nebula": {
"enabled": true,
"persistent_storage_path": "/foo/bar",
"persistent_time_interval": 100,
"num_of_version_in_retention": 2,
"enable_nebula_load": true
}
}
'''
NEBULA = "nebula"
NEBULA_ENABLED = "enabled"
NEBULA_ENABLED_DEFAULT = False
# There is a case where customer want to load the checkpoint saved
# by raw torch. Because nebula cannot load torch checkpoint directly
# as they have different folder structures to bring the gap for
# loading(the data are totaly same in bytes for torch and enbula s
# aving).
# In this case, we must disable nebula load to use raw torch load.
# Customer can just set NEBULA_ENABLE_NEBULA_LOAD to False. Then use
# original way of deepspeed to load, i.e. set the value of "--load".
NEBULA_ENABLE_NEBULA_LOAD = "enable_nebula_load"
NEBULA_ENABLE_NEBULA_LOAD_DEFAULT = True
# When you want to resume the previous checkpoint saved by nebula,
# you can set NEBULA_LOAD_PATH as the parent folder of checkpoint.
# If NEBULA_LOAD_PATH is None, the NEBULA_PERSISTENT_STORAGE_PATH
# will be the default path to load.
NEBULA_LOAD_PATH = "nebula_load_path"
NEBULA_LOAD_PATH_DEFAULT = None
# Nebula will save the checkpoint under NEBULA_LOAD_PATH in the
# asynchronous way.
NEBULA_PERSISTENT_STORAGE_PATH = "persistent_storage_path"
NEBULA_PERSISTENT_STORAGE_PATH_DEFAULT = None
# Time interval to trigger the nebula persistence.
NEBULA_PERSISTENT_TIME_INTERVAL = "persistent_time_interval"
NEBULA_PERSISTENT_TIME_INTERVAL_DEFAULT = 100
# Checkpoint number which will be kept in memory. Let us say,
# if the value is 2. Then we have checkpoints 1 and 2 are ready
# now. When it comes to checkpoint 3, the 1 will be removed if
# 1 has been persisted to disk.
NEBULA_NUM_OF_VERSION_IN_RETENTION = "num_of_version_in_retention"
NEBULA_NUM_OF_VERSION_IN_RETENTION_DEFAULT = 2
# Neubla envs
NEBULA_EXPORT_ENVS = [
'DLTS_JOB_ID',
'DLTS_NUM_WORKER',
'NEBULA_PERSISTENT_STORAGE_PATH',
'NEBULA_PERSISTENT_TIME_INTERVAL',
'AML_RUN_ID',
'AZUREML_RUN_TOKEN',
'AZUREML_WORKSPACE_SCOPE',
'AZUREML_EXPERIMENT_SCOPE',
'AZUREML_RUN_HISTORY_SERVICE_ENDPOINT',
'AZUREML_RUN_ID',
'NEBULA_MEMORY_BUFFER_SIZE',
'AZUREML_PARAMETER_ITPJOB_NAME',
'FC_TASKROLE_NAME',
'FC_TASK_INDEX',
'MASTER_HOST',
'LOCAL_HOST',
'AZUREML_BLOB_ACCOUNT_NAME',
'AZUREML_BLOB_ACCOUNT_KEY'
]
# ITP env files
DLTS_POD_ENV_PATH = '/dlts-runtime/env/pod.env'
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment