Commit 67ea635f authored by aiss

push dsv0.8.2 version

parent 1b2721ad
'''Copyright The Microsoft DeepSpeed Team'''
# Create a container object to save model-specific tensors using the policy file above.
from .base import *
from deepspeed import comm as dist
import deepspeed.ops.transformer as transformer_inference
from deepspeed.accelerator import get_accelerator
class BaseTransformerMoEContainer(BaseTransformerContainer):
def __init__(self, **kwargs):
# Call the init function of the parent class to initialize the tensors and configs from the parent class
super().__init__(**kwargs)
self.num_experts = self.policy.get_num_experts()
self.ep_world_size = dist.get_world_size()
self.local_ep_size = 1 if self.num_experts < self.ep_world_size else self.num_experts // self.ep_world_size
self.layer_norm_eps = self.config.layer_norm_eps if hasattr(
self.config,
'layer_norm_eps') else 1e-12
# MoE models will have a list of mlp related tensors
self._h4h_w = []
self._h4h_b = []
self._4hh_w = []
self._4hh_b = []
# Residual MoE needs extra parameters
self._res_h4h_w = None
self._res_h4h_b = None
self._res_4hh_w = None
self._res_4hh_b = None
self._res_coef = None
def create_ds_model_config(self):
self.set_hidden_heads(*self.policy.get_hidden_heads())
assert self.num_attention_heads % self.mp_size == 0,\
"To run the model parallel across the GPUs, the attention_heads require to be divisible by the world_size!" +\
"This is because the attention computation is partitioned evenly among the parallel GPUs."
self.ds_model_config = transformer_inference.DeepSpeedMoEInferenceConfig(
hidden_size=self.hidden_size,
heads=self.num_attention_heads,
layer_norm_eps=self.layer_norm_eps,
fp16=self.fp16,
pre_layer_norm=self.pre_layer_norm,
mp_size=self.mp_size,
q_int8=self.quantize,
moe_experts=self.local_ep_size,
global_experts=self.num_experts,
mlp_type=self.config.moe.type,
scale_attn_by_inverse_layer_idx=self.scale_attn_by_inverse_layer_idx,
)
return self.ds_model_config
def initialize_tensors(self):
# Set the tensors from policy (user module) to container (DS module)
self.set_attention(*self.policy.attention())
self.set_mlp(self.config.moe.type)
self.set_layernorm(*self.policy.layernorm())
def set_mlp(self, config_moe_type):
if config_moe_type == 'standard':
self._h4h_w, self._h4h_b, \
self._4hh_w, self._4hh_b = self.policy.mlp()
else:
self._h4h_w, self._h4h_b, self._4hh_w, \
self._4hh_b, self._res_h4h_w, self._res_h4h_b, \
self._res_4hh_w, self._res_4hh_b, \
self._res_coef = self.policy.mlp(config_moe_type)
def transpose(self):
self.transpose_attention()
self.transpose_mlp()
if self.config.moe.type == 'residual':
self.transpose_residual()
def transpose_mlp(self):
self._h4h_w = [self.transpose_impl(moe_w1.data) for moe_w1 in self._h4h_w]
self._4hh_w = [self.transpose_impl(moe_w1.data) for moe_w1 in self._4hh_w]
def transpose_residual(self):
self._res_h4h_w.data = self.transpose_impl(self._res_h4h_w.data)
self._res_4hh_w.data = self.transpose_impl(self._res_4hh_w.data)
self._res_coef.data = self.transpose_impl(self._res_coef.data)
def apply_tensor_parallelism(self, mp_replace):
# setup the new Attention module
self.attention_qkv_mp(mp_replace)
self.attention_o_mp(mp_replace)
# quantize attention weights
self.attention_quantization()
# setup the new MLP module
self.mlp_mp()
def mlp_mp(self):
gpu_index = dist.get_rank()
for ep_index in range(self.local_ep_size):
# mlp inter
self.module.mlp[ep_index].inter_w.data = self._h4h_w[
gpu_index * self.local_ep_size + ep_index].to(
get_accelerator().current_device_name())
self.module.mlp[ep_index].inter_b.data = self._h4h_b[
gpu_index * self.local_ep_size + ep_index].to(
get_accelerator().current_device_name())
# mlp output
self.module.mlp[ep_index].output_w.data = self._4hh_w[
gpu_index * self.local_ep_size + ep_index].to(
get_accelerator().current_device_name())
self.module.mlp[ep_index].output_b.data = self._4hh_b[
gpu_index * self.local_ep_size + ep_index].to(
get_accelerator().current_device_name())
def copy_data_to_new_module(self):
self.module.attn_nw.data = self.attn_nw.to(
get_accelerator().current_device_name())
self.module.attn_nb.data = self.attn_nb.to(
get_accelerator().current_device_name())
self.module.norm_w.data.copy_(
self.input_nw.to(get_accelerator().current_device_name()))
self.module.norm_b.data.copy_(
self.input_nb.to(get_accelerator().current_device_name()))
if self.config.moe.type == 'residual':
self.module.res_mlp.inter_w.data = self._res_h4h_w.to(
get_accelerator().current_device_name())
self.module.res_mlp.inter_b.data = self._res_h4h_b.to(
get_accelerator().current_device_name())
self.module.res_mlp.output_w.data = self._res_4hh_w.to(
get_accelerator().current_device_name())
self.module.res_mlp.output_b.data = self._res_4hh_b.to(
get_accelerator().current_device_name())
self.module.res_coef.data = self._res_coef.to(
get_accelerator().current_device_name())
'''Copyright The Microsoft DeepSpeed Team'''
from .base import *
from deepspeed.model_implementations.transformers.ds_bert import DeepSpeedBERTInference
import torch
from torch.nn.parameter import Parameter
from ..policy import TransformerPolicy
class DS_BERTContainer(BaseTransformerContainer):
def __init__(self, **kwargs):
super().__init__(**kwargs)
# All model specific things should be defined here instead of the base class.
self.return_tuple = True
self.triangular_masking = False
def create_module(self, config=None):
_config = config if config is not None else self.ds_model_config
self.module = DeepSpeedBERTInference(_config, mp_group=self.mp_group)
self.module.config.scale_attention = self.scale_attention
return self.module
class HFBertLayerPolicy(TransformerPolicy):
def __init__(self, client_module, inference=False):
super().__init__(inference, pre_attn_norm=False)
self.client_module = client_module
self.cuda_graph_supported = True
if HFBertLayerPolicy._orig_layer_class is None:
try:
import transformers
HFBertLayerPolicy._orig_layer_class = [
transformers.models.bert.modeling_bert.BertLayer,
transformers.models.roberta.modeling_roberta.RobertaLayer
]
except:
HFBertLayerPolicy._orig_layer_class = None
def get_hidden_heads(self):
return self.client_module.attention.self.query.weight.shape[1], \
self.client_module.attention.self.num_attention_heads
def attention(self):
qw = self.client_module.attention.self.query.weight
qb = self.client_module.attention.self.query.bias
kw = self.client_module.attention.self.key.weight
kb = self.client_module.attention.self.key.bias
vw = self.client_module.attention.self.value.weight
vb = self.client_module.attention.self.value.bias
qkvw = Parameter(torch.cat((qw, kw, vw), dim=0), requires_grad=False)
qkvb = Parameter(torch.cat((qb, kb, vb), dim=0), requires_grad=False)
return qkvw, \
qkvb, \
self.client_module.attention.output.dense.weight, \
self.client_module.attention.output.dense.bias
def mlp(self):
if self.pre_attn_norm:
intermediate_ff = self.client_module.intermediate.dense_act
else:
intermediate_ff = self.client_module.intermediate.dense
return intermediate_ff.weight, intermediate_ff.bias, \
self.client_module.output.dense.weight, \
self.client_module.output.dense.bias
def layernorm(self):
if self.pre_attn_norm:
attention_layernorm = self.client_module.PostAttentionLayerNorm
transformer_layernorm = self.client_module.PreAttentionLayerNorm
else:
attention_layernorm = self.client_module.attention.output.LayerNorm
transformer_layernorm = self.client_module.output.LayerNorm
return attention_layernorm.weight, \
attention_layernorm.bias, \
transformer_layernorm.weight, \
transformer_layernorm.bias
'''Copyright The Microsoft DeepSpeed Team'''
from .base import *
from .features.meta_tensor import MetaTensorContainer
from deepspeed.model_implementations.transformers.ds_bloom import DeepSpeedBloomInference
from ..policy import TransformerPolicy
from ..policy import transformer_param_names
from ..policy import maybe_copy
supported_models = {None}
class DS_BloomContainer(MetaTensorContainer, BaseTransformerContainer):
def __init__(self, **kwargs):
super().__init__(**kwargs)
# All model specific things should be defined here instead of the base class.
self.bigscience_bloom = True
def create_module(self, config=None):
_config = config if config is not None else self.ds_model_config
self.module = DeepSpeedBloomInference(_config, mp_group=self.mp_group)
self.module.config.scale_attention = self.scale_attention
return self.module
def attention_qkv_mp(self, mp_replace):
self.module.attention.attn_qkvw = mp_replace.copy(
self.module.attention.attn_qkvw,
self.qkvw)
self.module.attention.attn_qkvb = mp_replace.copy(
self.module.attention.attn_qkvb,
self.qkvb)
def load_params(self, module, sd, weight_quantizer, mp_replace, prefix):
param_names = (
'self_attention.query_key_value.weight', \
'self_attention.query_key_value.bias', \
'self_attention.dense.weight', \
'self_attention.dense.bias', \
'mlp.dense_h_to_4h.weight', \
'mlp.dense_h_to_4h.bias', \
'mlp.dense_4h_to_h.weight', \
'mlp.dense_4h_to_h.bias', \
'post_attention_layernorm.weight', \
'post_attention_layernorm.bias', \
'input_layernorm.weight', \
'input_layernorm.bias'
)
for i in range(0, 2):
maybe_copy(module.attention,
sd,
weight_quantizer,
mp_replace,
transformer_param_names[i],
prefix + param_names[i],
qkv=True,
megatron_v2=self.policy.is_megatron_v2,
split_qkv=self.policy.split_qkv)
for i in range(2, 4):
maybe_copy(module.attention,
sd,
weight_quantizer,
mp_replace,
transformer_param_names[i],
prefix + param_names[i])
for i in range(4, 10):
maybe_copy(module.mlp,
sd,
weight_quantizer,
mp_replace,
transformer_param_names[i],
prefix + param_names[i])
for i in range(10, 12):
maybe_copy(module,
sd,
weight_quantizer,
mp_replace,
transformer_param_names[i],
prefix + param_names[i])
class BLOOMLayerPolicy(TransformerPolicy):
_orig_layer_class = None
def __init__(self,
client_module,
inference=True,
use_load_prefix=True,
split_qkv=False):
super().__init__(inference,
linear_layer=True,
use_load_prefix=use_load_prefix,
split_qkv=split_qkv)
self.client_module = client_module
try:
import transformers
BLOOMLayerPolicy._orig_layer_class = transformers.models.bloom.modeling_bloom.BloomBlock
global supported_models
supported_models.update(
{transformers.models.bloom.modeling_bloom.BloomModel})
except Exception as e:
print(
f"WARNING! Setting BLOOMLayerPolicy._orig_layer_class to None due to Exception: {e}"
)
BLOOMLayerPolicy._orig_layer_class = None
def get_hidden_heads(self):
return self.client_module.self_attention.hidden_size, \
self.client_module.self_attention.num_heads
def attention(self):
return self.client_module.self_attention.query_key_value.weight, \
self.client_module.self_attention.query_key_value.bias, \
self.client_module.self_attention.dense.weight, \
self.client_module.self_attention.dense.bias,
def mlp(self):
return self.client_module.mlp.dense_h_to_4h.weight, \
self.client_module.mlp.dense_h_to_4h.bias, \
self.client_module.mlp.dense_4h_to_h.weight, \
self.client_module.mlp.dense_4h_to_h.bias
def layernorm(self):
return self.client_module.post_attention_layernorm.weight, \
self.client_module.post_attention_layernorm.bias, \
self.client_module.input_layernorm.weight, \
self.client_module.input_layernorm.bias
'''Copyright The Microsoft DeepSpeed Team'''
from .base import *
from deepspeed.model_implementations.transformers.ds_gpt import DeepSpeedGPTInference
import torch
from torch.nn.parameter import Parameter
from ..policy import TransformerPolicy
class DS_CLIPContainer(BaseTransformerContainer):
def __init__(self, **kwargs):
super().__init__(**kwargs)
# All model specific things should be defined here instead of the base class.
def create_module(self, config=None):
_config = config if config is not None else self.ds_model_config
self.module = DeepSpeedGPTInference(_config, mp_group=self.mp_group)
self.module.config.scale_attention = self.scale_attention
return self.module
class HFCLIPLayerPolicy(TransformerPolicy):
def __init__(self, client_module, inference=False):
super().__init__(inference, pre_attn_norm=True, scale_attention=True)
self.client_module = client_module
self.cuda_graph_supported = True
if HFCLIPLayerPolicy._orig_layer_class is None:
try:
import transformers
HFCLIPLayerPolicy._orig_layer_class = transformers.models.clip.modeling_clip.CLIPEncoderLayer
except:
HFCLIPLayerPolicy._orig_layer_class = None
def get_hidden_heads(self):
return self.client_module.self_attn.q_proj.weight.shape[1], \
self.client_module.self_attn.num_heads
def attention(self):
qw = self.client_module.self_attn.q_proj.weight
qb = self.client_module.self_attn.q_proj.bias
kw = self.client_module.self_attn.k_proj.weight
kb = self.client_module.self_attn.k_proj.bias
vw = self.client_module.self_attn.v_proj.weight
vb = self.client_module.self_attn.v_proj.bias
qkvw = Parameter(torch.cat((qw, kw, vw), dim=0), requires_grad=False)
qkvb = Parameter(torch.cat((qb, kb, vb), dim=0), requires_grad=False)
return qkvw, \
qkvb, \
self.client_module.self_attn.out_proj.weight, \
self.client_module.self_attn.out_proj.bias
def mlp(self):
return self.client_module.mlp.fc1.weight, \
self.client_module.mlp.fc1.bias, \
self.client_module.mlp.fc2.weight, \
self.client_module.mlp.fc2.bias
def layernorm(self):
return self.client_module.layer_norm2.weight, \
self.client_module.layer_norm2.bias, \
self.client_module.layer_norm1.weight, \
self.client_module.layer_norm1.bias
'''Copyright The Microsoft DeepSpeed Team'''
from .base import *
from deepspeed.model_implementations.transformers.ds_bert import DeepSpeedBERTInference
import torch
from torch.nn.parameter import Parameter
from ..policy import TransformerPolicy
class DS_DistilBERTContainer(BaseTransformerContainer):
def __init__(self, **kwargs):
super().__init__(**kwargs)
# All model specific things should be defined here instead of the base class.
self.triangular_masking = False
self.return_single_tuple = True
def create_module(self, config=None):
_config = config if config is not None else self.ds_model_config
self.module = DeepSpeedBERTInference(_config, mp_group=self.mp_group)
self.module.config.scale_attention = self.scale_attention
return self.module
class HFDistilBertLayerPolicy(TransformerPolicy):
_orig_layer_class = None
def __init__(self, client_module, inference=False, preln=False):
super().__init__(inference)
self.client_module = client_module
self.preln = preln
self.cuda_graph_supported = True
if HFDistilBertLayerPolicy._orig_layer_class is None:
try:
import transformers
HFDistilBertLayerPolicy._orig_layer_class = [
transformers.models.distilbert.modeling_distilbert.TransformerBlock,
]
except:
HFDistilBertLayerPolicy._orig_layer_class = None
def get_hidden_heads(self):
return self.client_module.attention.q_lin.weight.shape[1], \
self.client_module.attention.n_heads
def attention(self):
qw = self.client_module.attention.q_lin.weight
qb = self.client_module.attention.q_lin.bias
kw = self.client_module.attention.k_lin.weight
kb = self.client_module.attention.k_lin.bias
vw = self.client_module.attention.v_lin.weight
vb = self.client_module.attention.v_lin.bias
qkvw = Parameter(torch.cat((qw, kw, vw), dim=0))
qkvb = Parameter(torch.cat((qb, kb, vb), dim=0))
return qkvw, \
qkvb, \
self.client_module.attention.out_lin.weight, \
self.client_module.attention.out_lin.bias
def mlp(self):
intermediate_ff = self.client_module.ffn.lin1
return intermediate_ff.weight, intermediate_ff.bias, \
self.client_module.ffn.lin2.weight, \
self.client_module.ffn.lin2.bias
def layernorm(self):
attention_layernorm = self.client_module.sa_layer_norm
transformer_layernorm = self.client_module.output_layer_norm
return attention_layernorm.weight, \
attention_layernorm.bias, \
transformer_layernorm.weight, \
transformer_layernorm.bias
'''Copyright The Microsoft DeepSpeed Team'''
from .megatron import MegatronContainer
from .meta_tensor import MetaTensorContainer
'''Copyright The Microsoft DeepSpeed Team'''
import torch
from abc import ABC
class MegatronContainer(ABC):
def __init__(self, **kwargs):
super().__init__(**kwargs)
self.megatron_v2 = self.policy.is_megatron_v2
def transpose_qkv_alignment(self, x):
attention_head_size = x.shape[-1] // self.num_attention_heads
new_x_shape = x.size()[:-1] + (self.num_attention_heads, attention_head_size)
x_1 = x.view(*new_x_shape)
(q, k, v) = torch.split(x_1, (x_1.shape[-1] // 3), dim=(x_1.dim() - 1))
if len(q.shape) > 2:
return torch.cat((q.reshape(q.shape[0],
-1),
k.reshape(q.shape[0],
-1),
v.reshape(q.shape[0],
-1)),
dim=-1).reshape(x.shape)
else:
return torch.cat((q.reshape(-1),
k.reshape(-1),
v.reshape(-1)),
dim=-1).reshape(x.shape)
def transpose(self):
super().transpose()
if self.megatron_v2:
self.qkvw = torch.nn.parameter.Parameter(
self.transpose_qkv_alignment(self.qkvw).contiguous())
self.qkvb = torch.nn.parameter.Parameter(
self.transpose_qkv_alignment(self.qkvb).contiguous())
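A minimal standalone sketch (not part of this commit) of what transpose_qkv_alignment does with the Megatron-v2 layout, using a toy 1D bias-like tensor; the head count and sizes below are illustrative only:

import torch

num_heads, head_dim = 2, 2
# Megatron-v2 stores the fused QKV tensor interleaved per head:
# [q0, q0, k0, k0, v0, v0, q1, q1, k1, k1, v1, v1]
x = torch.arange(3 * num_heads * head_dim, dtype=torch.float32)
attention_head_size = x.shape[-1] // num_heads        # 3 * head_dim per head
x_1 = x.view(num_heads, attention_head_size)
q, k, v = torch.split(x_1, x_1.shape[-1] // 3, dim=-1)
# DeepSpeed-Inference expects all q heads, then all k heads, then all v heads
realigned = torch.cat((q.reshape(-1), k.reshape(-1), v.reshape(-1)), dim=-1)
print(realigned)  # tensor([ 0.,  1.,  6.,  7.,  2.,  3.,  8.,  9.,  4.,  5., 10., 11.])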
'''Copyright The Microsoft DeepSpeed Team'''
from abc import ABC, abstractmethod
class MetaTensorContainer(ABC):
def __init__(self, **kwargs):
super().__init__(**kwargs)
self.is_meta = False
self.ckpt_load_enabled = True
def initialize_tensors(self):
super().initialize_tensors()
self.is_meta = self.qkvw.is_meta
def apply_tensor_parallelism(self, mp_replace):
if self.is_meta:
if self.qkvb is None:
self.module.attention.attn_qkvb = None
if self.dense_b is None:
self.module.attention.attn_ob = None
else:
super().apply_tensor_parallelism(mp_replace)
def copy_data_to_new_module(self):
if self.is_meta:
if self.attn_nw is None:
self.module.mlp.attn_nw = self.attn_nw
self.module.mlp.attn_nb = self.attn_nb
else:
super().copy_data_to_new_module()
def transpose(self):
if not self.is_meta:
super().transpose()
@abstractmethod
def load_params(self, module, sd, weight_quantizer, mp_replace, prefix):
"""
Load all the transformer parameter from the checkpoint file (sd).
In addition to the parameter names, we require two
more parameters to help read the the data correctly
from the checkpoint and split the qkv heads in the
right order:
1. `use_load_prefix` (Default: False): this specifies
whether we need to use the name of first abstraction
layer of the model for searching the parameter's name
in a checkpoint file. For more information of how this
is used please see
https://github.com/microsoft/DeepSpeed/blob/master/deepspeed/module_inject/load_checkpoint.py
2. `split_qkv` (Default: True): we use this flag when splitting
the qkv parameter into heads. If it is False, it means the heads
of q, k, and v are stored together and needs to split in the
DeepSpeed-Inference API.
"""
raise NotImplementedError(
"A load_params() function must be defined in the model container \
when inheriting the MetaTensorContainer feature")
'''Copyright The Microsoft DeepSpeed Team'''
from .base import *
from deepspeed.model_implementations.transformers.ds_gpt import DeepSpeedGPTInference
from ..policy import TransformerPolicy
class DS_GPT2Container(BaseTransformerContainer):
def __init__(self, **kwargs):
super().__init__(**kwargs)
# All model specific things should be defined here instead of the base class.
def create_module(self, config=None):
_config = config if config is not None else self.ds_model_config
self.module = DeepSpeedGPTInference(_config, mp_group=self.mp_group)
self.module.config.scale_attention = self.scale_attention
return self.module
class HFGPT2LayerPolicy(TransformerPolicy):
_orig_layer_class = None
def __init__(self, client_module, inference=True):
# HuggingFace GPT2 uses convolutional (Conv1D) layers instead of linear layers
super().__init__(inference, linear_layer=False)
self.client_module = client_module
try:
import transformers
HFGPT2LayerPolicy._orig_layer_class = transformers.models.gpt2.modeling_gpt2.GPT2Block
except:
HFGPT2LayerPolicy._orig_layer_class = None
def get_hidden_heads(self):
return self.client_module.attn.embed_dim, \
self.client_module.attn.num_heads
def attention(self):
return self.client_module.attn.c_attn.weight, \
self.client_module.attn.c_attn.bias, \
self.client_module.attn.c_proj.weight, \
self.client_module.attn.c_proj.bias
def mlp(self):
return self.client_module.mlp.c_fc.weight, \
self.client_module.mlp.c_fc.bias, \
self.client_module.mlp.c_proj.weight, \
self.client_module.mlp.c_proj.bias
def layernorm(self):
return self.client_module.ln_2.weight, \
self.client_module.ln_2.bias, \
self.client_module.ln_1.weight, \
self.client_module.ln_1.bias
'''Copyright The Microsoft DeepSpeed Team'''
from .base import *
from .features.meta_tensor import MetaTensorContainer
from deepspeed.model_implementations.transformers.ds_gpt import DeepSpeedGPTInference
import torch
from torch.nn.parameter import Parameter
from ..policy import TransformerPolicy
from ..policy import transformer_param_names
from ..policy import maybe_copy
from ..policy import maybe_copy_qkv
class DS_GPTJContainer(MetaTensorContainer, BaseTransformerContainer):
def __init__(self, **kwargs):
super().__init__(**kwargs)
# All model specific things should be defined here instead of the base class.
def create_module(self, config=None):
_config = config if config is not None else self.ds_model_config
self.module = DeepSpeedGPTInference(_config, mp_group=self.mp_group)
self.module.config.scale_attention = self.scale_attention
return self.module
def load_params(self, module, sd, weight_quantizer, mp_replace, prefix):
param_names = (
'attn.q_proj.weight', \
'attn.k_proj.weight', \
'attn.v_proj.weight', \
'attn.out_proj.weight', \
'mlp.fc_in.weight', \
'mlp.fc_in.bias', \
'mlp.fc_out.weight', \
'mlp.fc_out.bias', \
'ln_1.weight', \
'ln_1.bias'
)
maybe_copy_qkv(
module.attention,
sd,
weight_quantizer,
mp_replace,
'attn_qkvw',
[prefix + param_names[0],
prefix + param_names[1],
prefix + param_names[2]],
split_qkv=self.policy.split_qkv)
for i in range(3, 4):
maybe_copy(module.attention,
sd,
weight_quantizer,
mp_replace,
transformer_param_names[i - 1],
prefix + param_names[i])
for i in range(4, 8):
maybe_copy(module.mlp,
sd,
weight_quantizer,
mp_replace,
transformer_param_names[i],
prefix + param_names[i])
for i in range(8, 10):
maybe_copy(module,
sd,
weight_quantizer,
mp_replace,
transformer_param_names[i + 2],
prefix + param_names[i])
class HFGPTJLayerPolicy(TransformerPolicy):
_orig_layer_class = None
def __init__(self, client_module, inference=True):
super().__init__(inference, scale_attention=True)
self.client_module = client_module
try:
import transformers
HFGPTJLayerPolicy._orig_layer_class = transformers.models.gptj.modeling_gptj.GPTJBlock
except:
HFGPTJLayerPolicy._orig_layer_class = None
def get_hidden_heads(self):
return self.client_module.attn.q_proj.weight.shape[1], \
self.client_module.attn.num_attention_heads
def attention(self):
qw = self.client_module.attn.q_proj.weight
kw = self.client_module.attn.k_proj.weight
vw = self.client_module.attn.v_proj.weight
qkvw = Parameter(torch.cat((qw, kw, vw), dim=0), requires_grad=False)
return qkvw, \
None, \
self.client_module.attn.out_proj.weight, \
None,
def mlp(self):
return self.client_module.mlp.fc_in.weight, \
self.client_module.mlp.fc_in.bias, \
self.client_module.mlp.fc_out.weight, \
self.client_module.mlp.fc_out.bias
def layernorm(self):
return None, \
None, \
self.client_module.ln_1.weight, \
self.client_module.ln_1.bias
'''Copyright The Microsoft DeepSpeed Team'''
from .base import *
from .features.meta_tensor import MetaTensorContainer
from deepspeed.model_implementations.transformers.ds_gpt import DeepSpeedGPTInference
import torch
from torch.nn.parameter import Parameter
from ..policy import TransformerPolicy
from ..policy import transformer_param_names
from ..policy import maybe_copy
from ..policy import maybe_copy_qkv
class DS_GPTNEOContainer(MetaTensorContainer, BaseTransformerContainer):
def __init__(self, **kwargs):
super().__init__(**kwargs)
# All model specific things should be defined here instead of the base class.
def create_module(self, config=None):
_config = config if config is not None else self.ds_model_config
self.module = DeepSpeedGPTInference(_config, mp_group=self.mp_group)
self.module.config.scale_attention = self.scale_attention
return self.module
def load_params(self, module, sd, weight_quantizer, mp_replace, prefix):
param_names = (
'attn.attention.q_proj.weight', \
'attn.attention.k_proj.weight', \
'attn.attention.v_proj.weight', \
'attn.attention.out_proj.weight', \
'attn.attention.out_proj.bias', \
'mlp.c_fc.weight', \
'mlp.c_fc.bias', \
'mlp.c_proj.weight', \
'mlp.c_proj.bias', \
'ln_2.weight', \
'ln_2.bias', \
'ln_1.weight', \
'ln_1.bias'
)
maybe_copy_qkv(
module.attention,
sd,
weight_quantizer,
mp_replace,
'attn_qkvw',
[prefix + param_names[0],
prefix + param_names[1],
prefix + param_names[2]],
split_qkv=self.policy.split_qkv)
for i in range(3, 5):
maybe_copy(module.attention,
sd,
weight_quantizer,
mp_replace,
transformer_param_names[i - 1],
prefix + param_names[i])
for i in range(5, 11):
maybe_copy(module.mlp,
sd,
weight_quantizer,
mp_replace,
transformer_param_names[i - 1],
prefix + param_names[i])
for i in range(11, 13):
maybe_copy(module,
sd,
weight_quantizer,
mp_replace,
transformer_param_names[i - 1],
prefix + param_names[i])
class HFGPTNEOLayerPolicy(TransformerPolicy):
def __init__(self, client_module, inference=True):
super().__init__(inference, scale_attention=False)
self.client_module = client_module
try:
import transformers
HFGPTNEOLayerPolicy._orig_layer_class = transformers.models.gpt_neo.modeling_gpt_neo.GPTNeoBlock
except:
HFGPTNEOLayerPolicy._orig_layer_class = None
def get_hidden_heads(self):
return self.client_module.attn.attention.q_proj.weight.shape[1], \
self.client_module.attn.attention.num_heads
def attention(self):
qw = self.client_module.attn.attention.q_proj.weight
kw = self.client_module.attn.attention.k_proj.weight
vw = self.client_module.attn.attention.v_proj.weight
qkvw = Parameter(torch.cat((qw, kw, vw), dim=0), requires_grad=False)
return qkvw, \
None, \
self.client_module.attn.attention.out_proj.weight, \
self.client_module.attn.attention.out_proj.bias
def mlp(self):
return self.client_module.mlp.c_fc.weight, \
self.client_module.mlp.c_fc.bias, \
self.client_module.mlp.c_proj.weight, \
self.client_module.mlp.c_proj.bias
def layernorm(self):
return self.client_module.ln_2.weight, \
self.client_module.ln_2.bias, \
self.client_module.ln_1.weight, \
self.client_module.ln_1.bias
'''Copyright The Microsoft DeepSpeed Team'''
from .base import *
from .features.meta_tensor import MetaTensorContainer
from .features.megatron import MegatronContainer
from deepspeed.model_implementations.transformers.ds_gpt import DeepSpeedGPTInference
import torch
from ..policy import TransformerPolicy
from ..policy import transformer_param_names
from ..policy import maybe_copy
from packaging import version as pkg_version
class DS_GPTNEOXContainer(MetaTensorContainer,
MegatronContainer,
BaseTransformerContainer):
def __init__(self, **kwargs):
super().__init__(**kwargs)
# All model specific things should be defined here instead of the base class.
def create_module(self, config=None):
_config = config if config is not None else self.ds_model_config
self.module = DeepSpeedGPTInference(_config, mp_group=self.mp_group)
self.module.config.scale_attention = self.scale_attention
if self.megatron_v2:
self.module.config.rotate_half = True
self.module.config.rotate_every_two = False
return self.module
def load_params(self, module, sd, weight_quantizer, mp_replace, prefix):
param_names = (
'attention.query_key_value.weight', \
'attention.query_key_value.bias', \
'attention.dense.weight', \
'attention.dense.bias', \
'mlp.dense_h_to_4h.weight', \
'mlp.dense_h_to_4h.bias', \
'mlp.dense_4h_to_h.weight', \
'mlp.dense_4h_to_h.bias', \
'post_attention_layernorm.weight', \
'post_attention_layernorm.bias', \
'input_layernorm.weight', \
'input_layernorm.bias'
)
for i in range(0, 2):
maybe_copy(module.attention,
sd,
weight_quantizer,
mp_replace,
transformer_param_names[i],
prefix + param_names[i],
qkv=True,
megatron_v2=self.policy.is_megatron_v2,
split_qkv=self.policy.split_qkv,
heads=self.policy.client_module.attention.num_attention_heads)
for i in range(2, 4):
maybe_copy(module.attention,
sd,
weight_quantizer,
mp_replace,
transformer_param_names[i],
prefix + param_names[i])
for i in range(4, 10):
maybe_copy(module.mlp,
sd,
weight_quantizer,
mp_replace,
transformer_param_names[i],
prefix + param_names[i])
for i in range(10, 12):
maybe_copy(module,
sd,
weight_quantizer,
mp_replace,
transformer_param_names[i],
prefix + param_names[i])
class GPTNEOXLayerPolicy(TransformerPolicy):
_orig_layer_class = None
version = 0
def __init__(self, client_module, inference=True, megatron_v2=True, split_qkv=False):
super().__init__(inference, megatron_v2=megatron_v2, split_qkv=split_qkv)
self.client_module = client_module
if GPTNEOXLayerPolicy._orig_layer_class is None:
if pkg_version.parse(torch.__version__) <= pkg_version.parse("1.2"):
GPTNEOXLayerPolicy._orig_layer_class = None
else:
try:
from transformers import GPTNeoXLayer
GPTNEOXLayerPolicy._orig_layer_class = GPTNeoXLayer
except ImportError:
GPTNEOXLayerPolicy._orig_layer_class = None
def get_hidden_heads(self):
if GPTNEOXLayerPolicy.version == 0:
attention = self.client_module.attention
else:
attention = self.client_module.self_attention
return attention.query_key_value.weight.shape[1], \
attention.num_attention_heads
def attention(self):
if GPTNEOXLayerPolicy.version == 0:
attention = self.client_module.attention
else:
attention = self.client_module.self_attention
return attention.query_key_value.weight, \
attention.query_key_value.bias, \
attention.dense.weight, \
attention.dense.bias
def mlp(self):
return self.client_module.mlp.dense_h_to_4h.weight, \
self.client_module.mlp.dense_h_to_4h.bias, \
self.client_module.mlp.dense_4h_to_h.weight, \
self.client_module.mlp.dense_4h_to_h.bias
def layernorm(self):
return self.client_module.post_attention_layernorm.weight, \
self.client_module.post_attention_layernorm.bias, \
self.client_module.input_layernorm.weight, \
self.client_module.input_layernorm.bias
'''Copyright The Microsoft DeepSpeed Team'''
from .base import *
from .features.megatron import MegatronContainer
from deepspeed.model_implementations.transformers.ds_megatron_gpt import DeepSpeedMegatronGPTInference
import torch
from ..policy import TransformerPolicy
from packaging import version as pkg_version
class DS_MegatronGPTContainer(MegatronContainer, BaseTransformerContainer):
def __init__(self, **kwargs):
super().__init__(**kwargs)
# All model specific things should be defined here instead of the base class.
def create_module(self, config=None):
_config = config if config is not None else self.ds_model_config
self.module = DeepSpeedMegatronGPTInference(_config, mp_group=self.mp_group)
self.module.config.scale_attention = self.scale_attention
if self.megatron_v2:
self.module.config.rotate_half = True
self.module.config.rotate_every_two = False
return self.module
# TODO: Megatron GPT MoE inherits from Megatron policy and replaces mlp
# TODO: Generalize MoE overall goal, expand beyond Megatron
class MegatronLayerPolicy(TransformerPolicy):
_orig_layer_class = None
version = 0
moe_type = 'standard'
megatron_v2 = True
use_mup = False
def __init__(self, client_module, inference=True):
super().__init__(inference,
megatron_v2=MegatronLayerPolicy.megatron_v2,
use_mup=MegatronLayerPolicy.use_mup)
self.client_module = client_module
# we use megatron version to differentiate between the old and new
# megatron-lm source code
if MegatronLayerPolicy._orig_layer_class is None:
if pkg_version.parse(torch.__version__) <= pkg_version.parse("1.2"):
MegatronLayerPolicy._orig_layer_class = None
else:
try:
from megatron.model.transformer import ParallelTransformerLayer
MegatronLayerPolicy._orig_layer_class = ParallelTransformerLayer
except ImportError:
MegatronLayerPolicy._orig_layer_class = None
def get_hidden_heads(self):
return self.client_module.attention.query_key_value.weight.shape[1], \
self.client_module.attention.num_attention_heads
def attention(self):
if self.inference:
if MegatronLayerPolicy.version == 0:
attention = self.client_module.attention
else:
attention = self.client_module.self_attention
return attention.query_key_value.weight, \
attention.query_key_value.bias, \
attention.dense.weight, \
attention.dense.bias
def mlp(self, moe_type='standard'):
from deepspeed.moe.utils import has_moe_layers
moe, _ = has_moe_layers(self.client_module)
if moe:
moe_experts = self.client_module.mlp.deepspeed_moe.experts.deepspeed_experts if moe_type == 'standard' else \
self.client_module.mlp.moe.deepspeed_moe.experts.deepspeed_experts
num_experts = len(moe_experts)
if moe_type == 'standard':
return [moe_experts[i].dense_h_to_4h.weight for i in range(num_experts)], \
[moe_experts[i].dense_h_to_4h.bias for i in range(num_experts)], \
[moe_experts[i].dense_4h_to_h.weight for i in range(num_experts)], \
[moe_experts[i].dense_4h_to_h.bias for i in range(num_experts)]
else:
return [moe_experts[i].dense_h_to_4h.weight for i in range(num_experts)], \
[moe_experts[i].dense_h_to_4h.bias for i in range(num_experts)], \
[moe_experts[i].dense_4h_to_h.weight for i in range(num_experts)], \
[moe_experts[i].dense_4h_to_h.bias for i in range(num_experts)], \
self.client_module.mlp.mlp.dense_h_to_4h.weight, \
self.client_module.mlp.mlp.dense_h_to_4h.bias, \
self.client_module.mlp.mlp.dense_4h_to_h.weight, \
self.client_module.mlp.mlp.dense_4h_to_h.bias, \
self.client_module.mlp.coefficient.weight
else:
return self.client_module.mlp.dense_h_to_4h.weight, \
self.client_module.mlp.dense_h_to_4h.bias, \
self.client_module.mlp.dense_4h_to_h.weight, \
self.client_module.mlp.dense_4h_to_h.bias
def layernorm(self):
return self.client_module.post_attention_layernorm.weight, \
self.client_module.post_attention_layernorm.bias, \
self.client_module.input_layernorm.weight, \
self.client_module.input_layernorm.bias
'''Copyright The Microsoft DeepSpeed Team'''
from .base import *
from .base_moe import *
from .features.megatron import MegatronContainer
from deepspeed.model_implementations.transformers.ds_megatron_gpt import DeepSpeedMegatronGPTInference
import torch
from .megatron_gpt import MegatronLayerPolicy
from packaging import version as pkg_version
class DS_MegatronGPTMoEContainer(MegatronContainer, BaseTransformerMoEContainer):
def __init__(self, policy, config, model_config, layer_id):
super().__init__(policy, config, model_config, layer_id)
# All model specific things should be defined here instead of the base class.
def create_module(self, config=None):
_config = config if config is not None else self.ds_model_config
self.module = DeepSpeedMegatronGPTInference(_config, mp_group=self.mp_group)
self.module.config.scale_attention = self.scale_attention
if self.megatron_v2:
self.module.config.rotate_half = True
self.module.config.rotate_every_two = False
return self.module
# TODO: Megatron GPT MoE inherits from Megatron policy and replaces mlp
# TODO: Generalize MoE overall goal, expand beyond Megatron
class MegatronMoELayerPolicy(MegatronLayerPolicy):
_orig_layer_class = None
version = 0
moe_type = 'standard'
num_experts = 1
def __init__(self, client_module, inference=True):
super().__init__(inference)
self.client_module = client_module
# we use megatron version to differentiate between the old and new
# megatron-lm source code
if MegatronMoELayerPolicy._orig_layer_class is None:
if pkg_version.parse(torch.__version__) <= pkg_version.parse("1.2"):
MegatronMoELayerPolicy._orig_layer_class = None
else:
try:
from megatron.model.transformer import ParallelTransformerLayer
MegatronMoELayerPolicy._orig_layer_class = ParallelTransformerLayer
except ImportError:
MegatronMoELayerPolicy._orig_layer_class = None
def get_num_experts(self):
return self.num_experts
def mlp(self, moe_type='standard'):
# for now, all of this is tightly coupled to megatron-deepspeed moe implementation
# todo: think and refactor this to be more general
#from deepspeed.moe.utils import has_moe_layers
#moe, _ = has_moe_layers(self.client_module)
moe_experts = self.client_module.mlp.deepspeed_moe.experts.deepspeed_experts if moe_type == 'standard' else \
self.client_module.mlp.moe.deepspeed_moe.experts.deepspeed_experts
num_experts = len(moe_experts)
self.num_experts = num_experts
if moe_type == 'standard':
return [moe_experts[i].dense_h_to_4h.weight for i in range(num_experts)], \
[moe_experts[i].dense_h_to_4h.bias for i in range(num_experts)], \
[moe_experts[i].dense_4h_to_h.weight for i in range(num_experts)], \
[moe_experts[i].dense_4h_to_h.bias for i in range(num_experts)]
else:
return [moe_experts[i].dense_h_to_4h.weight for i in range(num_experts)], \
[moe_experts[i].dense_h_to_4h.bias for i in range(num_experts)], \
[moe_experts[i].dense_4h_to_h.weight for i in range(num_experts)], \
[moe_experts[i].dense_4h_to_h.bias for i in range(num_experts)], \
self.client_module.mlp.mlp.dense_h_to_4h.weight, \
self.client_module.mlp.mlp.dense_h_to_4h.bias, \
self.client_module.mlp.mlp.dense_4h_to_h.weight, \
self.client_module.mlp.mlp.dense_4h_to_h.bias, \
self.client_module.mlp.coefficient.weight
'''Copyright The Microsoft DeepSpeed Team'''
from .base import *
from .features.meta_tensor import MetaTensorContainer
from deepspeed.model_implementations.transformers.ds_opt import DeepSpeedOPTInference
import torch
from torch.nn.parameter import Parameter
from ..policy import TransformerPolicy
from ..policy import transformer_param_names
from ..policy import maybe_copy
from ..policy import maybe_copy_qkv
from deepspeed.utils.types import ActivationFuncType
class DS_OPTContainer(MetaTensorContainer, BaseTransformerContainer):
def __init__(self, **kwargs):
super().__init__(**kwargs)
# All model specific things should be defined here instead of the base class.
def create_module(self, config=None):
_config = config if config is not None else self.ds_model_config
self.module = DeepSpeedOPTInference(_config, mp_group=self.mp_group)
self.module.config.scale_attention = self.scale_attention
return self.module
def load_params(self, module, sd, weight_quantizer, mp_replace, prefix):
param_names = (
'self_attn.q_proj.weight', \
'self_attn.k_proj.weight', \
'self_attn.v_proj.weight', \
'self_attn.q_proj.bias', \
'self_attn.k_proj.bias', \
'self_attn.v_proj.bias', \
'self_attn.out_proj.weight', \
'self_attn.out_proj.bias', \
'fc1.weight', \
'fc1.bias', \
'fc2.weight', \
'fc2.bias', \
'final_layer_norm.weight', \
'final_layer_norm.bias', \
'self_attn_layer_norm.weight', \
'self_attn_layer_norm.bias'
)
for i in range(0, 6, 3):
maybe_copy_qkv(module.attention,
sd,
weight_quantizer,
mp_replace,
transformer_param_names[i // 3],
[
prefix + param_names[i],
prefix + param_names[i + 1],
prefix + param_names[i + 2]
],
split_qkv=self.policy.split_qkv)
for i in range(6, 8):
maybe_copy(module.attention,
sd,
weight_quantizer,
mp_replace,
transformer_param_names[i - 4],
prefix + param_names[i])
for i in range(8, 14):
maybe_copy(module.mlp,
sd,
weight_quantizer,
mp_replace,
transformer_param_names[i - 4],
prefix + param_names[i])
for i in range(14, 16):
maybe_copy(module,
sd,
weight_quantizer,
mp_replace,
transformer_param_names[i - 4],
prefix + param_names[i])
class HFOPTLayerPolicy(TransformerPolicy):
_orig_layer_class = None
def __init__(self, client_module, inference=True, use_load_prefix=True):
super().__init__(inference,
linear_layer=True,
mlp_act_func_type=ActivationFuncType.ReLU,
pre_attn_norm=True,
use_load_prefix=use_load_prefix)
self.client_module = client_module
try:
import transformers
HFOPTLayerPolicy._orig_layer_class = transformers.models.opt.modeling_opt.OPTDecoderLayer
if isinstance(TransformerPolicy.hf_model_config,
transformers.models.opt.configuration_opt.OPTConfig):
self.pre_attn_norm = TransformerPolicy.hf_model_config.do_layer_norm_before
except:
HFOPTLayerPolicy._orig_layer_class = None
def get_hidden_heads(self):
return self.client_module.self_attn.embed_dim, \
self.client_module.self_attn.num_heads
def attention(self):
qw = self.client_module.self_attn.q_proj.weight
qb = self.client_module.self_attn.q_proj.bias
kw = self.client_module.self_attn.k_proj.weight
kb = self.client_module.self_attn.k_proj.bias
vw = self.client_module.self_attn.v_proj.weight
vb = self.client_module.self_attn.v_proj.bias
qkvw = Parameter(torch.cat((qw, kw, vw), dim=0), requires_grad=False)
qkvb = Parameter(torch.cat((qb, kb, vb), dim=0), requires_grad=False)
return qkvw, \
qkvb, \
self.client_module.self_attn.out_proj.weight, \
self.client_module.self_attn.out_proj.bias
def mlp(self):
return self.client_module.fc1.weight, \
self.client_module.fc1.bias, \
self.client_module.fc2.weight, \
self.client_module.fc2.bias
def layernorm(self):
return self.client_module.final_layer_norm.weight, \
self.client_module.final_layer_norm.bias, \
self.client_module.self_attn_layer_norm.weight, \
self.client_module.self_attn_layer_norm.bias
'''
Copyright 2022 The Microsoft DeepSpeed Team
'''
import torch
from torch.nn.parameter import Parameter
from ..policy import DSPolicy
from ...model_implementations.diffusers.unet import DSUNet
class UNetPolicy(DSPolicy):
def __init__(self):
super().__init__()
try:
import diffusers
self._orig_layer_class = diffusers.models.unet_2d_condition.UNet2DConditionModel
except ImportError:
self._orig_layer_class = None
def match(self, module):
return isinstance(module, self._orig_layer_class)
def match_replaced(self, module):
return isinstance(module, DSUNet)
def apply(self, module, enable_cuda_graph=True):
# TODO(cmikeh2): Enable cuda graph should be an inference configuration
return DSUNet(module, enable_cuda_graph=enable_cuda_graph)
def attention(self, client_module):
qw = client_module.to_q.weight
kw = client_module.to_k.weight
vw = client_module.to_v.weight
if qw.shape[1] == kw.shape[1]:
qkvw = Parameter(torch.cat((qw, kw, vw), dim=0), requires_grad=False)
return qkvw, \
client_module.to_out[0].weight, \
client_module.to_out[0].bias, \
qw.shape[-1], \
client_module.heads
else:
#return None
#kvw = Parameter(torch.cat((kw, vw), dim=0), requires_grad=False)
return qw, \
kw, vw, \
client_module.to_out[0].weight, \
client_module.to_out[0].bias, \
qw.shape[-1], \
client_module.heads
'''
Copyright 2022 The Microsoft DeepSpeed Team
'''
from ..policy import DSPolicy
from ...model_implementations.diffusers.vae import DSVAE
class VAEPolicy(DSPolicy):
def __init__(self):
super().__init__()
try:
import diffusers
if hasattr(diffusers.models.vae, "AutoencoderKL"):
self._orig_layer_class = diffusers.models.vae.AutoencoderKL
else:
# Diffusers >= 0.12.0 changes location of AutoencoderKL
self._orig_layer_class = diffusers.models.autoencoder_kl.AutoencoderKL
except ImportError:
self._orig_layer_class = None
def match(self, module):
return isinstance(module, self._orig_layer_class)
def match_replaced(self, module):
return isinstance(module, DSVAE)
def apply(self, module, enable_cuda_graph=True):
# TODO(cmikeh2): Enable cuda graph should be an inference configuration
return DSVAE(module, enable_cuda_graph=enable_cuda_graph)
# NOTE (lekurile): Should we have a diffusers policy class?
def attention(self):
pass
'''Copyright The Microsoft DeepSpeed Team'''
import copy
import torch
from deepspeed.ops.transformer import DeepSpeedTransformerLayer, DeepSpeedTransformerConfig
......
'''Copyright The Microsoft DeepSpeed Team'''
import torch
from deepspeed import comm as dist
from torch import nn
from torch.nn import functional as F
from torch.nn.parameter import Parameter
from deepspeed.accelerator import get_accelerator
class LinearAllreduce(nn.Module):
def __init__(self, weight, bias=None, mp_group=None):
super(LinearAllreduce, self).__init__()
self.weight = weight
self.bias = bias
self.mp_group = mp_group
def forward(self, input):
output = torch.matmul(input, self.weight.transpose(-1, -2))
if self.mp_group is not None:
dist.all_reduce(output, group=self.mp_group)
if self.bias is not None:
output += self.bias
return output
class LinearLayer(nn.Module):
def __init__(self, weight_shape=None, dtype=torch.half, weight=None, bias=None):
super(LinearLayer, self).__init__()
if weight is not None:
self.weight = weight
self.bias = bias
else:
self.weight = Parameter(
torch.empty(weight_shape,
dtype=dtype,
device=get_accelerator().current_device_name()))
self.bias = Parameter(
torch.empty(weight_shape[0],
dtype=dtype,
device=get_accelerator().current_device_name())) \
if bias is not None else None
def forward(self, input):
output = torch.matmul(input, self.weight.transpose(-1, -2))
if self.bias is not None:
output += self.bias
return output
class Normalize(nn.Module):
def __init__(self, dim, dtype=torch.float, eps=1e-5):
super(Normalize, self).__init__()
self.norm = nn.LayerNorm(dim,
eps=eps).to(dtype).to(
get_accelerator().current_device_name())
self.weight = self.norm.weight
self.bias = self.norm.bias
def forward(self, input):
return self.norm(input)
class EmbeddingLayer(nn.Module):
def __init__(self, weight_shape, dtype=torch.half):
super(EmbeddingLayer, self).__init__()
self.weight = Parameter(
torch.empty(weight_shape[0],
weight_shape[1],
dtype=dtype,
device=get_accelerator().current_device_name()))
def forward(self, input):
return F.embedding(input, self.weight)
class OPTEmbedding(EmbeddingLayer):
"""
This module learns positional embeddings up to a fixed maximum size.
"""
def __init__(self, weight_shape):
# OPT is set up so that if padding_idx is specified, the embedding ids are offset by 2
# and num_embeddings is adjusted accordingly. Other models don't have this hack
self.offset = 2
super().__init__(weight_shape)
def forward(self, attention_mask: torch.LongTensor, past_key_values_length: int = 0):
"""`input_ids_shape` is expected to be [bsz x seqlen]."""
attention_mask = attention_mask.long()
# create positions depending on attention_mask
positions = (torch.cumsum(attention_mask,
dim=1).type_as(attention_mask) *
attention_mask).long() - 1
# cut positions if `past_key_values_length` is > 0
positions = positions[:, past_key_values_length:]
return super().forward(positions + self.offset)
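A minimal sketch (not part of this commit) of the position computation above, assuming a toy attention mask with two padding positions:

import torch

attention_mask = torch.tensor([[1, 1, 1, 0, 0]]).long()
# cumulative sum over the mask, re-masked, minus one -> positions of real tokens
positions = (torch.cumsum(attention_mask, dim=1).type_as(attention_mask) * attention_mask).long() - 1
print(positions)      # tensor([[ 0,  1,  2, -1, -1]])
# OPT offsets all ids by 2 before the embedding lookup
print(positions + 2)  # tensor([[2, 3, 4, 1, 1]])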
'''Copyright The Microsoft DeepSpeed Team'''
from torch import nn
from deepspeed.model_implementations.transformers.ds_bloom import DeepSpeedBloomInference
from deepspeed.model_implementations.transformers.ds_gpt import DeepSpeedGPTInference
from deepspeed.model_implementations.transformers.ds_bert import DeepSpeedBERTInference
from deepspeed.model_implementations.transformers.ds_megatron_gpt import DeepSpeedMegatronGPTInference
from deepspeed.model_implementations.transformers.ds_opt import DeepSpeedOPTInference
import deepspeed.ops.transformer as transformer_inference
from .layers import LinearLayer, Normalize, EmbeddingLayer, OPTEmbedding
import torch
import gc
from deepspeed.accelerator import get_accelerator
def load_model_with_checkpoint(r_module,
sd,
mp_replace,
ckpt_type,
ckpt_mp_size,
weight_quantizer=None,
rank=0,
container=None):
error_msgs = []
def transpose(data):
with torch.no_grad():
data = data.contiguous()
data1 = data.transpose(-1, -2).reshape(-1)
data.reshape(-1).copy_(data1)
data1 = None
return data.reshape(data.shape[-1], data.shape[-2])
def load(module, prefix):
args = (sd[0], prefix, {}, True, [], [], error_msgs)
if hasattr(module, 'weight'):
module.weight = mp_replace.copy(module.weight.data, sd[0][prefix + 'weight'])
if prefix + 'bias' in sd[0].keys():
if module.bias.data.is_meta:
# a meta tensor cannot be cast or copied to, so we need to replace it with a normal tensor here
module.bias = torch.nn.parameter.Parameter(
data=torch.empty_like(module.bias.data,
device="cpu"),
requires_grad=module.bias.data.requires_grad)
module.bias = mp_replace.copy(module.bias.data, sd[0][prefix + 'bias'])
args = None
gc.collect()
def load_transformer_layer(module, prefix):
if ckpt_type == "tp":
def load_parameters(module, prefix):
for n, p in module.named_parameters():
if prefix + n in sd[0] and len(n.split('.')) == 1:
if type(sd[0][prefix + n]) is list:
tmp_data, scale = sd[0][prefix + n]
tmp_data = tmp_data
scale = scale.to(get_accelerator().current_device_name())
# set the quantizer number of groups using the checkpoint scale shape
weight_quantizer.num_groups = scale.shape[0]
else:
tmp_data = sd[0][prefix + n].to(
get_accelerator().current_device_name())
scale = None
src_shape = tmp_data.shape
dst_shape = p.shape
inner_dim = 1 if tmp_data.dtype == torch.int8 else 0
outer_dim = 0 if tmp_data.dtype == torch.int8 else 1
if (len(src_shape) == 2 and len(dst_shape) == 2):
if (src_shape[inner_dim] == dst_shape[0]
and src_shape[outer_dim] == dst_shape[1]):
if tmp_data.dtype != torch.int8:
p = weight_quantizer.quantize(
transpose(tmp_data) if weight_quantizer.
q_int8 else tmp_data)
else:
p = torch.nn.parameter.Parameter(tmp_data,
requires_grad=False)
p.scale = scale
setattr(module, n, p)
else:
dim = inner_dim if src_shape[inner_dim] != dst_shape[
0] else outer_dim
dim1 = 0 if src_shape[inner_dim] != dst_shape[0] else 1
if src_shape[dim] > dst_shape[dim1]:
weight_partition = torch.split(
tmp_data,
dst_shape[dim1],
dim=dim)[rank].to(
get_accelerator().current_device_name())
assert tmp_data.dtype != torch.int8 or scale.numel() > weight_quantizer.num_groups * (rank+1), \
'''ERROR: We require the quantization scales for a larger TP size when loading an INT8 checkpoint!\
Please use the FP16 checkpoint to generate the INT8 checkpoint with the desired sharding parameters!'''
scale = scale.view(
-1)[weight_quantizer.num_groups *
(rank + 1):].reshape(
weight_quantizer.num_groups,
-1).contiguous()
else:
assert tmp_data.dtype != torch.int8, \
'''Merging of the checkpoints is not supported when using an INT8 checkpoint! \
Please use as many GPUs as the TP size of the checkpoint'''
all_data = [
sd[j][prefix +
n] if type(sd[j][prefix + n]) is list else
sd[j][prefix + n].to(
get_accelerator().current_device_name())
for j in range(len(sd))
]
# Check if the weight tensor is for the QKV parameter
if src_shape[1] == (3 *
src_shape[0]) // ckpt_mp_size:
qkv_size = src_shape[outer_dim] // 3
src_split = [
torch.split(src[0].data,
qkv_size,
dim=outer_dim)
for src in all_data
]
weight_partition = torch.cat([
torch.cat([qkv_s[i] for qkv_s in src_split],
axis=outer_dim)
for i in range(len(src_split[0]))
],
dim=dim)
else:
weight_partition = torch.cat([
ad[0].to(
get_accelerator().current_device_name())
if type(ad) is list else ad
for ad in all_data
],
dim=dim)
if tmp_data.dtype == torch.int8:
scale = torch.cat([
ad[1].to(
get_accelerator().current_device_name())
for ad in all_data
],
dim=dim)
if tmp_data.dtype != torch.int8:
weight_partition = weight_quantizer.quantize(
transpose(weight_partition), \
parallel_dim=(0 if dim == 1 else 1)) if weight_quantizer.q_int8 else \
weight_quantizer.quantize(weight_partition)
else:
weight_partition = torch.nn.parameter.Parameter(
weight_partition,
requires_grad=False)
weight_partition.scale = scale
setattr(module, n, weight_partition)
else:
if src_shape[0] == dst_shape[0]:
p.data.copy_(tmp_data)
else:
if src_shape[0] > dst_shape[0]:
bias_split = torch.split(
tmp_data,
dst_shape[-1])[rank].to(get_accelerator(
).current_device_name()).contiguous()
p.data.copy_(bias_split)
else:
# Check if the weight tensor is for the QKV parameter
if src_shape[0] == (3 * r_module.config.hidden_size
) // ckpt_mp_size:
qkv_size = src_shape[0] // 3
src_split = [
torch.split(sd[j][prefix + n],
qkv_size,
dim=0) for j in range(len(sd))
]
p.data.copy_(
torch.cat(
[
torch.cat([
qkv_s[i] for qkv_s in src_split
],
axis=0)
for i in range(len(src_split[0]))
],
dim=0).to(get_accelerator(
).current_device_name()).contiguous())
else:
p.data.copy_(
torch.cat(
[
sd[j][prefix + n]
for j in range(len(sd))
],
dim=0).to(get_accelerator(
).current_device_name()).contiguous())
load_parameters(module, prefix)
for n, child in module.named_children():
load_parameters(child, prefix + n + '.')
else:
container.load_params(module, sd[0], weight_quantizer, mp_replace, prefix)
try:
import transformers
OPTLearnedPositionalEmbedding = transformers.models.opt.modeling_opt.OPTLearnedPositionalEmbedding
except:
OPTLearnedPositionalEmbedding = None
layer_policies = {
nn.Linear: load,
nn.Embedding: load,
nn.LayerNorm: load,
EmbeddingLayer: load,
LinearLayer: load,
Normalize: load,
transformer_inference.DeepSpeedTransformerInference: load_transformer_layer,
DeepSpeedBloomInference: load_transformer_layer,
DeepSpeedGPTInference: load_transformer_layer,
DeepSpeedBERTInference: load_transformer_layer,
DeepSpeedMegatronGPTInference: load_transformer_layer,
DeepSpeedOPTInference: load_transformer_layer,
OPTLearnedPositionalEmbedding: load,
OPTEmbedding: load
}
all_ds_ids = {}
def load_module_recursive(module, prefix='', level=0):
for name, child in module.named_children():
if child.__class__ in layer_policies:
checking_key = prefix + name + '.'
if not any(checking_key in item for item in sd[0].keys()):
if hasattr(child, 'weight') and \
(hasattr(child.weight, 'ds_id') and \
child.weight.ds_id in all_ds_ids):
prefix1 = all_ds_ids[child.weight.ds_id]
if child.__class__ is nn.Linear:
child = LinearLayer(weight=all_ds_ids[child.weight.ds_id])
setattr(module, name, child)
continue
child_params = list(child.parameters())
if len(child_params) > 0 and (child_params[0].numel() == 0
or child_params[0].is_meta):
if child.weight.is_meta:
ds_shape = child.weight.shape
else:
ds_shape = child.weight.ds_shape
if child.__class__ is nn.LayerNorm:
child = Normalize(dim=ds_shape[-1],
dtype=child.weight.dtype,
eps=child.eps)
setattr(module, name, child)
elif child.__class__ is nn.Linear:
child = LinearLayer(weight_shape=child.weight.shape,
bias=child.bias)
setattr(module, name, child)
elif child.__class__ is OPTLearnedPositionalEmbedding:
child = OPTEmbedding(weight_shape=ds_shape)
setattr(module, name, child)
else:
ds_id = None
if hasattr(child.weight, 'ds_id'):
ds_id = child.weight.ds_id
child = EmbeddingLayer(weight_shape=ds_shape,
dtype=child.weight.dtype)
if ds_id is not None:
all_ds_ids[ds_id] = child.weight
setattr(module, name, child)
layer_policies[child.__class__](child, prefix + name + '.')
else:
load_module_recursive(
child,
prefix if (level == 0 and ckpt_type == 'pp') and container.policy.use_load_prefix else \
prefix + name + '.',
level + 1)
load_module_recursive(r_module)
embedding_weight = None
for n, p in r_module.named_parameters():
if "word_embeddings." in n or "embed_tokens." in n or "wte." in n:
embedding_weight = p
if embedding_weight is not None and r_module.lm_head.weight.is_meta:
r_module.lm_head.weight = embedding_weight
for sd_ in sd:
del sd_
sd = None
gc.collect()