Commit 67ea635f authored by aiss

push dsv0.8.2 version

parent 1b2721ad
'''Copyright The Microsoft DeepSpeed Team'''
# Create a container object to save model-specific tensors using the policy file above.
from .base import *
from deepspeed import comm as dist
import deepspeed.ops.transformer as transformer_inference
from deepspeed.accelerator import get_accelerator
class BaseTransformerMoEContainer(BaseTransformerContainer):
def __init__(self, **kwargs):
# Call the init function of the parent class to initialize the tensors and configs from the parent class
super().__init__(**kwargs)
self.num_experts = self.policy.get_num_experts()
self.ep_world_size = dist.get_world_size()
self.local_ep_size = 1 if self.num_experts < self.ep_world_size else self.num_experts // self.ep_world_size
self.layer_norm_eps = self.config.layer_norm_eps if hasattr(
self.config,
'layer_norm_eps') else 1e-12
# MoE models will have a list of mlp related tensors
self._h4h_w = []
self._h4h_b = []
self._4hh_w = []
self._4hh_b = []
# Residual MoE needs extra parameters
self._res_h4h_w = None
self._res_h4h_b = None
self._res_4hh_w = None
self._res_4hh_b = None
self._res_coef = None
def create_ds_model_config(self):
self.set_hidden_heads(*self.policy.get_hidden_heads())
assert self.num_attention_heads % self.mp_size == 0,\
"To run the model parallel across the GPUs, the attention_heads require to be divisible by the world_size!" +\
"This is because the attention computation is partitioned evenly among the parallel GPUs."
self.ds_model_config = transformer_inference.DeepSpeedMoEInferenceConfig(
hidden_size=self.hidden_size,
heads=self.num_attention_heads,
layer_norm_eps=self.layer_norm_eps,
fp16=self.fp16,
pre_layer_norm=self.pre_layer_norm,
mp_size=self.mp_size,
q_int8=self.quantize,
moe_experts=self.local_ep_size,
global_experts=self.num_experts,
mlp_type=self.config.moe.type,
scale_attn_by_inverse_layer_idx=self.scale_attn_by_inverse_layer_idx,
)
return self.ds_model_config
def initialize_tensors(self):
# Set the tensors from policy (user module) to container (DS module)
self.set_attention(*self.policy.attention())
self.set_mlp(self.config.moe.type)
self.set_layernorm(*self.policy.layernorm())
def set_mlp(self, config_moe_type):
if config_moe_type == 'standard':
self._h4h_w, self._h4h_b, \
self._4hh_w, self._4hh_b = self.policy.mlp()
else:
self._h4h_w, self._h4h_b, self._4hh_w, \
self._4hh_b, self._res_h4h_w, self._res_h4h_b, \
self._res_4hh_w, self._res_4hh_b, \
self._res_coef = self.policy.mlp(config_moe_type)
def transpose(self):
self.transpose_attention()
self.transpose_mlp()
if self.config.moe.type == 'residual':
self.transpose_residual()
def transpose_mlp(self):
self._h4h_w = [self.transpose_impl(moe_w1.data) for moe_w1 in self._h4h_w]
self._4hh_w = [self.transpose_impl(moe_w1.data) for moe_w1 in self._4hh_w]
def transpose_residual(self):
self._res_h4h_w.data = self.transpose_impl(self._res_h4h_w.data)
self._res_4hh_w.data = self.transpose_impl(self._res_4hh_w.data)
self._res_coef.data = self.transpose_impl(self._res_coef.data)
def apply_tensor_parallelism(self, mp_replace):
# setup the new Attention module
self.attention_qkv_mp(mp_replace)
self.attention_o_mp(mp_replace)
# quantize attention weights
self.attention_quantization()
# setup the new MLP module
self.mlp_mp()
def mlp_mp(self):
gpu_index = dist.get_rank()
for ep_index in range(self.local_ep_size):
# mlp inter
self.module.mlp[ep_index].inter_w.data = self._h4h_w[
gpu_index * self.local_ep_size + ep_index].to(
get_accelerator().current_device_name())
self.module.mlp[ep_index].inter_b.data = self._h4h_b[
gpu_index * self.local_ep_size + ep_index].to(
get_accelerator().current_device_name())
# mlp output
self.module.mlp[ep_index].output_w.data = self._4hh_w[
gpu_index * self.local_ep_size + ep_index].to(
get_accelerator().current_device_name())
self.module.mlp[ep_index].output_b.data = self._4hh_b[
gpu_index * self.local_ep_size + ep_index].to(
get_accelerator().current_device_name())
def copy_data_to_new_module(self):
self.module.attn_nw.data = self.attn_nw.to(
get_accelerator().current_device_name())
self.module.attn_nb.data = self.attn_nb.to(
get_accelerator().current_device_name())
self.module.norm_w.data.copy_(
self.input_nw.to(get_accelerator().current_device_name()))
self.module.norm_b.data.copy_(
self.input_nb.to(get_accelerator().current_device_name()))
if self.config.moe.type == 'residual':
self.module.res_mlp.inter_w.data = self._res_h4h_w.to(
get_accelerator().current_device_name())
self.module.res_mlp.inter_b.data = self._res_h4h_b.to(
get_accelerator().current_device_name())
self.module.res_mlp.output_w.data = self._res_4hh_w.to(
get_accelerator().current_device_name())
self.module.res_mlp.output_b.data = self._res_4hh_b.to(
get_accelerator().current_device_name())
self.module.res_coef.data = self._res_coef.to(
get_accelerator().current_device_name())
'''Copyright The Microsoft DeepSpeed Team'''
from .base import *
from deepspeed.model_implementations.transformers.ds_bert import DeepSpeedBERTInference
import torch
from torch.nn.parameter import Parameter
from ..policy import TransformerPolicy
class DS_BERTContainer(BaseTransformerContainer):
def __init__(self, **kwargs):
super().__init__(**kwargs)
# All model specific things should be defined here instead of the base class.
self.return_tuple = True
self.triangular_masking = False
def create_module(self, config=None):
_config = config if config is not None else self.ds_model_config
self.module = DeepSpeedBERTInference(_config, mp_group=self.mp_group)
self.module.config.scale_attention = self.scale_attention
return self.module
class HFBertLayerPolicy(TransformerPolicy):
def __init__(self, client_module, inference=False):
super().__init__(inference, pre_attn_norm=False)
self.client_module = client_module
self.cuda_graph_supported = True
if HFBertLayerPolicy._orig_layer_class is None:
try:
import transformers
HFBertLayerPolicy._orig_layer_class = [
transformers.models.bert.modeling_bert.BertLayer,
transformers.models.roberta.modeling_roberta.RobertaLayer
]
except:
HFBertLayerPolicy._orig_layer_class = None
def get_hidden_heads(self):
return self.client_module.attention.self.query.weight.shape[1], \
self.client_module.attention.self.num_attention_heads
def attention(self):
qw = self.client_module.attention.self.query.weight
qb = self.client_module.attention.self.query.bias
kw = self.client_module.attention.self.key.weight
kb = self.client_module.attention.self.key.bias
vw = self.client_module.attention.self.value.weight
vb = self.client_module.attention.self.value.bias
qkvw = Parameter(torch.cat((qw, kw, vw), dim=0), requires_grad=False)
qkvb = Parameter(torch.cat((qb, kb, vb), dim=0), requires_grad=False)
return qkvw, \
qkvb, \
self.client_module.attention.output.dense.weight, \
self.client_module.attention.output.dense.bias
def mlp(self):
if self.pre_attn_norm:
intermediate_ff = self.client_module.intermediate.dense_act
else:
intermediate_ff = self.client_module.intermediate.dense
return intermediate_ff.weight, intermediate_ff.bias, \
self.client_module.output.dense.weight, \
self.client_module.output.dense.bias
def layernorm(self):
if self.pre_attn_norm:
attention_layernorm = self.client_module.PostAttentionLayerNorm
transformer_layernorm = self.client_module.PreAttentionLayerNorm
else:
attention_layernorm = self.client_module.attention.output.LayerNorm
transformer_layernorm = self.client_module.output.LayerNorm
return attention_layernorm.weight, \
attention_layernorm.bias, \
transformer_layernorm.weight, \
transformer_layernorm.bias
'''Copyright The Microsoft DeepSpeed Team'''
from .base import *
from .features.meta_tensor import MetaTensorContainer
from deepspeed.model_implementations.transformers.ds_bloom import DeepSpeedBloomInference
from ..policy import TransformerPolicy
from ..policy import transformer_param_names
from ..policy import maybe_copy
supported_models = {None}
class DS_BloomContainer(MetaTensorContainer, BaseTransformerContainer):
def __init__(self, **kwargs):
super().__init__(**kwargs)
# All model specific things should be defined here instead of the base class.
self.bigscience_bloom = True
def create_module(self, config=None):
_config = config if config is not None else self.ds_model_config
self.module = DeepSpeedBloomInference(_config, mp_group=self.mp_group)
self.module.config.scale_attention = self.scale_attention
return self.module
def attention_qkv_mp(self, mp_replace):
self.module.attention.attn_qkvw = mp_replace.copy(
self.module.attention.attn_qkvw,
self.qkvw)
self.module.attention.attn_qkvb = mp_replace.copy(
self.module.attention.attn_qkvb,
self.qkvb)
def load_params(self, module, sd, weight_quantizer, mp_replace, prefix):
param_names = (
'self_attention.query_key_value.weight', \
'self_attention.query_key_value.bias', \
'self_attention.dense.weight', \
'self_attention.dense.bias', \
'mlp.dense_h_to_4h.weight', \
'mlp.dense_h_to_4h.bias', \
'mlp.dense_4h_to_h.weight', \
'mlp.dense_4h_to_h.bias', \
'post_attention_layernorm.weight', \
'post_attention_layernorm.bias', \
'input_layernorm.weight', \
'input_layernorm.bias'
)
for i in range(0, 2):
maybe_copy(module.attention,
sd,
weight_quantizer,
mp_replace,
transformer_param_names[i],
prefix + param_names[i],
qkv=True,
megatron_v2=self.policy.is_megatron_v2,
split_qkv=self.policy.split_qkv)
for i in range(2, 4):
maybe_copy(module.attention,
sd,
weight_quantizer,
mp_replace,
transformer_param_names[i],
prefix + param_names[i])
for i in range(4, 10):
maybe_copy(module.mlp,
sd,
weight_quantizer,
mp_replace,
transformer_param_names[i],
prefix + param_names[i])
for i in range(10, 12):
maybe_copy(module,
sd,
weight_quantizer,
mp_replace,
transformer_param_names[i],
prefix + param_names[i])
class BLOOMLayerPolicy(TransformerPolicy):
_orig_layer_class = None
def __init__(self,
client_module,
inference=True,
use_load_prefix=True,
split_qkv=False):
super().__init__(inference,
linear_layer=True,
use_load_prefix=use_load_prefix,
split_qkv=split_qkv)
self.client_module = client_module
try:
import transformers
BLOOMLayerPolicy._orig_layer_class = transformers.models.bloom.modeling_bloom.BloomBlock
global supported_models
supported_models.update(
{transformers.models.bloom.modeling_bloom.BloomModel})
except Exception as e:
print(
f"WARNING! Setting BLOOMLayerPolicy._orig_layer_class to None due to Exception: {e}"
)
BLOOMLayerPolicy._orig_layer_class = None
def get_hidden_heads(self):
return self.client_module.self_attention.hidden_size, \
self.client_module.self_attention.num_heads
def attention(self):
return self.client_module.self_attention.query_key_value.weight, \
self.client_module.self_attention.query_key_value.bias, \
self.client_module.self_attention.dense.weight, \
self.client_module.self_attention.dense.bias,
def mlp(self):
return self.client_module.mlp.dense_h_to_4h.weight, \
self.client_module.mlp.dense_h_to_4h.bias, \
self.client_module.mlp.dense_4h_to_h.weight, \
self.client_module.mlp.dense_4h_to_h.bias
def layernorm(self):
return self.client_module.post_attention_layernorm.weight, \
self.client_module.post_attention_layernorm.bias, \
self.client_module.input_layernorm.weight, \
self.client_module.input_layernorm.bias
'''Copyright The Microsoft DeepSpeed Team'''
from .base import *
from deepspeed.model_implementations.transformers.ds_gpt import DeepSpeedGPTInference
import torch
from torch.nn.parameter import Parameter
from ..policy import TransformerPolicy
class DS_CLIPContainer(BaseTransformerContainer):
def __init__(self, **kwargs):
super().__init__(**kwargs)
# All model specific things should be defined here instead of the base class.
def create_module(self, config=None):
_config = config if config is not None else self.ds_model_config
self.module = DeepSpeedGPTInference(_config, mp_group=self.mp_group)
self.module.config.scale_attention = self.scale_attention
return self.module
class HFCLIPLayerPolicy(TransformerPolicy):
def __init__(self, client_module, inference=False):
super().__init__(inference, pre_attn_norm=True, scale_attention=True)
self.client_module = client_module
self.cuda_graph_supported = True
if HFCLIPLayerPolicy._orig_layer_class is None:
try:
import transformers
HFCLIPLayerPolicy._orig_layer_class = transformers.models.clip.modeling_clip.CLIPEncoderLayer
except:
HFCLIPLayerPolicy._orig_layer_class = None
def get_hidden_heads(self):
return self.client_module.self_attn.q_proj.weight.shape[1], \
self.client_module.self_attn.num_heads
def attention(self):
qw = self.client_module.self_attn.q_proj.weight
qb = self.client_module.self_attn.q_proj.bias
kw = self.client_module.self_attn.k_proj.weight
kb = self.client_module.self_attn.k_proj.bias
vw = self.client_module.self_attn.v_proj.weight
vb = self.client_module.self_attn.v_proj.bias
qkvw = Parameter(torch.cat((qw, kw, vw), dim=0), requires_grad=False)
qkvb = Parameter(torch.cat((qb, kb, vb), dim=0), requires_grad=False)
return qkvw, \
qkvb, \
self.client_module.self_attn.out_proj.weight, \
self.client_module.self_attn.out_proj.bias
def mlp(self):
return self.client_module.mlp.fc1.weight, \
self.client_module.mlp.fc1.bias, \
self.client_module.mlp.fc2.weight, \
self.client_module.mlp.fc2.bias
def layernorm(self):
return self.client_module.layer_norm2.weight, \
self.client_module.layer_norm2.bias, \
self.client_module.layer_norm1.weight, \
self.client_module.layer_norm1.bias
'''Copyright The Microsoft DeepSpeed Team'''
from .base import *
from deepspeed.model_implementations.transformers.ds_bert import DeepSpeedBERTInference
import torch
from torch.nn.parameter import Parameter
from ..policy import TransformerPolicy
class DS_DistilBERTContainer(BaseTransformerContainer):
def __init__(self, **kwargs):
super().__init__(**kwargs)
# All model specific things should be defined here instead of the base class.
self.triangular_masking = False
self.return_single_tuple = True
def create_module(self, config=None):
_config = config if config is not None else self.ds_model_config
self.module = DeepSpeedBERTInference(_config, mp_group=self.mp_group)
self.module.config.scale_attention = self.scale_attention
return self.module
class HFDistilBertLayerPolicy(TransformerPolicy):
_orig_layer_class = None
def __init__(self, client_module, inference=False, preln=False):
super().__init__(inference)
self.client_module = client_module
self.preln = preln
self.cuda_graph_supported = True
if HFDistilBertLayerPolicy._orig_layer_class is None:
try:
import transformers
HFDistilBertLayerPolicy._orig_layer_class = [
transformers.models.distilbert.modeling_distilbert.TransformerBlock,
]
except:
HFDistilBertLayerPolicy._orig_layer_class = None
def get_hidden_heads(self):
return self.client_module.attention.q_lin.weight.shape[1], \
self.client_module.attention.n_heads
def attention(self):
qw = self.client_module.attention.q_lin.weight
qb = self.client_module.attention.q_lin.bias
kw = self.client_module.attention.k_lin.weight
kb = self.client_module.attention.k_lin.bias
vw = self.client_module.attention.v_lin.weight
vb = self.client_module.attention.v_lin.bias
qkvw = Parameter(torch.cat((qw, kw, vw), dim=0))
qkvb = Parameter(torch.cat((qb, kb, vb), dim=0))
return qkvw, \
qkvb, \
self.client_module.attention.out_lin.weight, \
self.client_module.attention.out_lin.bias
def mlp(self):
intermediate_ff = self.client_module.ffn.lin1
return intermediate_ff.weight, intermediate_ff.bias, \
self.client_module.ffn.lin2.weight, \
self.client_module.ffn.lin2.bias
def layernorm(self):
attention_layernorm = self.client_module.sa_layer_norm
transformer_layernorm = self.client_module.output_layer_norm
return attention_layernorm.weight, \
attention_layernorm.bias, \
transformer_layernorm.weight, \
transformer_layernorm.bias
'''Copyright The Microsoft DeepSpeed Team'''
from .megatron import MegatronContainer
from .meta_tensor import MetaTensorContainer
'''Copyright The Microsoft DeepSpeed Team'''
import torch
from abc import ABC
class MegatronContainer(ABC):
def __init__(self, **kwargs):
super().__init__(**kwargs)
self.megatron_v2 = self.policy.is_megatron_v2
def transpose_qkv_alignment(self, x):
attention_head_size = x.shape[-1] // self.num_attention_heads
new_x_shape = x.size()[:-1] + (self.num_attention_heads, attention_head_size)
x_1 = x.view(*new_x_shape)
(q, k, v) = torch.split(x_1, (x_1.shape[-1] // 3), dim=(x_1.dim() - 1))
if len(q.shape) > 2:
return torch.cat((q.reshape(q.shape[0],
-1),
k.reshape(q.shape[0],
-1),
v.reshape(q.shape[0],
-1)),
dim=-1).reshape(x.shape)
else:
return torch.cat((q.reshape(-1),
k.reshape(-1),
v.reshape(-1)),
dim=-1).reshape(x.shape)
def transpose(self):
super().transpose()
if self.megatron_v2:
self.qkvw = torch.nn.parameter.Parameter(
self.transpose_qkv_alignment(self.qkvw).contiguous())
self.qkvb = torch.nn.parameter.Parameter(
self.transpose_qkv_alignment(self.qkvb).contiguous())
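A minimal standalone sketch (not part of this commit) of what transpose_qkv_alignment does with the Megatron-v2 layout, using a toy 1D bias-like tensor; the head count and sizes below are illustrative only:

import torch

num_heads, head_dim = 2, 2
# Megatron-v2 stores the fused QKV tensor interleaved per head:
# [q0, q0, k0, k0, v0, v0, q1, q1, k1, k1, v1, v1]
x = torch.arange(3 * num_heads * head_dim, dtype=torch.float32)
attention_head_size = x.shape[-1] // num_heads        # 3 * head_dim per head
x_1 = x.view(num_heads, attention_head_size)
q, k, v = torch.split(x_1, x_1.shape[-1] // 3, dim=-1)
# DeepSpeed-Inference expects all q heads, then all k heads, then all v heads
realigned = torch.cat((q.reshape(-1), k.reshape(-1), v.reshape(-1)), dim=-1)
print(realigned)  # tensor([ 0.,  1.,  6.,  7.,  2.,  3.,  8.,  9.,  4.,  5., 10., 11.])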
'''Copyright The Microsoft DeepSpeed Team'''
from abc import ABC, abstractmethod
class MetaTensorContainer(ABC):
def __init__(self, **kwargs):
super().__init__(**kwargs)
self.is_meta = False
self.ckpt_load_enabled = True
def initialize_tensors(self):
super().initialize_tensors()
self.is_meta = self.qkvw.is_meta
def apply_tensor_parallelism(self, mp_replace):
if self.is_meta:
if self.qkvb is None:
self.module.attention.attn_qkvb = None
if self.dense_b is None:
self.module.attention.attn_ob = None
else:
super().apply_tensor_parallelism(mp_replace)
def copy_data_to_new_module(self):
if self.is_meta:
if self.attn_nw is None:
self.module.mlp.attn_nw = self.attn_nw
self.module.mlp.attn_nb = self.attn_nb
else:
super().copy_data_to_new_module()
def transpose(self):
if not self.is_meta:
super().transpose()
@abstractmethod
def load_params(self, module, sd, weight_quantizer, mp_replace, prefix):
"""
Load all the transformer parameter from the checkpoint file (sd).
In addition to the parameter names, we require two
more parameters to help read the the data correctly
from the checkpoint and split the qkv heads in the
right order:
1. `use_load_prefix` (Default: False): this specifies
whether we need to use the name of first abstraction
layer of the model for searching the parameter's name
in a checkpoint file. For more information of how this
is used please see
https://github.com/microsoft/DeepSpeed/blob/master/deepspeed/module_inject/load_checkpoint.py
2. `split_qkv` (Default: True): we use this flag when splitting
the qkv parameter into heads. If it is False, it means the heads
of q, k, and v are stored together and needs to split in the
DeepSpeed-Inference API.
"""
raise NotImplementedError(
"A load_params() function must be defined in the model container \
when inheriting the MetaTensorContainer feature")
'''Copyright The Microsoft DeepSpeed Team'''
from .base import *
from deepspeed.model_implementations.transformers.ds_gpt import DeepSpeedGPTInference
from ..policy import TransformerPolicy
class DS_GPT2Container(BaseTransformerContainer):
def __init__(self, **kwargs):
super().__init__(**kwargs)
# All model specific things should be defined here instead of the base class.
def create_module(self, config=None):
_config = config if config is not None else self.ds_model_config
self.module = DeepSpeedGPTInference(_config, mp_group=self.mp_group)
self.module.config.scale_attention = self.scale_attention
return self.module
class HFGPT2LayerPolicy(TransformerPolicy):
_orig_layer_class = None
def __init__(self, client_module, inference=True):
# HuggingFace GPT2 uses convolutional (Conv1D) layers instead of linear layers
super().__init__(inference, linear_layer=False)
self.client_module = client_module
try:
import transformers
HFGPT2LayerPolicy._orig_layer_class = transformers.models.gpt2.modeling_gpt2.GPT2Block
except:
HFGPT2LayerPolicy._orig_layer_class = None
def get_hidden_heads(self):
return self.client_module.attn.embed_dim, \
self.client_module.attn.num_heads
def attention(self):
return self.client_module.attn.c_attn.weight, \
self.client_module.attn.c_attn.bias, \
self.client_module.attn.c_proj.weight, \
self.client_module.attn.c_proj.bias
def mlp(self):
return self.client_module.mlp.c_fc.weight, \
self.client_module.mlp.c_fc.bias, \
self.client_module.mlp.c_proj.weight, \
self.client_module.mlp.c_proj.bias
def layernorm(self):
return self.client_module.ln_2.weight, \
self.client_module.ln_2.bias, \
self.client_module.ln_1.weight, \
self.client_module.ln_1.bias
'''Copyright The Microsoft DeepSpeed Team'''
from .base import *
from .features.meta_tensor import MetaTensorContainer
from deepspeed.model_implementations.transformers.ds_gpt import DeepSpeedGPTInference
import torch
from torch.nn.parameter import Parameter
from ..policy import TransformerPolicy
from ..policy import transformer_param_names
from ..policy import maybe_copy
from ..policy import maybe_copy_qkv
class DS_GPTJContainer(MetaTensorContainer, BaseTransformerContainer):
def __init__(self, **kwargs):
super().__init__(**kwargs)
# All model specific things should be defined here instead of the base class.
def create_module(self, config=None):
_config = config if config is not None else self.ds_model_config
self.module = DeepSpeedGPTInference(_config, mp_group=self.mp_group)
self.module.config.scale_attention = self.scale_attention
return self.module
def load_params(self, module, sd, weight_quantizer, mp_replace, prefix):
param_names = (
'attn.q_proj.weight', \
'attn.k_proj.weight', \
'attn.v_proj.weight', \
'attn.out_proj.weight', \
'mlp.fc_in.weight', \
'mlp.fc_in.bias', \
'mlp.fc_out.weight', \
'mlp.fc_out.bias', \
'ln_1.weight', \
'ln_1.bias'
)
maybe_copy_qkv(
module.attention,
sd,
weight_quantizer,
mp_replace,
'attn_qkvw',
[prefix + param_names[0],
prefix + param_names[1],
prefix + param_names[2]],
split_qkv=self.policy.split_qkv)
for i in range(3, 4):
maybe_copy(module.attention,
sd,
weight_quantizer,
mp_replace,
transformer_param_names[i - 1],
prefix + param_names[i])
for i in range(4, 8):
maybe_copy(module.mlp,
sd,
weight_quantizer,
mp_replace,
transformer_param_names[i],
prefix + param_names[i])
for i in range(8, 10):
maybe_copy(module,
sd,
weight_quantizer,
mp_replace,
transformer_param_names[i + 2],
prefix + param_names[i])
class HFGPTJLayerPolicy(TransformerPolicy):
_orig_layer_class = None
def __init__(self, client_module, inference=True):
super().__init__(inference, scale_attention=True)
self.client_module = client_module
try:
import transformers
HFGPTJLayerPolicy._orig_layer_class = transformers.models.gptj.modeling_gptj.GPTJBlock
except:
HFGPTJLayerPolicy._orig_layer_class = None
def get_hidden_heads(self):
return self.client_module.attn.q_proj.weight.shape[1], \
self.client_module.attn.num_attention_heads
def attention(self):
qw = self.client_module.attn.q_proj.weight
kw = self.client_module.attn.k_proj.weight
vw = self.client_module.attn.v_proj.weight
qkvw = Parameter(torch.cat((qw, kw, vw), dim=0), requires_grad=False)
return qkvw, \
None, \
self.client_module.attn.out_proj.weight, \
None,
def mlp(self):
return self.client_module.mlp.fc_in.weight, \
self.client_module.mlp.fc_in.bias, \
self.client_module.mlp.fc_out.weight, \
self.client_module.mlp.fc_out.bias
def layernorm(self):
return None, \
None, \
self.client_module.ln_1.weight, \
self.client_module.ln_1.bias
'''Copyright The Microsoft DeepSpeed Team'''
from .base import *
from .features.meta_tensor import MetaTensorContainer
from deepspeed.model_implementations.transformers.ds_gpt import DeepSpeedGPTInference
import torch
from torch.nn.parameter import Parameter
from ..policy import TransformerPolicy
from ..policy import transformer_param_names
from ..policy import maybe_copy
from ..policy import maybe_copy_qkv
class DS_GPTNEOContainer(MetaTensorContainer, BaseTransformerContainer):
def __init__(self, **kwargs):
super().__init__(**kwargs)
# All model specific things should be defined here instead of the base class.
def create_module(self, config=None):
_config = config if config is not None else self.ds_model_config
self.module = DeepSpeedGPTInference(_config, mp_group=self.mp_group)
self.module.config.scale_attention = self.scale_attention
return self.module
def load_params(self, module, sd, weight_quantizer, mp_replace, prefix):
param_names = (
'attn.attention.q_proj.weight', \
'attn.attention.k_proj.weight', \
'attn.attention.v_proj.weight', \
'attn.attention.out_proj.weight', \
'attn.attention.out_proj.bias', \
'mlp.c_fc.weight', \
'mlp.c_fc.bias', \
'mlp.c_proj.weight', \
'mlp.c_proj.bias', \
'ln_2.weight', \
'ln_2.bias', \
'ln_1.weight', \
'ln_1.bias'
)
maybe_copy_qkv(
module.attention,
sd,
weight_quantizer,
mp_replace,
'attn_qkvw',
[prefix + param_names[0],
prefix + param_names[1],
prefix + param_names[2]],
split_qkv=self.policy.split_qkv)
for i in range(3, 5):
maybe_copy(module.attention,
sd,
weight_quantizer,
mp_replace,
transformer_param_names[i - 1],
prefix + param_names[i])
for i in range(5, 11):
maybe_copy(module.mlp,
sd,
weight_quantizer,
mp_replace,
transformer_param_names[i - 1],
prefix + param_names[i])
for i in range(11, 13):
maybe_copy(module,
sd,
weight_quantizer,
mp_replace,
transformer_param_names[i - 1],
prefix + param_names[i])
class HFGPTNEOLayerPolicy(TransformerPolicy):
def __init__(self, client_module, inference=True):
super().__init__(inference, scale_attention=False)
self.client_module = client_module
try:
import transformers
HFGPTNEOLayerPolicy._orig_layer_class = transformers.models.gpt_neo.modeling_gpt_neo.GPTNeoBlock
except:
HFGPTNEOLayerPolicy._orig_layer_class = None
def get_hidden_heads(self):
return self.client_module.attn.attention.q_proj.weight.shape[1], \
self.client_module.attn.attention.num_heads
def attention(self):
qw = self.client_module.attn.attention.q_proj.weight
kw = self.client_module.attn.attention.k_proj.weight
vw = self.client_module.attn.attention.v_proj.weight
qkvw = Parameter(torch.cat((qw, kw, vw), dim=0), requires_grad=False)
return qkvw, \
None, \
self.client_module.attn.attention.out_proj.weight, \
self.client_module.attn.attention.out_proj.bias
def mlp(self):
return self.client_module.mlp.c_fc.weight, \
self.client_module.mlp.c_fc.bias, \
self.client_module.mlp.c_proj.weight, \
self.client_module.mlp.c_proj.bias
def layernorm(self):
return self.client_module.ln_2.weight, \
self.client_module.ln_2.bias, \
self.client_module.ln_1.weight, \
self.client_module.ln_1.bias
'''Copyright The Microsoft DeepSpeed Team'''
from .base import *
from .features.meta_tensor import MetaTensorContainer
from .features.megatron import MegatronContainer
from deepspeed.model_implementations.transformers.ds_gpt import DeepSpeedGPTInference
import torch
from ..policy import TransformerPolicy
from ..policy import transformer_param_names
from ..policy import maybe_copy
from packaging import version as pkg_version
class DS_GPTNEOXContainer(MetaTensorContainer,
MegatronContainer,
BaseTransformerContainer):
def __init__(self, **kwargs):
super().__init__(**kwargs)
# All model specific things should be defined here instead of the base class.
def create_module(self, config=None):
_config = config if config is not None else self.ds_model_config
self.module = DeepSpeedGPTInference(_config, mp_group=self.mp_group)
self.module.config.scale_attention = self.scale_attention
if self.megatron_v2:
self.module.config.rotate_half = True
self.module.config.rotate_every_two = False
return self.module
def load_params(self, module, sd, weight_quantizer, mp_replace, prefix):
param_names = (
'attention.query_key_value.weight', \
'attention.query_key_value.bias', \
'attention.dense.weight', \
'attention.dense.bias', \
'mlp.dense_h_to_4h.weight', \
'mlp.dense_h_to_4h.bias', \
'mlp.dense_4h_to_h.weight', \
'mlp.dense_4h_to_h.bias', \
'post_attention_layernorm.weight', \
'post_attention_layernorm.bias', \
'input_layernorm.weight', \
'input_layernorm.bias'
)
for i in range(0, 2):
maybe_copy(module.attention,
sd,
weight_quantizer,
mp_replace,
transformer_param_names[i],
prefix + param_names[i],
qkv=True,
megatron_v2=self.policy.is_megatron_v2,
split_qkv=self.policy.split_qkv,
heads=self.policy.client_module.attention.num_attention_heads)
for i in range(2, 4):
maybe_copy(module.attention,
sd,
weight_quantizer,
mp_replace,
transformer_param_names[i],
prefix + param_names[i])
for i in range(4, 10):
maybe_copy(module.mlp,
sd,
weight_quantizer,
mp_replace,
transformer_param_names[i],
prefix + param_names[i])
for i in range(10, 12):
maybe_copy(module,
sd,
weight_quantizer,
mp_replace,
transformer_param_names[i],
prefix + param_names[i])
class GPTNEOXLayerPolicy(TransformerPolicy):
_orig_layer_class = None
version = 0
def __init__(self, client_module, inference=True, megatron_v2=True, split_qkv=False):
super().__init__(inference, megatron_v2=megatron_v2, split_qkv=split_qkv)
self.client_module = client_module
if GPTNEOXLayerPolicy._orig_layer_class is None:
if pkg_version.parse(torch.__version__) <= pkg_version.parse("1.2"):
GPTNEOXLayerPolicy._orig_layer_class = None
else:
try:
from transformers import GPTNeoXLayer
GPTNEOXLayerPolicy._orig_layer_class = GPTNeoXLayer
except ImportError:
GPTNEOXLayerPolicy._orig_layer_class = None
def get_hidden_heads(self):
if GPTNEOXLayerPolicy.version == 0:
attention = self.client_module.attention
else:
attention = self.client_module.self_attention
return attention.query_key_value.weight.shape[1], \
attention.num_attention_heads
def attention(self):
if GPTNEOXLayerPolicy.version == 0:
attention = self.client_module.attention
else:
attention = self.client_module.self_attention
return attention.query_key_value.weight, \
attention.query_key_value.bias, \
attention.dense.weight, \
attention.dense.bias
def mlp(self):
return self.client_module.mlp.dense_h_to_4h.weight, \
self.client_module.mlp.dense_h_to_4h.bias, \
self.client_module.mlp.dense_4h_to_h.weight, \
self.client_module.mlp.dense_4h_to_h.bias
def layernorm(self):
return self.client_module.post_attention_layernorm.weight, \
self.client_module.post_attention_layernorm.bias, \
self.client_module.input_layernorm.weight, \
self.client_module.input_layernorm.bias
'''Copyright The Microsoft DeepSpeed Team'''
from .base import *
from .features.megatron import MegatronContainer
from deepspeed.model_implementations.transformers.ds_megatron_gpt import DeepSpeedMegatronGPTInference
import torch
from ..policy import TransformerPolicy
from packaging import version as pkg_version
class DS_MegatronGPTContainer(MegatronContainer, BaseTransformerContainer):
def __init__(self, **kwargs):
super().__init__(**kwargs)
# All model specific things should be defined here instead of the base class.
def create_module(self, config=None):
_config = config if config is not None else self.ds_model_config
self.module = DeepSpeedMegatronGPTInference(_config, mp_group=self.mp_group)
self.module.config.scale_attention = self.scale_attention
if self.megatron_v2:
self.module.config.rotate_half = True
self.module.config.rotate_every_two = False
return self.module
# TODO: Megatron GPT MoE inherits from Megatron policy and replaces mlp
# TODO: Generalize MoE overall goal, expand beyond Megatron
class MegatronLayerPolicy(TransformerPolicy):
_orig_layer_class = None
version = 0
moe_type = 'standard'
megatron_v2 = True
use_mup = False
def __init__(self, client_module, inference=True):
super().__init__(inference,
megatron_v2=MegatronLayerPolicy.megatron_v2,
use_mup=MegatronLayerPolicy.use_mup)
self.client_module = client_module
# we use megatron version to differentiate between the old and new
# megatron-lm source code
if MegatronLayerPolicy._orig_layer_class is None:
if pkg_version.parse(torch.__version__) <= pkg_version.parse("1.2"):
MegatronLayerPolicy._orig_layer_class = None
else:
try:
from megatron.model.transformer import ParallelTransformerLayer
MegatronLayerPolicy._orig_layer_class = ParallelTransformerLayer
except ImportError:
MegatronLayerPolicy._orig_layer_class = None
def get_hidden_heads(self):
return self.client_module.attention.query_key_value.weight.shape[1], \
self.client_module.attention.num_attention_heads
def attention(self):
if self.inference:
if MegatronLayerPolicy.version == 0:
attention = self.client_module.attention
else:
attention = self.client_module.self_attention
return attention.query_key_value.weight, \
attention.query_key_value.bias, \
attention.dense.weight, \
attention.dense.bias
def mlp(self, moe_type='standard'):
from deepspeed.moe.utils import has_moe_layers
moe, _ = has_moe_layers(self.client_module)
if moe:
moe_experts = self.client_module.mlp.deepspeed_moe.experts.deepspeed_experts if moe_type == 'standard' else \
self.client_module.mlp.moe.deepspeed_moe.experts.deepspeed_experts
num_experts = len(moe_experts)
if moe_type == 'standard':
return [moe_experts[i].dense_h_to_4h.weight for i in range(num_experts)], \
[moe_experts[i].dense_h_to_4h.bias for i in range(num_experts)], \
[moe_experts[i].dense_4h_to_h.weight for i in range(num_experts)], \
[moe_experts[i].dense_4h_to_h.bias for i in range(num_experts)]
else:
return [moe_experts[i].dense_h_to_4h.weight for i in range(num_experts)], \
[moe_experts[i].dense_h_to_4h.bias for i in range(num_experts)], \
[moe_experts[i].dense_4h_to_h.weight for i in range(num_experts)], \
[moe_experts[i].dense_4h_to_h.bias for i in range(num_experts)], \
self.client_module.mlp.mlp.dense_h_to_4h.weight, \
self.client_module.mlp.mlp.dense_h_to_4h.bias, \
self.client_module.mlp.mlp.dense_4h_to_h.weight, \
self.client_module.mlp.mlp.dense_4h_to_h.bias, \
self.client_module.mlp.coefficient.weight
else:
return self.client_module.mlp.dense_h_to_4h.weight, \
self.client_module.mlp.dense_h_to_4h.bias, \
self.client_module.mlp.dense_4h_to_h.weight, \
self.client_module.mlp.dense_4h_to_h.bias
def layernorm(self):
return self.client_module.post_attention_layernorm.weight, \
self.client_module.post_attention_layernorm.bias, \
self.client_module.input_layernorm.weight, \
self.client_module.input_layernorm.bias
'''Copyright The Microsoft DeepSpeed Team'''
from .base import *
from .base_moe import *
from .features.megatron import MegatronContainer
from deepspeed.model_implementations.transformers.ds_megatron_gpt import DeepSpeedMegatronGPTInference
import torch
from .megatron_gpt import MegatronLayerPolicy
from packaging import version as pkg_version
class DS_MegatronGPTMoEContainer(MegatronContainer, BaseTransformerMoEContainer):
def __init__(self, policy, config, model_config, layer_id):
super().__init__(policy, config, model_config, layer_id)
# All model specific things should be defined here instead of the base class.
def create_module(self, config=None):
_config = config if config is not None else self.ds_model_config
self.module = DeepSpeedMegatronGPTInference(_config, mp_group=self.mp_group)
self.module.config.scale_attention = self.scale_attention
if self.megatron_v2:
self.module.config.rotate_half = True
self.module.config.rotate_every_two = False
return self.module
# TODO: Megatron GPT MoE inherits from Megatron policy and replaces mlp
# TODO: Generalize MoE overall goal, expand beyond Megatron
class MegatronMoELayerPolicy(MegatronLayerPolicy):
_orig_layer_class = None
version = 0
moe_type = 'standard'
num_experts = 1
def __init__(self, client_module, inference=True):
super().__init__(inference)
self.client_module = client_module
# we use megatron version to differentiate between the old and new
# megatron-lm source code
if MegatronMoELayerPolicy._orig_layer_class is None:
if pkg_version.parse(torch.__version__) <= pkg_version.parse("1.2"):
MegatronMoELayerPolicy._orig_layer_class = None
else:
try:
from megatron.model.transformer import ParallelTransformerLayer
MegatronMoELayerPolicy._orig_layer_class = ParallelTransformerLayer
except ImportError:
MegatronMoELayerPolicy._orig_layer_class = None
def get_num_experts(self):
return self.num_experts
def mlp(self, moe_type='standard'):
# for now, all of this is tightly coupled to megatron-deepspeed moe implementation
# todo: think and refactor this to be more general
#from deepspeed.moe.utils import has_moe_layers
#moe, _ = has_moe_layers(self.client_module)
moe_experts = self.client_module.mlp.deepspeed_moe.experts.deepspeed_experts if moe_type == 'standard' else \
self.client_module.mlp.moe.deepspeed_moe.experts.deepspeed_experts
num_experts = len(moe_experts)
self.num_experts = num_experts
if moe_type == 'standard':
return [moe_experts[i].dense_h_to_4h.weight for i in range(num_experts)], \
[moe_experts[i].dense_h_to_4h.bias for i in range(num_experts)], \
[moe_experts[i].dense_4h_to_h.weight for i in range(num_experts)], \
[moe_experts[i].dense_4h_to_h.bias for i in range(num_experts)]
else:
return [moe_experts[i].dense_h_to_4h.weight for i in range(num_experts)], \
[moe_experts[i].dense_h_to_4h.bias for i in range(num_experts)], \
[moe_experts[i].dense_4h_to_h.weight for i in range(num_experts)], \
[moe_experts[i].dense_4h_to_h.bias for i in range(num_experts)], \
self.client_module.mlp.mlp.dense_h_to_4h.weight, \
self.client_module.mlp.mlp.dense_h_to_4h.bias, \
self.client_module.mlp.mlp.dense_4h_to_h.weight, \
self.client_module.mlp.mlp.dense_4h_to_h.bias, \
self.client_module.mlp.coefficient.weight
'''Copyright The Microsoft DeepSpeed Team'''
from .base import *
from .features.meta_tensor import MetaTensorContainer
from deepspeed.model_implementations.transformers.ds_opt import DeepSpeedOPTInference
import torch
from torch.nn.parameter import Parameter
from ..policy import TransformerPolicy
from ..policy import transformer_param_names
from ..policy import maybe_copy
from ..policy import maybe_copy_qkv
from deepspeed.utils.types import ActivationFuncType
class DS_OPTContainer(MetaTensorContainer, BaseTransformerContainer):
def __init__(self, **kwargs):
super().__init__(**kwargs)
# All model specific things should be defined here instead of the base class.
def create_module(self, config=None):
_config = config if config is not None else self.ds_model_config
self.module = DeepSpeedOPTInference(_config, mp_group=self.mp_group)
self.module.config.scale_attention = self.scale_attention
return self.module
def load_params(self, module, sd, weight_quantizer, mp_replace, prefix):
param_names = (
'self_attn.q_proj.weight', \
'self_attn.k_proj.weight', \
'self_attn.v_proj.weight', \
'self_attn.q_proj.bias', \
'self_attn.k_proj.bias', \
'self_attn.v_proj.bias', \
'self_attn.out_proj.weight', \
'self_attn.out_proj.bias', \
'fc1.weight', \
'fc1.bias', \
'fc2.weight', \
'fc2.bias', \
'final_layer_norm.weight', \
'final_layer_norm.bias', \
'self_attn_layer_norm.weight', \
'self_attn_layer_norm.bias'
)
for i in range(0, 6, 3):
maybe_copy_qkv(module.attention,
sd,
weight_quantizer,
mp_replace,
transformer_param_names[i // 3],
[
prefix + param_names[i],
prefix + param_names[i + 1],
prefix + param_names[i + 2]
],
split_qkv=self.policy.split_qkv)
for i in range(6, 8):
maybe_copy(module.attention,
sd,
weight_quantizer,
mp_replace,
transformer_param_names[i - 4],
prefix + param_names[i])
for i in range(8, 14):
maybe_copy(module.mlp,
sd,
weight_quantizer,
mp_replace,
transformer_param_names[i - 4],
prefix + param_names[i])
for i in range(14, 16):
maybe_copy(module,
sd,
weight_quantizer,
mp_replace,
transformer_param_names[i - 4],
prefix + param_names[i])
class HFOPTLayerPolicy(TransformerPolicy):
_orig_layer_class = None
def __init__(self, client_module, inference=True, use_load_prefix=True):
super().__init__(inference,
linear_layer=True,
mlp_act_func_type=ActivationFuncType.ReLU,
pre_attn_norm=True,
use_load_prefix=use_load_prefix)
self.client_module = client_module
try:
import transformers
HFOPTLayerPolicy._orig_layer_class = transformers.models.opt.modeling_opt.OPTDecoderLayer
if isinstance(TransformerPolicy.hf_model_config,
transformers.models.opt.configuration_opt.OPTConfig):
self.pre_attn_norm = TransformerPolicy.hf_model_config.do_layer_norm_before
except:
HFOPTLayerPolicy._orig_layer_class = None
def get_hidden_heads(self):
return self.client_module.self_attn.embed_dim, \
self.client_module.self_attn.num_heads
def attention(self):
qw = self.client_module.self_attn.q_proj.weight
qb = self.client_module.self_attn.q_proj.bias
kw = self.client_module.self_attn.k_proj.weight
kb = self.client_module.self_attn.k_proj.bias
vw = self.client_module.self_attn.v_proj.weight
vb = self.client_module.self_attn.v_proj.bias
qkvw = Parameter(torch.cat((qw, kw, vw), dim=0), requires_grad=False)
qkvb = Parameter(torch.cat((qb, kb, vb), dim=0), requires_grad=False)
return qkvw, \
qkvb, \
self.client_module.self_attn.out_proj.weight, \
self.client_module.self_attn.out_proj.bias
def mlp(self):
return self.client_module.fc1.weight, \
self.client_module.fc1.bias, \
self.client_module.fc2.weight, \
self.client_module.fc2.bias
def layernorm(self):
return self.client_module.final_layer_norm.weight, \
self.client_module.final_layer_norm.bias, \
self.client_module.self_attn_layer_norm.weight, \
self.client_module.self_attn_layer_norm.bias
'''
Copyright 2022 The Microsoft DeepSpeed Team
'''
import torch
from torch.nn.parameter import Parameter
from ..policy import DSPolicy
from ...model_implementations.diffusers.unet import DSUNet
class UNetPolicy(DSPolicy):
def __init__(self):
super().__init__()
try:
import diffusers
self._orig_layer_class = diffusers.models.unet_2d_condition.UNet2DConditionModel
except ImportError:
self._orig_layer_class = None
def match(self, module):
return isinstance(module, self._orig_layer_class)
def match_replaced(self, module):
return isinstance(module, DSUNet)
def apply(self, module, enable_cuda_graph=True):
# TODO(cmikeh2): Enable cuda graph should be an inference configuration
return DSUNet(module, enable_cuda_graph=enable_cuda_graph)
def attention(self, client_module):
qw = client_module.to_q.weight
kw = client_module.to_k.weight
vw = client_module.to_v.weight
if qw.shape[1] == kw.shape[1]:
qkvw = Parameter(torch.cat((qw, kw, vw), dim=0), requires_grad=False)
return qkvw, \
client_module.to_out[0].weight, \
client_module.to_out[0].bias, \
qw.shape[-1], \
client_module.heads
else:
#return None
#kvw = Parameter(torch.cat((kw, vw), dim=0), requires_grad=False)
return qw, \
kw, vw, \
client_module.to_out[0].weight, \
client_module.to_out[0].bias, \
qw.shape[-1], \
client_module.heads
'''
Copyright 2022 The Microsoft DeepSpeed Team
'''
from ..policy import DSPolicy
from ...model_implementations.diffusers.vae import DSVAE
class VAEPolicy(DSPolicy):
def __init__(self):
super().__init__()
try:
import diffusers
if hasattr(diffusers.models.vae, "AutoencoderKL"):
self._orig_layer_class = diffusers.models.vae.AutoencoderKL
else:
# Diffusers >= 0.12.0 changes location of AutoencoderKL
self._orig_layer_class = diffusers.models.autoencoder_kl.AutoencoderKL
except ImportError:
self._orig_layer_class = None
def match(self, module):
return isinstance(module, self._orig_layer_class)
def match_replaced(self, module):
return isinstance(module, DSVAE)
def apply(self, module, enable_cuda_graph=True):
# TODO(cmikeh2): Enable cuda graph should be an inference configuration
return DSVAE(module, enable_cuda_graph=enable_cuda_graph)
# NOTE (lekurile): Should we have a diffusers policy class?
def attention(self):
pass
'''Copyright The Microsoft DeepSpeed Team'''
import copy
import torch
from deepspeed.ops.transformer import DeepSpeedTransformerLayer, DeepSpeedTransformerConfig
......
'''Copyright The Microsoft DeepSpeed Team'''
import torch
from deepspeed import comm as dist
from torch import nn
from torch.nn import functional as F
from torch.nn.parameter import Parameter
from deepspeed.accelerator import get_accelerator
class LinearAllreduce(nn.Module):
def __init__(self, weight, bias=None, mp_group=None):
super(LinearAllreduce, self).__init__()
self.weight = weight
self.bias = bias
self.mp_group = mp_group
def forward(self, input):
output = torch.matmul(input, self.weight.transpose(-1, -2))
if self.mp_group is not None:
dist.all_reduce(output, group=self.mp_group)
if self.bias is not None:
output += self.bias
return output
class LinearLayer(nn.Module):
def __init__(self, weight_shape=None, dtype=torch.half, weight=None, bias=None):
super(LinearLayer, self).__init__()
if weight is not None:
self.weight = weight
self.bias = bias
else:
self.weight = Parameter(
torch.empty(weight_shape,
dtype=dtype,
device=get_accelerator().current_device_name()))
self.bias = Parameter(
torch.empty(weight_shape[0],
dtype=dtype,
device=get_accelerator().current_device_name())) \
if bias is not None else None
def forward(self, input):
output = torch.matmul(input, self.weight.transpose(-1, -2))
if self.bias is not None:
output += self.bias
return output
class Normalize(nn.Module):
def __init__(self, dim, dtype=torch.float, eps=1e-5):
super(Normalize, self).__init__()
self.norm = nn.LayerNorm(dim,
eps=eps).to(dtype).to(
get_accelerator().current_device_name())
self.weight = self.norm.weight
self.bias = self.norm.bias
def forward(self, input):
return self.norm(input)
class EmbeddingLayer(nn.Module):
def __init__(self, weight_shape, dtype=torch.half):
super(EmbeddingLayer, self).__init__()
self.weight = Parameter(
torch.empty(weight_shape[0],
weight_shape[1],
dtype=dtype,
device=get_accelerator().current_device_name()))
def forward(self, input):
return F.embedding(input, self.weight)
class OPTEmbedding(EmbeddingLayer):
"""
This module learns positional embeddings up to a fixed maximum size.
"""
def __init__(self, weight_shape):
# OPT is set up so that if padding_idx is specified, the embedding ids are offset by 2
# and num_embeddings is adjusted accordingly. Other models don't have this hack
self.offset = 2
super().__init__(weight_shape)
def forward(self, attention_mask: torch.LongTensor, past_key_values_length: int = 0):
"""`input_ids_shape` is expected to be [bsz x seqlen]."""
attention_mask = attention_mask.long()
# create positions depending on attention_mask
positions = (torch.cumsum(attention_mask,
dim=1).type_as(attention_mask) *
attention_mask).long() - 1
# cut positions if `past_key_values_length` is > 0
positions = positions[:, past_key_values_length:]
return super().forward(positions + self.offset)
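A minimal sketch (not part of this commit) of the position computation above, assuming a toy attention mask with two padding positions:

import torch

attention_mask = torch.tensor([[1, 1, 1, 0, 0]]).long()
# cumulative sum over the mask, re-masked, minus one -> positions of real tokens
positions = (torch.cumsum(attention_mask, dim=1).type_as(attention_mask) * attention_mask).long() - 1
print(positions)      # tensor([[ 0,  1,  2, -1, -1]])
# OPT offsets all ids by 2 before the embedding lookup
print(positions + 2)  # tensor([[2, 3, 4, 1, 1]])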
'''Copyright The Microsoft DeepSpeed Team'''
from torch import nn
from deepspeed.model_implementations.transformers.ds_bloom import DeepSpeedBloomInference
from deepspeed.model_implementations.transformers.ds_gpt import DeepSpeedGPTInference
from deepspeed.model_implementations.transformers.ds_bert import DeepSpeedBERTInference
from deepspeed.model_implementations.transformers.ds_megatron_gpt import DeepSpeedMegatronGPTInference
from deepspeed.model_implementations.transformers.ds_opt import DeepSpeedOPTInference
import deepspeed.ops.transformer as transformer_inference
from .layers import LinearLayer, Normalize, EmbeddingLayer, OPTEmbedding
import torch
import gc
from deepspeed.accelerator import get_accelerator
def load_model_with_checkpoint(r_module,
sd,
mp_replace,
ckpt_type,
ckpt_mp_size,
weight_quantizer=None,
rank=0,
container=None):
error_msgs = []
def transpose(data):
with torch.no_grad():
data = data.contiguous()
data1 = data.transpose(-1, -2).reshape(-1)
data.reshape(-1).copy_(data1)
data1 = None
return data.reshape(data.shape[-1], data.shape[-2])
def load(module, prefix):
args = (sd[0], prefix, {}, True, [], [], error_msgs)
if hasattr(module, 'weight'):
module.weight = mp_replace.copy(module.weight.data, sd[0][prefix + 'weight'])
if prefix + 'bias' in sd[0].keys():
if module.bias.data.is_meta:
# a meta tensor cannot be cast or copied to, so we need to replace it with a normal tensor here
module.bias = torch.nn.parameter.Parameter(
data=torch.empty_like(module.bias.data,
device="cpu"),
requires_grad=module.bias.data.requires_grad)
module.bias = mp_replace.copy(module.bias.data, sd[0][prefix + 'bias'])
args = None
gc.collect()
def load_transformer_layer(module, prefix):
if ckpt_type == "tp":
def load_parameters(module, prefix):
for n, p in module.named_parameters():
if prefix + n in sd[0] and len(n.split('.')) == 1:
if type(sd[0][prefix + n]) is list:
tmp_data, scale = sd[0][prefix + n]
tmp_data = tmp_data
scale = scale.to(get_accelerator().current_device_name())
# set the quantizer number of groups using the checkpoint scale shape
weight_quantizer.num_groups = scale.shape[0]
else:
tmp_data = sd[0][prefix + n].to(
get_accelerator().current_device_name())
scale = None
src_shape = tmp_data.shape
dst_shape = p.shape
inner_dim = 1 if tmp_data.dtype == torch.int8 else 0
outer_dim = 0 if tmp_data.dtype == torch.int8 else 1
if (len(src_shape) == 2 and len(dst_shape) == 2):
if (src_shape[inner_dim] == dst_shape[0]
and src_shape[outer_dim] == dst_shape[1]):
if tmp_data.dtype != torch.int8:
p = weight_quantizer.quantize(
transpose(tmp_data) if weight_quantizer.
q_int8 else tmp_data)
else:
p = torch.nn.parameter.Parameter(tmp_data,
requires_grad=False)
p.scale = scale
setattr(module, n, p)
else:
dim = inner_dim if src_shape[inner_dim] != dst_shape[
0] else outer_dim
dim1 = 0 if src_shape[inner_dim] != dst_shape[0] else 1
if src_shape[dim] > dst_shape[dim1]:
weight_partition = torch.split(
tmp_data,
dst_shape[dim1],
dim=dim)[rank].to(
get_accelerator().current_device_name())
assert tmp_data.dtype != torch.int8 or scale.numel() > weight_quantizer.num_groups * (rank+1), \
'''ERROR: We require the quantization scales for a larger TP size when loading an INT8 checkpoint!\
Please use the FP16 checkpoint to generate the INT8 checkpoint with the desired sharding parameters!'''
scale = scale.view(
-1)[weight_quantizer.num_groups *
(rank + 1):].reshape(
weight_quantizer.num_groups,
-1).contiguous()
else:
assert tmp_data.dtype != torch.int8, \
'''Merging of the checkpoints is not supported when using an INT8 checkpoint! \
Please use as many GPUs as the TP size of the checkpoint'''
all_data = [
sd[j][prefix +
n] if type(sd[j][prefix + n]) is list else
sd[j][prefix + n].to(
get_accelerator().current_device_name())
for j in range(len(sd))
]
# Check if the weight tensor is for the QKV parameter
if src_shape[1] == (3 *
src_shape[0]) // ckpt_mp_size:
qkv_size = src_shape[outer_dim] // 3
src_split = [
torch.split(src[0].data,
qkv_size,
dim=outer_dim)
for src in all_data
]
weight_partition = torch.cat([
torch.cat([qkv_s[i] for qkv_s in src_split],
axis=outer_dim)
for i in range(len(src_split[0]))
],
dim=dim)
else:
weight_partition = torch.cat([
ad[0].to(
get_accelerator().current_device_name())
if type(ad) is list else ad
for ad in all_data
],
dim=dim)
if tmp_data.dtype == torch.int8:
scale = torch.cat([
ad[1].to(
get_accelerator().current_device_name())
for ad in all_data
],
dim=dim)
if tmp_data.dtype != torch.int8:
weight_partition = weight_quantizer.quantize(
transpose(weight_partition), \
parallel_dim=(0 if dim == 1 else 1)) if weight_quantizer.q_int8 else \
weight_quantizer.quantize(weight_partition)
else:
weight_partition = torch.nn.parameter.Parameter(
weight_partition,
requires_grad=False)
weight_partition.scale = scale
setattr(module, n, weight_partition)
else:
if src_shape[0] == dst_shape[0]:
p.data.copy_(tmp_data)
else:
if src_shape[0] > dst_shape[0]:
bias_split = torch.split(
tmp_data,
dst_shape[-1])[rank].to(get_accelerator(
).current_device_name()).contiguous()
p.data.copy_(bias_split)
else:
# Check if the weight tensor is for the QKV parameter
if src_shape[0] == (3 * r_module.config.hidden_size
) // ckpt_mp_size:
qkv_size = src_shape[0] // 3
src_split = [
torch.split(sd[j][prefix + n],
qkv_size,
dim=0) for j in range(len(sd))
]
p.data.copy_(
torch.cat(
[
torch.cat([
qkv_s[i] for qkv_s in src_split
],
axis=0)
for i in range(len(src_split[0]))
],
dim=0).to(get_accelerator(
).current_device_name()).contiguous())
else:
p.data.copy_(
torch.cat(
[
sd[j][prefix + n]
for j in range(len(sd))
],
dim=0).to(get_accelerator(
).current_device_name()).contiguous())
load_parameters(module, prefix)
for n, child in module.named_children():
load_parameters(child, prefix + n + '.')
else:
container.load_params(module, sd[0], weight_quantizer, mp_replace, prefix)
try:
import transformers
OPTLearnedPositionalEmbedding = transformers.models.opt.modeling_opt.OPTLearnedPositionalEmbedding
except:
OPTLearnedPositionalEmbedding = None
layer_policies = {
nn.Linear: load,
nn.Embedding: load,
nn.LayerNorm: load,
EmbeddingLayer: load,
LinearLayer: load,
Normalize: load,
transformer_inference.DeepSpeedTransformerInference: load_transformer_layer,
DeepSpeedBloomInference: load_transformer_layer,
DeepSpeedGPTInference: load_transformer_layer,
DeepSpeedBERTInference: load_transformer_layer,
DeepSpeedMegatronGPTInference: load_transformer_layer,
DeepSpeedOPTInference: load_transformer_layer,
OPTLearnedPositionalEmbedding: load,
OPTEmbedding: load
}
all_ds_ids = {}
def load_module_recursive(module, prefix='', level=0):
for name, child in module.named_children():
if child.__class__ in layer_policies:
checking_key = prefix + name + '.'
if not any(checking_key in item for item in sd[0].keys()):
if hasattr(child, 'weight') and \
(hasattr(child.weight, 'ds_id') and \
child.weight.ds_id in all_ds_ids):
prefix1 = all_ds_ids[child.weight.ds_id]
if child.__class__ is nn.Linear:
child = LinearLayer(weight=all_ds_ids[child.weight.ds_id])
setattr(module, name, child)
continue
child_params = list(child.parameters())
if len(child_params) > 0 and (child_params[0].numel() == 0
or child_params[0].is_meta):
if child.weight.is_meta:
ds_shape = child.weight.shape
else:
ds_shape = child.weight.ds_shape
if child.__class__ is nn.LayerNorm:
child = Normalize(dim=ds_shape[-1],
dtype=child.weight.dtype,
eps=child.eps)
setattr(module, name, child)
elif child.__class__ is nn.Linear:
child = LinearLayer(weight_shape=child.weight.shape,
bias=child.bias)
setattr(module, name, child)
elif child.__class__ is OPTLearnedPositionalEmbedding:
child = OPTEmbedding(weight_shape=ds_shape)
setattr(module, name, child)
else:
ds_id = None
if hasattr(child.weight, 'ds_id'):
ds_id = child.weight.ds_id
child = EmbeddingLayer(weight_shape=ds_shape,
dtype=child.weight.dtype)
if ds_id is not None:
all_ds_ids[ds_id] = child.weight
setattr(module, name, child)
layer_policies[child.__class__](child, prefix + name + '.')
else:
load_module_recursive(
child,
prefix if (level == 0 and ckpt_type == 'pp') and container.policy.use_load_prefix else \
prefix + name + '.',
level + 1)
load_module_recursive(r_module)
embedding_weight = None
for n, p in r_module.named_parameters():
if "word_embeddings." in n or "embed_tokens." in n or "wte." in n:
embedding_weight = p
if embedding_weight is not None and r_module.lm_head.weight.is_meta:
r_module.lm_head.weight = embedding_weight
for sd_ in sd:
del sd_
sd = None
gc.collect()