Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
OpenDAS
deepspeed
Commits
67ea635f
Commit
67ea635f
authored
Mar 30, 2023
by
aiss
Browse files
push dsv0.8.2 version
parent
1b2721ad
Pipeline
#201
failed with stages
in 0 seconds
Changes
339
Pipelines
2
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
1792 additions
and
0 deletions
+1792
-0
deepspeed/module_inject/containers/base_moe.py
deepspeed/module_inject/containers/base_moe.py
+141
-0
deepspeed/module_inject/containers/bert.py
deepspeed/module_inject/containers/bert.py
+81
-0
deepspeed/module_inject/containers/bloom.py
deepspeed/module_inject/containers/bloom.py
+128
-0
deepspeed/module_inject/containers/clip.py
deepspeed/module_inject/containers/clip.py
+66
-0
deepspeed/module_inject/containers/distil_bert.py
deepspeed/module_inject/containers/distil_bert.py
+75
-0
deepspeed/module_inject/containers/features/__init__.py
deepspeed/module_inject/containers/features/__init__.py
+4
-0
deepspeed/module_inject/containers/features/megatron.py
deepspeed/module_inject/containers/features/megatron.py
+37
-0
deepspeed/module_inject/containers/features/meta_tensor.py
deepspeed/module_inject/containers/features/meta_tensor.py
+58
-0
deepspeed/module_inject/containers/gpt2.py
deepspeed/module_inject/containers/gpt2.py
+54
-0
deepspeed/module_inject/containers/gptj.py
deepspeed/module_inject/containers/gptj.py
+110
-0
deepspeed/module_inject/containers/gptneo.py
deepspeed/module_inject/containers/gptneo.py
+111
-0
deepspeed/module_inject/containers/gptneox.py
deepspeed/module_inject/containers/gptneox.py
+129
-0
deepspeed/module_inject/containers/megatron_gpt.py
deepspeed/module_inject/containers/megatron_gpt.py
+106
-0
deepspeed/module_inject/containers/megatron_gpt_moe.py
deepspeed/module_inject/containers/megatron_gpt_moe.py
+82
-0
deepspeed/module_inject/containers/opt.py
deepspeed/module_inject/containers/opt.py
+134
-0
deepspeed/module_inject/containers/unet.py
deepspeed/module_inject/containers/unet.py
+51
-0
deepspeed/module_inject/containers/vae.py
deepspeed/module_inject/containers/vae.py
+33
-0
deepspeed/module_inject/inject.py
deepspeed/module_inject/inject.py
+2
-0
deepspeed/module_inject/layers.py
deepspeed/module_inject/layers.py
+101
-0
deepspeed/module_inject/load_checkpoint.py
deepspeed/module_inject/load_checkpoint.py
+289
-0
No files found.
Too many changes to show.
To preserve performance only
339 of 339+
files are displayed.
Plain diff
Email patch
deepspeed/module_inject/containers/base_moe.py
0 → 100644
View file @
67ea635f
'''Copyright The Microsoft DeepSpeed Team'''
# Create a container object to save model-specific tensors using the policy file above.
from
.base
import
*
from
deepspeed
import
comm
as
dist
import
deepspeed.ops.transformer
as
transformer_inference
from
deepspeed.accelerator
import
get_accelerator
class BaseTransformerMoEContainer(BaseTransformerContainer):
    """Container mapping the tensors of a client Mixture-of-Experts (MoE)
    transformer layer onto the DeepSpeed MoE inference module.

    Expert MLP tensors are held as lists (one entry per local expert);
    residual-MoE models additionally carry a residual MLP and a routing
    coefficient tensor.
    """

    def __init__(self, **kwargs):
        # Call the init function of the parent class to initialize the tensors and configs from parent class
        super().__init__(**kwargs)

        self.num_experts = self.policy.get_num_experts()
        self.ep_world_size = dist.get_world_size()
        # Every rank hosts at least one expert; otherwise experts are split evenly.
        self.local_ep_size = 1 if self.num_experts < self.ep_world_size else self.num_experts // self.ep_world_size

        # BUGFIX: the original assignment ended with a stray trailing comma,
        # which silently turned layer_norm_eps into a 1-element tuple instead
        # of a float before it was handed to DeepSpeedMoEInferenceConfig.
        self.layer_norm_eps = self.config.layer_norm_eps if hasattr(self.config, 'layer_norm_eps') else 1e-12

        # MoE models will have a list of mlp related tensors
        self._h4h_w = []
        self._h4h_b = []
        self._4hh_w = []
        self._4hh_b = []

        # Residual MoE needs extra parameters
        self._res_h4h_w = None
        self._res_h4h_b = None
        self._res_4hh_w = None
        self._res_4hh_b = None
        self._res_coef = None

    def create_ds_model_config(self):
        """Build, cache and return the DeepSpeedMoEInferenceConfig for this layer."""
        self.set_hidden_heads(*self.policy.get_hidden_heads())
        assert self.num_attention_heads % self.mp_size == 0,\
            "To run the model parallel across the GPUs, the attention_heads require to be divisible by the world_size!" +\
            "This is because the attention computation is partitioned evenly among the parallel GPUs."

        self.ds_model_config = transformer_inference.DeepSpeedMoEInferenceConfig(
            hidden_size=self.hidden_size,
            heads=self.num_attention_heads,
            layer_norm_eps=self.layer_norm_eps,
            fp16=self.fp16,
            pre_layer_norm=self.pre_layer_norm,
            mp_size=self.mp_size,
            q_int8=self.quantize,
            moe_experts=self.local_ep_size,
            global_experts=self.num_experts,
            mlp_type=self.config.moe.type,
            scale_attn_by_inverse_layer_idx=self.scale_attn_by_inverse_layer_idx,
        )

        return self.ds_model_config

    def initialize_tensors(self):
        # Set the tensors from policy (user module) to container (DS module)
        self.set_attention(*self.policy.attention())
        self.set_mlp(self.config.moe.type)
        self.set_layernorm(*self.policy.layernorm())

    def set_mlp(self, config_moe_type):
        """Pull the MLP tensors from the policy.

        'standard' MoE returns only the expert tensors; other (residual)
        types additionally return the residual-MLP tensors and coefficient.
        """
        if config_moe_type == 'standard':
            self._h4h_w, self._h4h_b, \
            self._4hh_w, self._4hh_b = self.policy.mlp()
        else:
            self._h4h_w, self._h4h_b, self._4hh_w, \
            self._4hh_b, self._res_h4h_w, self._res_h4h_b, \
            self._res_4hh_w, self._res_4hh_b, \
            self._res_coef = self.policy.mlp(config_moe_type)

    def transpose(self):
        self.transpose_attention()
        self.transpose_mlp()

        if self.config.moe.type == 'residual':
            self.transpose_residual()

    def transpose_mlp(self):
        # Expert weights are lists: transpose every expert's tensors.
        self._h4h_w = [self.transpose_impl(moe_w1.data) for moe_w1 in self._h4h_w]
        self._4hh_w = [self.transpose_impl(moe_w1.data) for moe_w1 in self._4hh_w]

    def transpose_residual(self):
        self._res_h4h_w.data = self.transpose_impl(self._res_h4h_w.data)
        self._res_4hh_w.data = self.transpose_impl(self._res_4hh_w.data)
        self._res_coef.data = self.transpose_impl(self._res_coef.data)

    def apply_tensor_parallelism(self, mp_replace):
        # setup the new Attention module
        self.attention_qkv_mp(mp_replace)
        self.attention_o_mp(mp_replace)

        # quantize attention weights
        self.attention_quantization()

        # setup the new MLP module
        self.mlp_mp()

    def mlp_mp(self):
        """Place this rank's slice of the expert MLP tensors on the device."""
        gpu_index = dist.get_rank()
        # Hoisted out of the loop: device name is loop-invariant.
        device = get_accelerator().current_device_name()
        for ep_index in range(self.local_ep_size):
            # Global index of the expert served by this (rank, local-slot) pair.
            global_expert = gpu_index * self.local_ep_size + ep_index
            # mlp inter
            self.module.mlp[ep_index].inter_w.data = self._h4h_w[global_expert].to(device)
            self.module.mlp[ep_index].inter_b.data = self._h4h_b[global_expert].to(device)
            # mlp output
            self.module.mlp[ep_index].output_w.data = self._4hh_w[global_expert].to(device)
            self.module.mlp[ep_index].output_b.data = self._4hh_b[global_expert].to(device)

    def copy_data_to_new_module(self):
        """Copy layer norms (and residual-MoE tensors, if any) into the DS module."""
        device = get_accelerator().current_device_name()
        self.module.attn_nw.data = self.attn_nw.to(device)
        self.module.attn_nb.data = self.attn_nb.to(device)

        self.module.norm_w.data.copy_(self.input_nw.to(device))
        self.module.norm_b.data.copy_(self.input_nb.to(device))

        if self.config.moe.type == 'residual':
            self.module.res_mlp.inter_w.data = self._res_h4h_w.to(device)
            self.module.res_mlp.inter_b.data = self._res_h4h_b.to(device)
            self.module.res_mlp.output_w.data = self._res_4hh_w.to(device)
            self.module.res_mlp.output_b.data = self._res_4hh_b.to(device)
            self.module.res_coef.data = self._res_coef.to(device)
deepspeed/module_inject/containers/bert.py
0 → 100644
View file @
67ea635f
'''Copyright The Microsoft DeepSpeed Team'''
from
.base
import
*
from
deepspeed.model_implementations.transformers.ds_bert
import
DeepSpeedBERTInference
import
torch
from
torch.nn.parameter
import
Parameter
from
..policy
import
TransformerPolicy
class DS_BERTContainer(BaseTransformerContainer):
    """Container wiring a BERT-style layer into DeepSpeed inference."""

    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        # All model specific things should be defined here instead of the base class.
        self.return_tuple = True
        self.triangular_masking = False

    def create_module(self, config=None):
        """Build and return the DeepSpeed BERT inference module."""
        module_config = self.ds_model_config if config is None else config
        self.module = DeepSpeedBERTInference(module_config, mp_group=self.mp_group)
        self.module.config.scale_attention = self.scale_attention
        return self.module
class HFBertLayerPolicy(TransformerPolicy):
    """Policy mapping HuggingFace BERT/RoBERTa layers onto DS inference."""

    def __init__(self, client_module, inference=False):
        super().__init__(inference, pre_attn_norm=False)
        self.client_module = client_module
        self.cuda_graph_supported = True

        if HFBertLayerPolicy._orig_layer_class is None:
            try:
                import transformers
                HFBertLayerPolicy._orig_layer_class = [
                    transformers.models.bert.modeling_bert.BertLayer,
                    transformers.models.roberta.modeling_roberta.RobertaLayer,
                ]
            except:
                HFBertLayerPolicy._orig_layer_class = None

    def get_hidden_heads(self):
        """Return (hidden_size, num_attention_heads) of the client layer."""
        self_attn = self.client_module.attention.self
        return self_attn.query.weight.shape[1], \
               self_attn.num_attention_heads

    def attention(self):
        """Return fused (qkv_weight, qkv_bias, out_weight, out_bias)."""
        self_attn = self.client_module.attention.self
        qkvw = Parameter(torch.cat((self_attn.query.weight,
                                    self_attn.key.weight,
                                    self_attn.value.weight),
                                   dim=0),
                         requires_grad=False)
        qkvb = Parameter(torch.cat((self_attn.query.bias,
                                    self_attn.key.bias,
                                    self_attn.value.bias),
                                   dim=0),
                         requires_grad=False)
        attn_out = self.client_module.attention.output.dense
        return qkvw, qkvb, attn_out.weight, attn_out.bias,

    def mlp(self):
        """Return (inter_w, inter_b, out_w, out_b) of the feed-forward block."""
        if self.pre_attn_norm:
            intermediate = self.client_module.intermediate.dense_act
        else:
            intermediate = self.client_module.intermediate.dense
        out = self.client_module.output.dense
        return intermediate.weight, intermediate.bias, out.weight, out.bias

    def layernorm(self):
        """Return (attn_ln_w, attn_ln_b, ln_w, ln_b); which modules hold them
        depends on pre- vs post-attention normalization."""
        if self.pre_attn_norm:
            attn_ln = self.client_module.PostAttentionLayerNorm
            input_ln = self.client_module.PreAttentionLayerNorm
        else:
            attn_ln = self.client_module.attention.output.LayerNorm
            input_ln = self.client_module.output.LayerNorm
        return attn_ln.weight, attn_ln.bias, input_ln.weight, input_ln.bias
deepspeed/module_inject/containers/bloom.py
0 → 100644
View file @
67ea635f
'''Copyright The Microsoft DeepSpeed Team'''
from
.base
import
*
from
.features.meta_tensor
import
MetaTensorContainer
from
deepspeed.model_implementations.transformers.ds_bloom
import
DeepSpeedBloomInference
from
..policy
import
TransformerPolicy
from
..policy
import
transformer_param_names
from
..policy
import
maybe_copy
# Set of model classes known to work with this injection path; seeded with a
# None placeholder and extended lazily by BLOOMLayerPolicy.__init__.
supported_models = {None}
class DS_BloomContainer(MetaTensorContainer, BaseTransformerContainer):
    """Container wiring a BLOOM block into DeepSpeed inference."""

    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        # All model specific things should be defined here instead of the base class.
        self.bigscience_bloom = True

    def create_module(self, config=None):
        """Build and return the DeepSpeed BLOOM inference module."""
        module_config = self.ds_model_config if config is None else config
        self.module = DeepSpeedBloomInference(module_config, mp_group=self.mp_group)
        self.module.config.scale_attention = self.scale_attention
        return self.module

    def attention_qkv_mp(self, mp_replace):
        """Shard the fused QKV weight/bias across model-parallel ranks."""
        attn = self.module.attention
        attn.attn_qkvw = mp_replace.copy(attn.attn_qkvw, self.qkvw)
        attn.attn_qkvb = mp_replace.copy(attn.attn_qkvb, self.qkvb)

    def load_params(self, module, sd, weight_quantizer, mp_replace, prefix):
        """Copy BLOOM checkpoint tensors (sd) into the DS inference module."""
        param_names = (
            'self_attention.query_key_value.weight',
            'self_attention.query_key_value.bias',
            'self_attention.dense.weight',
            'self_attention.dense.bias',
            'mlp.dense_h_to_4h.weight',
            'mlp.dense_h_to_4h.bias',
            'mlp.dense_4h_to_h.weight',
            'mlp.dense_4h_to_h.bias',
            'post_attention_layernorm.weight',
            'post_attention_layernorm.bias',
            'input_layernorm.weight',
            'input_layernorm.bias',
        )
        # Indices 0-1: fused QKV weight/bias need qkv-aware copying.
        for idx in range(2):
            maybe_copy(module.attention,
                       sd,
                       weight_quantizer,
                       mp_replace,
                       transformer_param_names[idx],
                       prefix + param_names[idx],
                       qkv=True,
                       megatron_v2=self.policy.is_megatron_v2,
                       split_qkv=self.policy.split_qkv)
        # Indices 2-3: attention output projection.
        for idx in range(2, 4):
            maybe_copy(module.attention, sd, weight_quantizer, mp_replace,
                       transformer_param_names[idx], prefix + param_names[idx])
        # Indices 4-9: MLP weights/biases and post-attention layer norm.
        for idx in range(4, 10):
            maybe_copy(module.mlp, sd, weight_quantizer, mp_replace,
                       transformer_param_names[idx], prefix + param_names[idx])
        # Indices 10-11: input layer norm lives on the module itself.
        for idx in range(10, 12):
            maybe_copy(module, sd, weight_quantizer, mp_replace,
                       transformer_param_names[idx], prefix + param_names[idx])
class BLOOMLayerPolicy(TransformerPolicy):
    """Policy mapping a HuggingFace BLOOM block onto DeepSpeed inference."""
    _orig_layer_class = None

    def __init__(self,
                 client_module,
                 inference=True,
                 use_load_prefix=True,
                 split_qkv=False):
        super().__init__(inference,
                         linear_layer=True,
                         use_load_prefix=use_load_prefix,
                         split_qkv=split_qkv)
        self.client_module = client_module
        try:
            import transformers
            BLOOMLayerPolicy._orig_layer_class = transformers.models.bloom.modeling_bloom.BloomBlock
            global supported_models
            supported_models.update({transformers.models.bloom.modeling_bloom.BloomModel})
        except Exception as e:
            print(
                f"WARNING! Setting BLOOMLayerPolicy._orig_layer_class to None due to Exception: {e}"
            )
            BLOOMLayerPolicy._orig_layer_class = None

    def get_hidden_heads(self):
        """Return (hidden_size, num_heads) of the client attention module."""
        attn = self.client_module.self_attention
        return attn.hidden_size, attn.num_heads

    def attention(self):
        """Return the fused (qkv_w, qkv_b, out_w, out_b) attention tensors."""
        attn = self.client_module.self_attention
        return (attn.query_key_value.weight,
                attn.query_key_value.bias,
                attn.dense.weight,
                attn.dense.bias,)

    def mlp(self):
        """Return (inter_w, inter_b, out_w, out_b) of the feed-forward block."""
        mlp = self.client_module.mlp
        return (mlp.dense_h_to_4h.weight,
                mlp.dense_h_to_4h.bias,
                mlp.dense_4h_to_h.weight,
                mlp.dense_4h_to_h.bias)

    def layernorm(self):
        """Return (attn_ln_w, attn_ln_b, input_ln_w, input_ln_b)."""
        post_ln = self.client_module.post_attention_layernorm
        input_ln = self.client_module.input_layernorm
        return post_ln.weight, post_ln.bias, input_ln.weight, input_ln.bias
deepspeed/module_inject/containers/clip.py
0 → 100644
View file @
67ea635f
'''Copyright The Microsoft DeepSpeed Team'''
from
.base
import
*
from
deepspeed.model_implementations.transformers.ds_gpt
import
DeepSpeedGPTInference
import
torch
from
torch.nn.parameter
import
Parameter
from
..policy
import
TransformerPolicy
class DS_CLIPContainer(BaseTransformerContainer):
    """Container wiring a CLIP encoder layer into DeepSpeed inference."""

    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        # All model specific things should be defined here instead of the base class.

    def create_module(self, config=None):
        """Build and return the DeepSpeed GPT-style inference module."""
        module_config = self.ds_model_config if config is None else config
        self.module = DeepSpeedGPTInference(module_config, mp_group=self.mp_group)
        self.module.config.scale_attention = self.scale_attention
        return self.module
class HFCLIPLayerPolicy(TransformerPolicy):
    """Policy mapping a HuggingFace CLIP encoder layer onto DS inference."""

    def __init__(self, client_module, inference=False):
        super().__init__(inference, pre_attn_norm=True, scale_attention=True)
        self.client_module = client_module
        self.cuda_graph_supported = True

        if HFCLIPLayerPolicy._orig_layer_class is None:
            try:
                import transformers
                HFCLIPLayerPolicy._orig_layer_class = transformers.models.clip.modeling_clip.CLIPEncoderLayer
            except:
                HFCLIPLayerPolicy._orig_layer_class = None

    def get_hidden_heads(self):
        """Return (hidden_size, num_heads) of the client attention module."""
        attn = self.client_module.self_attn
        return attn.q_proj.weight.shape[1], attn.num_heads

    def attention(self):
        """Fuse q/k/v projections and return (qkv_w, qkv_b, out_w, out_b)."""
        attn = self.client_module.self_attn
        qkvw = Parameter(torch.cat((attn.q_proj.weight,
                                    attn.k_proj.weight,
                                    attn.v_proj.weight),
                                   dim=0),
                         requires_grad=False)
        qkvb = Parameter(torch.cat((attn.q_proj.bias,
                                    attn.k_proj.bias,
                                    attn.v_proj.bias),
                                   dim=0),
                         requires_grad=False)
        return qkvw, qkvb, attn.out_proj.weight, attn.out_proj.bias

    def mlp(self):
        """Return (fc1_w, fc1_b, fc2_w, fc2_b) of the feed-forward block."""
        mlp = self.client_module.mlp
        return mlp.fc1.weight, mlp.fc1.bias, mlp.fc2.weight, mlp.fc2.bias

    def layernorm(self):
        """Return (attn_ln_w, attn_ln_b, input_ln_w, input_ln_b)."""
        ln2 = self.client_module.layer_norm2
        ln1 = self.client_module.layer_norm1
        return ln2.weight, ln2.bias, ln1.weight, ln1.bias
deepspeed/module_inject/containers/distil_bert.py
0 → 100644
View file @
67ea635f
'''Copyright The Microsoft DeepSpeed Team'''
from
.base
import
*
from
deepspeed.model_implementations.transformers.ds_bert
import
DeepSpeedBERTInference
import
torch
from
torch.nn.parameter
import
Parameter
from
..policy
import
TransformerPolicy
class DS_DistilBERTContainer(BaseTransformerContainer):
    """Container wiring a DistilBERT block into DeepSpeed inference."""

    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        # All model specific things should be defined here instead of the base class.
        self.triangular_masking = False
        self.return_single_tuple = True

    def create_module(self, config=None):
        """Build and return the DeepSpeed BERT inference module."""
        module_config = self.ds_model_config if config is None else config
        self.module = DeepSpeedBERTInference(module_config, mp_group=self.mp_group)
        self.module.config.scale_attention = self.scale_attention
        return self.module
class HFDistilBertLayerPolicy(TransformerPolicy):
    """Policy mapping a HuggingFace DistilBERT TransformerBlock onto DS inference."""
    _orig_layer_class = None

    def __init__(self, client_module, inference=False, preln=False):
        super().__init__(inference)
        self.client_module = client_module
        self.preln = preln
        self.cuda_graph_supported = True
        if HFDistilBertLayerPolicy._orig_layer_class is None:
            try:
                import transformers
                HFDistilBertLayerPolicy._orig_layer_class = [
                    transformers.models.distilbert.modeling_distilbert.TransformerBlock,
                ]
            except:
                HFDistilBertLayerPolicy._orig_layer_class = None

    def get_hidden_heads(self):
        """Return (hidden_size, n_heads) of the client attention module."""
        attn = self.client_module.attention
        return attn.q_lin.weight.shape[1], attn.n_heads

    def attention(self):
        """Fuse q/k/v projections and return (qkv_w, qkv_b, out_w, out_b)."""
        attn = self.client_module.attention
        qkvw = Parameter(torch.cat((attn.q_lin.weight,
                                    attn.k_lin.weight,
                                    attn.v_lin.weight),
                                   dim=0))
        qkvb = Parameter(torch.cat((attn.q_lin.bias,
                                    attn.k_lin.bias,
                                    attn.v_lin.bias),
                                   dim=0))
        return qkvw, qkvb, attn.out_lin.weight, attn.out_lin.bias

    def mlp(self):
        """Return (lin1_w, lin1_b, lin2_w, lin2_b) of the feed-forward block."""
        ffn = self.client_module.ffn
        return ffn.lin1.weight, ffn.lin1.bias, ffn.lin2.weight, ffn.lin2.bias

    def layernorm(self):
        """Return (attn_ln_w, attn_ln_b, output_ln_w, output_ln_b)."""
        sa_ln = self.client_module.sa_layer_norm
        out_ln = self.client_module.output_layer_norm
        return sa_ln.weight, sa_ln.bias, out_ln.weight, out_ln.bias
deepspeed/module_inject/containers/features/__init__.py
0 → 100644
View file @
67ea635f
'''Copyright The Microsoft DeepSpeed Team'''
from
.megatron
import
MegatronContainer
from
.meta_tensor
import
MetaTensorContainer
deepspeed/module_inject/containers/features/megatron.py
0 → 100644
View file @
67ea635f
'''Copyright The Microsoft DeepSpeed Team'''
import
torch
from
abc
import
ABC
class MegatronContainer(ABC):
    """Feature mix-in for Megatron-style models.

    Megatron-v2 checkpoints store the fused QKV projection interleaved per
    attention head; transpose_qkv_alignment() regroups it into the
    contiguous [q | k | v] layout expected downstream.
    """

    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        self.megatron_v2 = self.policy.is_megatron_v2

    def transpose_qkv_alignment(self, x):
        """Reorder a fused QKV tensor from per-head-interleaved layout to
        grouped q/k/v along the last dimension, preserving x's shape."""
        head_size = x.shape[-1] // self.num_attention_heads
        per_head_shape = x.size()[:-1] + (self.num_attention_heads, head_size)
        per_head = x.view(*per_head_shape)
        # Within each head's slot, q, k and v chunks sit back to back.
        q, k, v = torch.split(per_head, per_head.shape[-1] // 3, dim=per_head.dim() - 1)
        if q.dim() > 2:
            flat_parts = (q.reshape(q.shape[0], -1),
                          k.reshape(q.shape[0], -1),
                          v.reshape(q.shape[0], -1))
        else:
            flat_parts = (q.reshape(-1), k.reshape(-1), v.reshape(-1))
        return torch.cat(flat_parts, dim=-1).reshape(x.shape)

    def transpose(self):
        super().transpose()
        if self.megatron_v2:
            realign = self.transpose_qkv_alignment
            self.qkvw = torch.nn.parameter.Parameter(realign(self.qkvw).contiguous())
            self.qkvb = torch.nn.parameter.Parameter(realign(self.qkvb).contiguous())
deepspeed/module_inject/containers/features/meta_tensor.py
0 → 100644
View file @
67ea635f
'''Copyright The Microsoft DeepSpeed Team'''
from
abc
import
ABC
,
abstractmethod
class MetaTensorContainer(ABC):
    """Feature mix-in for containers whose client module may hold meta
    tensors (shape-only placeholders).  When meta tensors are detected,
    copying/transposing is skipped and the real weights must be loaded
    later through load_params()."""

    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        self.is_meta = False
        self.ckpt_load_enabled = True

    def initialize_tensors(self):
        super().initialize_tensors()
        # A meta qkv weight marks the whole layer as meta.
        self.is_meta = self.qkvw.is_meta

    def apply_tensor_parallelism(self, mp_replace):
        if not self.is_meta:
            super().apply_tensor_parallelism(mp_replace)
            return
        # Meta path: only propagate absent biases to the DS module.
        if self.qkvb is None:
            self.module.attention.attn_qkvb = None
        if self.dense_b is None:
            self.module.attention.attn_ob = None

    def copy_data_to_new_module(self):
        if not self.is_meta:
            super().copy_data_to_new_module()
            return
        if self.attn_nw is None:
            self.module.mlp.attn_nw = self.attn_nw
            self.module.mlp.attn_nb = self.attn_nb

    def transpose(self):
        if self.is_meta:
            return
        super().transpose()

    @abstractmethod
    def load_params(self, module, sd, weight_quantizer, mp_replace, prefix):
        """
        Load all the transformer parameter from the checkpoint file (sd).
        In addition to the parameter names, we require two
        more parameters to help read the the data correctly
        from the checkpoint and split the qkv heads in the
        right order:
            1. `use_load_prefix` (Default: False): this specifies
                whether we need to use the name of first abstraction
                layer of the model for searching the parameter's name
                in a checkpoint file. For more information of how this
                is used please see
                https://github.com/microsoft/DeepSpeed/blob/master/deepspeed/module_inject/load_checkpoint.py
            2. `split_qkv` (Default: True): we use this flag when splitting
                the qkv parameter into heads. If it is False, it means the heads
                of q, k, and v are stored together and needs to split in the
                DeepSpeed-Inference API.
        """
        raise NotImplementedError("A load_params() function must be defined in the model container \
                                   when inheriting the MetaTensorContainer feature")
deepspeed/module_inject/containers/gpt2.py
0 → 100644
View file @
67ea635f
'''Copyright The Microsoft DeepSpeed Team'''
from
.base
import
*
from
deepspeed.model_implementations.transformers.ds_gpt
import
DeepSpeedGPTInference
from
..policy
import
TransformerPolicy
class DS_GPT2Container(BaseTransformerContainer):
    """Container wiring a GPT-2 block into DeepSpeed inference."""

    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        # All model specific things should be defined here instead of the base class.

    def create_module(self, config=None):
        """Build and return the DeepSpeed GPT inference module."""
        module_config = self.ds_model_config if config is None else config
        self.module = DeepSpeedGPTInference(module_config, mp_group=self.mp_group)
        self.module.config.scale_attention = self.scale_attention
        return self.module
class HFGPT2LayerPolicy(TransformerPolicy):
    """Policy mapping a HuggingFace GPT2Block onto DeepSpeed inference."""
    _orig_layer_class = None

    def __init__(self, client_module, inference=True):
        # HuggingFace GPT2 uses convolutional layer instead of linear layer
        super().__init__(inference, linear_layer=False)
        self.client_module = client_module
        try:
            import transformers
            HFGPT2LayerPolicy._orig_layer_class = transformers.models.gpt2.modeling_gpt2.GPT2Block
        except:
            HFGPT2LayerPolicy._orig_layer_class = None

    def get_hidden_heads(self):
        """Return (embed_dim, num_heads) of the client attention module."""
        attn = self.client_module.attn
        return attn.embed_dim, attn.num_heads

    def attention(self):
        """Return GPT-2's already-fused (c_attn_w, c_attn_b, c_proj_w, c_proj_b)."""
        attn = self.client_module.attn
        return attn.c_attn.weight, attn.c_attn.bias, attn.c_proj.weight, attn.c_proj.bias

    def mlp(self):
        """Return (c_fc_w, c_fc_b, c_proj_w, c_proj_b) of the feed-forward block."""
        mlp = self.client_module.mlp
        return mlp.c_fc.weight, mlp.c_fc.bias, mlp.c_proj.weight, mlp.c_proj.bias

    def layernorm(self):
        """Return (attn_ln_w, attn_ln_b, input_ln_w, input_ln_b)."""
        ln2 = self.client_module.ln_2
        ln1 = self.client_module.ln_1
        return ln2.weight, ln2.bias, ln1.weight, ln1.bias
deepspeed/module_inject/containers/gptj.py
0 → 100644
View file @
67ea635f
'''Copyright The Microsoft DeepSpeed Team'''
from
.base
import
*
from
.features.meta_tensor
import
MetaTensorContainer
from
deepspeed.model_implementations.transformers.ds_gpt
import
DeepSpeedGPTInference
import
torch
from
torch.nn.parameter
import
Parameter
from
..policy
import
TransformerPolicy
from
..policy
import
transformer_param_names
from
..policy
import
maybe_copy
from
..policy
import
maybe_copy_qkv
class DS_GPTJContainer(MetaTensorContainer, BaseTransformerContainer):
    """Container wiring a HuggingFace GPT-J block into DeepSpeed inference."""

    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        # All model specific things should be defined here instead of the base class.

    def create_module(self, config=None):
        """Build and return the DeepSpeed GPT inference module."""
        module_config = self.ds_model_config if config is None else config
        self.module = DeepSpeedGPTInference(module_config, mp_group=self.mp_group)
        self.module.config.scale_attention = self.scale_attention
        return self.module

    def load_params(self, module, sd, weight_quantizer, mp_replace, prefix):
        """Copy GPT-J checkpoint tensors (sd) into the DS inference module."""
        param_names = (
            'attn.q_proj.weight',
            'attn.k_proj.weight',
            'attn.v_proj.weight',
            'attn.out_proj.weight',
            'mlp.fc_in.weight',
            'mlp.fc_in.bias',
            'mlp.fc_out.weight',
            'mlp.fc_out.bias',
            'ln_1.weight',
            'ln_1.bias',
        )
        # Indices 0-2: separate q/k/v weights get fused into attn_qkvw.
        maybe_copy_qkv(module.attention,
                       sd,
                       weight_quantizer,
                       mp_replace,
                       'attn_qkvw',
                       [prefix + param_names[0],
                        prefix + param_names[1],
                        prefix + param_names[2]],
                       split_qkv=self.policy.split_qkv)
        # Index 3: attention output projection weight.
        for i in range(3, 4):
            maybe_copy(module.attention, sd, weight_quantizer, mp_replace,
                       transformer_param_names[i - 1], prefix + param_names[i])
        # Indices 4-7: feed-forward weights/biases.
        for i in range(4, 8):
            maybe_copy(module.mlp, sd, weight_quantizer, mp_replace,
                       transformer_param_names[i], prefix + param_names[i])
        # Indices 8-9: layer norm (offset into the shared name table).
        for i in range(8, 10):
            maybe_copy(module, sd, weight_quantizer, mp_replace,
                       transformer_param_names[i + 2], prefix + param_names[i])
class HFGPTJLayerPolicy(TransformerPolicy):
    """Policy mapping a HuggingFace GPTJBlock onto DeepSpeed inference."""
    _orig_layer_class = None

    def __init__(self, client_module, inference=True):
        super().__init__(inference, scale_attention=True)
        self.client_module = client_module
        try:
            import transformers
            HFGPTJLayerPolicy._orig_layer_class = transformers.models.gptj.modeling_gptj.GPTJBlock
        except:
            HFGPTJLayerPolicy._orig_layer_class = None

    def get_hidden_heads(self):
        """Return (hidden_size, num_attention_heads) of the client layer."""
        attn = self.client_module.attn
        return attn.q_proj.weight.shape[1], attn.num_attention_heads

    def attention(self):
        """Fuse q/k/v weights; GPT-J projections carry no biases."""
        attn = self.client_module.attn
        qkvw = Parameter(torch.cat((attn.q_proj.weight,
                                    attn.k_proj.weight,
                                    attn.v_proj.weight),
                                   dim=0),
                         requires_grad=False)
        return qkvw, None, attn.out_proj.weight, None,

    def mlp(self):
        """Return (fc_in_w, fc_in_b, fc_out_w, fc_out_b)."""
        mlp = self.client_module.mlp
        return mlp.fc_in.weight, mlp.fc_in.bias, mlp.fc_out.weight, mlp.fc_out.bias

    def layernorm(self):
        """GPT-J has only the pre-attention LayerNorm (ln_1)."""
        ln = self.client_module.ln_1
        return None, None, ln.weight, ln.bias
deepspeed/module_inject/containers/gptneo.py
0 → 100644
View file @
67ea635f
'''Copyright The Microsoft DeepSpeed Team'''
from
.base
import
*
from
.features.meta_tensor
import
MetaTensorContainer
from
deepspeed.model_implementations.transformers.ds_gpt
import
DeepSpeedGPTInference
import
torch
from
torch.nn.parameter
import
Parameter
from
..policy
import
TransformerPolicy
from
..policy
import
transformer_param_names
from
..policy
import
maybe_copy
from
..policy
import
maybe_copy_qkv
class DS_GPTNEOContainer(MetaTensorContainer, BaseTransformerContainer):
    """Container wiring a HuggingFace GPT-Neo block into DeepSpeed inference."""

    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        # All model specific things should be defined here instead of the base class.

    def create_module(self, config=None):
        """Build and return the DeepSpeed GPT inference module."""
        module_config = self.ds_model_config if config is None else config
        self.module = DeepSpeedGPTInference(module_config, mp_group=self.mp_group)
        self.module.config.scale_attention = self.scale_attention
        return self.module

    def load_params(self, module, sd, weight_quantizer, mp_replace, prefix):
        """Copy GPT-Neo checkpoint tensors (sd) into the DS inference module."""
        param_names = (
            'attn.attention.q_proj.weight',
            'attn.attention.k_proj.weight',
            'attn.attention.v_proj.weight',
            'attn.attention.out_proj.weight',
            'attn.attention.out_proj.bias',
            'mlp.c_fc.weight',
            'mlp.c_fc.bias',
            'mlp.c_proj.weight',
            'mlp.c_proj.bias',
            'ln_2.weight',
            'ln_2.bias',
            'ln_1.weight',
            'ln_1.bias',
        )
        # Indices 0-2: separate q/k/v weights get fused into attn_qkvw.
        maybe_copy_qkv(module.attention,
                       sd,
                       weight_quantizer,
                       mp_replace,
                       'attn_qkvw',
                       [prefix + param_names[0],
                        prefix + param_names[1],
                        prefix + param_names[2]],
                       split_qkv=self.policy.split_qkv)
        # Indices 3-4: attention output projection weight/bias.
        for i in range(3, 5):
            maybe_copy(module.attention, sd, weight_quantizer, mp_replace,
                       transformer_param_names[i - 1], prefix + param_names[i])
        # Indices 5-10: feed-forward weights/biases and post-attn layer norm.
        for i in range(5, 11):
            maybe_copy(module.mlp, sd, weight_quantizer, mp_replace,
                       transformer_param_names[i - 1], prefix + param_names[i])
        # Indices 11-12: input layer norm lives on the module itself.
        for i in range(11, 13):
            maybe_copy(module, sd, weight_quantizer, mp_replace,
                       transformer_param_names[i - 1], prefix + param_names[i])
class HFGPTNEOLayerPolicy(TransformerPolicy):
    """Policy mapping a HuggingFace GPTNeoBlock onto DeepSpeed inference."""
    # CONSISTENCY FIX: every sibling policy declares this class attribute so
    # it exists even if __init__ never runs; this one was missing it.
    _orig_layer_class = None

    def __init__(self, client_module, inference=True):
        super().__init__(inference, scale_attention=False)
        self.client_module = client_module
        try:
            import transformers
            HFGPTNEOLayerPolicy._orig_layer_class = transformers.models.gpt_neo.modeling_gpt_neo.GPTNeoBlock
        except:
            # transformers missing or too old: leave the policy unregistered.
            HFGPTNEOLayerPolicy._orig_layer_class = None

    def get_hidden_heads(self):
        """Return (hidden_size, num_heads) of the client attention module."""
        attn = self.client_module.attn.attention
        return attn.q_proj.weight.shape[1], attn.num_heads

    def attention(self):
        """Fuse q/k/v weights; GPT-Neo q/k/v projections carry no bias."""
        attn = self.client_module.attn.attention
        qkvw = Parameter(torch.cat((attn.q_proj.weight,
                                    attn.k_proj.weight,
                                    attn.v_proj.weight),
                                   dim=0),
                         requires_grad=False)
        return qkvw, None, attn.out_proj.weight, attn.out_proj.bias

    def mlp(self):
        """Return (c_fc_w, c_fc_b, c_proj_w, c_proj_b) of the feed-forward block."""
        mlp = self.client_module.mlp
        return mlp.c_fc.weight, mlp.c_fc.bias, mlp.c_proj.weight, mlp.c_proj.bias

    def layernorm(self):
        """Return (attn_ln_w, attn_ln_b, input_ln_w, input_ln_b)."""
        ln2 = self.client_module.ln_2
        ln1 = self.client_module.ln_1
        return ln2.weight, ln2.bias, ln1.weight, ln1.bias
deepspeed/module_inject/containers/gptneox.py
0 → 100644
View file @
67ea635f
'''Copyright The Microsoft DeepSpeed Team'''
from
.base
import
*
from
.features.meta_tensor
import
MetaTensorContainer
from
.features.megatron
import
MegatronContainer
from
deepspeed.model_implementations.transformers.ds_gpt
import
DeepSpeedGPTInference
import
torch
from
..policy
import
TransformerPolicy
from
..policy
import
transformer_param_names
from
..policy
import
maybe_copy
from
packaging
import
version
as
pkg_version
class DS_GPTNEOXContainer(MetaTensorContainer, MegatronContainer, BaseTransformerContainer):
    """Inference container for GPT-NeoX layers, with meta-tensor loading and
    Megatron-style (interleaved) qkv support."""

    def __init__(self, **kwargs):
        super().__init__(**kwargs)

        # All model specific things should be defined here instead of the base class.

    def create_module(self, config=None):
        """Build the DeepSpeedGPTInference module from `config` (falls back to
        the container's ds_model_config) and propagate container settings."""
        _config = config if config is not None else self.ds_model_config
        self.module = DeepSpeedGPTInference(_config, mp_group=self.mp_group)
        self.module.config.scale_attention = self.scale_attention

        if self.megatron_v2:
            # Megatron v2 rotary embedding rotates contiguous halves, not pairs
            self.module.config.rotate_half = True
            self.module.config.rotate_every_two = False

        return self.module

    def load_params(self, module, sd, weight_quantizer, mp_replace, prefix):
        """Copy GPT-NeoX checkpoint parameters into the inference module.

        Parameter order: [0:2] fused qkv weight/bias, [2:4] attention dense,
        [4:8] MLP h->4h / 4h->h, [8:12] post-attention then input layernorm.
        """
        param_names = (
            'attention.query_key_value.weight', \
            'attention.query_key_value.bias', \
            'attention.dense.weight', \
            'attention.dense.bias', \
            'mlp.dense_h_to_4h.weight', \
            'mlp.dense_h_to_4h.bias', \
            'mlp.dense_4h_to_h.weight', \
            'mlp.dense_4h_to_h.bias', \
            'post_attention_layernorm.weight', \
            'post_attention_layernorm.bias', \
            'input_layernorm.weight', \
            'input_layernorm.bias')
        for i in range(0, 2):
            # qkv is already fused in the checkpoint; maybe_copy reorders it
            # per-head when megatron_v2 / split_qkv demand it.
            # NOTE(review): reads policy.client_module.attention directly —
            # assumes the version-0 (attr name 'attention') layout; confirm
            # against GPTNEOXLayerPolicy.version handling.
            maybe_copy(module.attention,
                       sd,
                       weight_quantizer,
                       mp_replace,
                       transformer_param_names[i],
                       prefix + param_names[i],
                       qkv=True,
                       megatron_v2=self.policy.is_megatron_v2,
                       split_qkv=self.policy.split_qkv,
                       heads=self.policy.client_module.attention.num_attention_heads)
        for i in range(2, 4):
            # attention output projection weight/bias
            maybe_copy(module.attention,
                       sd,
                       weight_quantizer,
                       mp_replace,
                       transformer_param_names[i],
                       prefix + param_names[i])
        for i in range(4, 10):
            # MLP weights/biases plus post-attention layernorm
            maybe_copy(module.mlp,
                       sd,
                       weight_quantizer,
                       mp_replace,
                       transformer_param_names[i],
                       prefix + param_names[i])
        for i in range(10, 12):
            # input layernorm lives on the module itself
            maybe_copy(module,
                       sd,
                       weight_quantizer,
                       mp_replace,
                       transformer_param_names[i],
                       prefix + param_names[i])
class GPTNEOXLayerPolicy(TransformerPolicy):
    """Policy mapping a GPT-NeoX transformer layer onto the DeepSpeed
    inference container.

    ``version`` selects between the two megatron-lm layer layouts:
    0 means the attention submodule is named ``attention``, otherwise it is
    named ``self_attention``.
    """
    _orig_layer_class = None
    version = 0

    def __init__(self, client_module, inference=True, megatron_v2=True, split_qkv=False):
        super().__init__(inference, megatron_v2=megatron_v2, split_qkv=split_qkv)
        self.client_module = client_module
        if GPTNEOXLayerPolicy._orig_layer_class is None:
            if pkg_version.parse(torch.__version__) <= pkg_version.parse("1.2"):
                GPTNEOXLayerPolicy._orig_layer_class = None
            else:
                try:
                    from transformers import GPTNeoXLayer
                    GPTNEOXLayerPolicy._orig_layer_class = GPTNeoXLayer
                except ImportError:
                    GPTNEOXLayerPolicy._orig_layer_class = None

    def get_hidden_heads(self):
        """Return (hidden_size, num_attention_heads) of the client layer.

        BUGFIX: the original computed the version-dependent ``attention``
        local and then ignored it, always reading
        ``self.client_module.attention`` — which raises AttributeError when
        ``version != 0`` (submodule is named ``self_attention`` there).
        """
        if GPTNEOXLayerPolicy.version == 0:
            attention = self.client_module.attention
        else:
            attention = self.client_module.self_attention

        return attention.query_key_value.weight.shape[1], \
                attention.num_attention_heads

    def attention(self):
        """Return fused qkv weight/bias and dense (output) weight/bias."""
        if GPTNEOXLayerPolicy.version == 0:
            attention = self.client_module.attention
        else:
            attention = self.client_module.self_attention

        return attention.query_key_value.weight, \
                attention.query_key_value.bias, \
                attention.dense.weight, \
                attention.dense.bias

    def mlp(self):
        """Return MLP h->4h and 4h->h weights and biases."""
        return self.client_module.mlp.dense_h_to_4h.weight, \
                self.client_module.mlp.dense_h_to_4h.bias, \
                self.client_module.mlp.dense_4h_to_h.weight, \
                self.client_module.mlp.dense_4h_to_h.bias

    def layernorm(self):
        """Return post-attention then input layernorm weights and biases."""
        return self.client_module.post_attention_layernorm.weight, \
                self.client_module.post_attention_layernorm.bias, \
                self.client_module.input_layernorm.weight, \
                self.client_module.input_layernorm.bias
deepspeed/module_inject/containers/megatron_gpt.py
0 → 100644
View file @
67ea635f
'''Copyright The Microsoft DeepSpeed Team'''
from
.base
import
*
from
.features.megatron
import
MegatronContainer
from
deepspeed.model_implementations.transformers.ds_megatron_gpt
import
DeepSpeedMegatronGPTInference
import
torch
from
..policy
import
TransformerPolicy
from
packaging
import
version
as
pkg_version
class DS_MegatronGPTContainer(MegatronContainer, BaseTransformerContainer):
    """Inference container for Megatron-style GPT transformer layers."""

    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        # All model specific things should be defined here instead of the base class.

    def create_module(self, config=None):
        """Instantiate the DeepSpeed Megatron-GPT inference module.

        Uses `config` when given, otherwise the container's ds_model_config,
        and propagates attention-scaling and rotary-embedding settings.
        """
        module_config = self.ds_model_config if config is None else config
        self.module = DeepSpeedMegatronGPTInference(module_config, mp_group=self.mp_group)
        self.module.config.scale_attention = self.scale_attention

        if self.megatron_v2:
            # Megatron v2 rotates contiguous halves rather than every two dims
            self.module.config.rotate_half = True
            self.module.config.rotate_every_two = False

        return self.module
# TODO: Megatron GPT MoE inherits from Megatron policy and replaces mlp
# TODO: Generalize MoE overall goal, expand beyond Megatron
class MegatronLayerPolicy(TransformerPolicy):
    """Policy mapping a megatron-lm ParallelTransformerLayer onto the
    DeepSpeed inference container, with optional MoE expert extraction."""
    # original megatron-lm layer class, resolved lazily in __init__
    _orig_layer_class = None
    # 0 -> old megatron layout ('attention'); otherwise 'self_attention'
    version = 0
    moe_type = 'standard'
    megatron_v2 = True
    use_mup = False

    def __init__(self, client_module, inference=True):
        super().__init__(inference,
                         megatron_v2=MegatronLayerPolicy.megatron_v2,
                         use_mup=MegatronLayerPolicy.use_mup)
        self.client_module = client_module
        # we use megatron version to differentiate between the old and new
        # megatron-lm source code
        if MegatronLayerPolicy._orig_layer_class is None:
            if pkg_version.parse(torch.__version__) <= pkg_version.parse("1.2"):
                MegatronLayerPolicy._orig_layer_class = None
            else:
                try:
                    from megatron.model.transformer import ParallelTransformerLayer
                    MegatronLayerPolicy._orig_layer_class = ParallelTransformerLayer
                except ImportError:
                    MegatronLayerPolicy._orig_layer_class = None

    def get_hidden_heads(self):
        """Return (hidden_size, num_attention_heads) of the client layer."""
        return self.client_module.attention.query_key_value.weight.shape[1], \
                self.client_module.attention.num_attention_heads

    def attention(self):
        """Return fused qkv weight/bias and dense (output) weight/bias.

        NOTE(review): when ``self.inference`` is False the local ``attention``
        is never bound and the return raises NameError — this appears to
        assume inference-only use; confirm with callers.
        """
        if self.inference:
            if MegatronLayerPolicy.version == 0:
                attention = self.client_module.attention
            else:
                attention = self.client_module.self_attention

        return attention.query_key_value.weight, \
               attention.query_key_value.bias, \
               attention.dense.weight, \
               attention.dense.bias

    def mlp(self, moe_type='standard'):
        """Return MLP parameters.

        Non-MoE layers: (h->4h weight, h->4h bias, 4h->h weight, 4h->h bias).
        MoE 'standard': four per-expert lists in the same order.
        Other MoE types additionally append the residual (shared) MLP
        weights/biases and the gating coefficient weight.
        """
        from deepspeed.moe.utils import has_moe_layers
        moe, _ = has_moe_layers(self.client_module)

        if moe:
            # residual-MoE keeps its experts one level deeper, under mlp.moe
            moe_experts = self.client_module.mlp.deepspeed_moe.experts.deepspeed_experts if moe_type == 'standard' else \
                            self.client_module.mlp.moe.deepspeed_moe.experts.deepspeed_experts
            num_experts = len(moe_experts)
            if moe_type == 'standard':
                return [moe_experts[i].dense_h_to_4h.weight for i in range(num_experts)], \
                       [moe_experts[i].dense_h_to_4h.bias for i in range(num_experts)], \
                       [moe_experts[i].dense_4h_to_h.weight for i in range(num_experts)], \
                       [moe_experts[i].dense_4h_to_h.bias for i in range(num_experts)]
            else:
                return [moe_experts[i].dense_h_to_4h.weight for i in range(num_experts)], \
                       [moe_experts[i].dense_h_to_4h.bias for i in range(num_experts)], \
                       [moe_experts[i].dense_4h_to_h.weight for i in range(num_experts)], \
                       [moe_experts[i].dense_4h_to_h.bias for i in range(num_experts)], \
                       self.client_module.mlp.mlp.dense_h_to_4h.weight, \
                       self.client_module.mlp.mlp.dense_h_to_4h.bias, \
                       self.client_module.mlp.mlp.dense_4h_to_h.weight, \
                       self.client_module.mlp.mlp.dense_4h_to_h.bias, \
                       self.client_module.mlp.coefficient.weight
        else:
            return self.client_module.mlp.dense_h_to_4h.weight, \
                   self.client_module.mlp.dense_h_to_4h.bias, \
                   self.client_module.mlp.dense_4h_to_h.weight, \
                   self.client_module.mlp.dense_4h_to_h.bias

    def layernorm(self):
        """Return post-attention then input layernorm weights and biases."""
        return self.client_module.post_attention_layernorm.weight, \
               self.client_module.post_attention_layernorm.bias, \
               self.client_module.input_layernorm.weight, \
               self.client_module.input_layernorm.bias
deepspeed/module_inject/containers/megatron_gpt_moe.py
0 → 100644
View file @
67ea635f
'''Copyright The Microsoft DeepSpeed Team'''
from
.base
import
*
from
.base_moe
import
*
from
.features.megatron
import
MegatronContainer
from
deepspeed.model_implementations.transformers.ds_megatron_gpt
import
DeepSpeedMegatronGPTInference
import
torch
from
.megatron_gpt
import
MegatronLayerPolicy
from
packaging
import
version
as
pkg_version
class DS_MegatronGPTMoEContainer(MegatronContainer, BaseTransformerMoEContainer):
    """Inference container for Megatron-style GPT layers with MoE experts."""

    def __init__(self, policy, config, model_config, layer_id):
        super().__init__(policy, config, model_config, layer_id)
        # All model specific things should be defined here instead of the base class.

    def create_module(self, config=None):
        """Instantiate the DeepSpeed Megatron-GPT inference module.

        Uses `config` when given, otherwise the container's ds_model_config,
        and propagates attention-scaling and rotary-embedding settings.
        """
        module_config = self.ds_model_config if config is None else config
        self.module = DeepSpeedMegatronGPTInference(module_config, mp_group=self.mp_group)
        self.module.config.scale_attention = self.scale_attention

        if self.megatron_v2:
            # Megatron v2 rotates contiguous halves rather than every two dims
            self.module.config.rotate_half = True
            self.module.config.rotate_every_two = False

        return self.module
# TODO: Megatron GPT MoE inherits from Megatron policy and replaces mlp
# TODO: Generalize MoE overall goal, expand beyond Megatron
class MegatronMoELayerPolicy(MegatronLayerPolicy):
    """MoE variant of MegatronLayerPolicy: replaces mlp() extraction with
    per-expert parameter lists and tracks the expert count."""
    _orig_layer_class = None
    version = 0
    moe_type = 'standard'
    # updated as a side effect of mlp(); 1 until then
    num_experts = 1

    def __init__(self, client_module, inference=True):
        # NOTE(review): `inference` is passed positionally into
        # MegatronLayerPolicy.__init__(client_module, inference=True), so it
        # briefly binds as client_module there; self.client_module is
        # overwritten on the next line, but confirm this is intentional.
        super().__init__(inference)
        self.client_module = client_module
        # we use megatron version to differentiate between the old and new
        # megatron-lm source code
        if MegatronMoELayerPolicy._orig_layer_class is None:
            if pkg_version.parse(torch.__version__) <= pkg_version.parse("1.2"):
                MegatronMoELayerPolicy._orig_layer_class = None
            else:
                try:
                    from megatron.model.transformer import ParallelTransformerLayer
                    MegatronMoELayerPolicy._orig_layer_class = ParallelTransformerLayer
                except ImportError:
                    MegatronMoELayerPolicy._orig_layer_class = None

    def get_num_experts(self):
        """Return the expert count recorded by the last mlp() call."""
        return self.num_experts

    def mlp(self, moe_type='standard'):
        """Return per-expert MLP parameter lists (and, for non-standard MoE,
        the residual MLP weights plus gating coefficient weight).

        Side effect: sets self.num_experts from the client module.
        """
        # for now, all of this is tightly coupled to megatron-deepspeed moe implementation
        # todo: think and refactor this to be more general
        #from deepspeed.moe.utils import has_moe_layers
        #moe, _ = has_moe_layers(self.client_module)

        # residual-MoE keeps its experts one level deeper, under mlp.moe
        moe_experts = self.client_module.mlp.deepspeed_moe.experts.deepspeed_experts if moe_type == 'standard' else \
                        self.client_module.mlp.moe.deepspeed_moe.experts.deepspeed_experts
        num_experts = len(moe_experts)
        self.num_experts = num_experts

        if moe_type == 'standard':
            return [moe_experts[i].dense_h_to_4h.weight for i in range(num_experts)], \
                   [moe_experts[i].dense_h_to_4h.bias for i in range(num_experts)], \
                   [moe_experts[i].dense_4h_to_h.weight for i in range(num_experts)], \
                   [moe_experts[i].dense_4h_to_h.bias for i in range(num_experts)]
        else:
            return [moe_experts[i].dense_h_to_4h.weight for i in range(num_experts)], \
                   [moe_experts[i].dense_h_to_4h.bias for i in range(num_experts)], \
                   [moe_experts[i].dense_4h_to_h.weight for i in range(num_experts)], \
                   [moe_experts[i].dense_4h_to_h.bias for i in range(num_experts)], \
                   self.client_module.mlp.mlp.dense_h_to_4h.weight, \
                   self.client_module.mlp.mlp.dense_h_to_4h.bias, \
                   self.client_module.mlp.mlp.dense_4h_to_h.weight, \
                   self.client_module.mlp.mlp.dense_4h_to_h.bias, \
                   self.client_module.mlp.coefficient.weight
deepspeed/module_inject/containers/opt.py
0 → 100644
View file @
67ea635f
'''Copyright The Microsoft DeepSpeed Team'''
from
.base
import
*
from
.features.meta_tensor
import
MetaTensorContainer
from
deepspeed.model_implementations.transformers.ds_opt
import
DeepSpeedOPTInference
import
torch
from
torch.nn.parameter
import
Parameter
from
..policy
import
TransformerPolicy
from
..policy
import
transformer_param_names
from
..policy
import
maybe_copy
from
..policy
import
maybe_copy_qkv
from
deepspeed.utils.types
import
ActivationFuncType
class DS_OPTContainer(MetaTensorContainer, BaseTransformerContainer):
    """Inference container for HuggingFace OPT decoder layers."""

    def __init__(self, **kwargs):
        super().__init__(**kwargs)

        # All model specific things should be defined here instead of the base class.

    def create_module(self, config=None):
        """Build the DeepSpeedOPTInference module from `config` (falls back to
        the container's ds_model_config)."""
        _config = config if config is not None else self.ds_model_config
        self.module = DeepSpeedOPTInference(_config, mp_group=self.mp_group)
        self.module.config.scale_attention = self.scale_attention
        return self.module

    def load_params(self, module, sd, weight_quantizer, mp_replace, prefix):
        """Copy OPT checkpoint parameters into the inference module.

        Parameter order: [0:3] q/k/v weights, [3:6] q/k/v biases,
        [6:8] out_proj, [8:12] fc1/fc2, [12:16] final then self-attn layernorm.
        Destinations use transformer_param_names[i - 4] because each fused
        qkv triple collapses three source slots into one destination slot.
        """
        param_names = (
            'self_attn.q_proj.weight', \
            'self_attn.k_proj.weight', \
            'self_attn.v_proj.weight', \
            'self_attn.q_proj.bias', \
            'self_attn.k_proj.bias', \
            'self_attn.v_proj.bias', \
            'self_attn.out_proj.weight', \
            'self_attn.out_proj.bias', \
            'fc1.weight', \
            'fc1.bias', \
            'fc2.weight', \
            'fc2.bias', \
            'final_layer_norm.weight', \
            'final_layer_norm.bias', \
            'self_attn_layer_norm.weight', \
            'self_attn_layer_norm.bias')

        for i in range(0, 6, 3):
            # i == 0 fuses the q/k/v weights, i == 3 fuses the q/k/v biases
            maybe_copy_qkv(module.attention,
                           sd,
                           weight_quantizer,
                           mp_replace,
                           transformer_param_names[i // 3],
                           [prefix + param_names[i],
                            prefix + param_names[i + 1],
                            prefix + param_names[i + 2]],
                           split_qkv=self.policy.split_qkv)
        for i in range(6, 8):
            # attention out_proj weight/bias
            maybe_copy(module.attention,
                       sd,
                       weight_quantizer,
                       mp_replace,
                       transformer_param_names[i - 4],
                       prefix + param_names[i])
        for i in range(8, 14):
            # MLP fc1/fc2 plus the final layernorm pair
            maybe_copy(module.mlp,
                       sd,
                       weight_quantizer,
                       mp_replace,
                       transformer_param_names[i - 4],
                       prefix + param_names[i])
        for i in range(14, 16):
            # self-attention layernorm lives on the module itself
            maybe_copy(module,
                       sd,
                       weight_quantizer,
                       mp_replace,
                       transformer_param_names[i - 4],
                       prefix + param_names[i])
class HFOPTLayerPolicy(TransformerPolicy):
    """Policy mapping a HuggingFace OPT decoder layer (``OPTDecoderLayer``)
    onto the DeepSpeed inference container."""
    _orig_layer_class = None

    def __init__(self, client_module, inference=True, use_load_prefix=True):
        super().__init__(inference,
                         linear_layer=True,
                         mlp_act_func_type=ActivationFuncType.ReLU,
                         pre_attn_norm=True,
                         use_load_prefix=use_load_prefix)
        self.client_module = client_module
        try:
            import transformers
            HFOPTLayerPolicy._orig_layer_class = \
                transformers.models.opt.modeling_opt.OPTDecoderLayer
            # honor the checkpoint's layer-norm placement when an OPT config
            # was registered on the policy base class
            if isinstance(TransformerPolicy.hf_model_config,
                          transformers.models.opt.configuration_opt.OPTConfig):
                self.pre_attn_norm = TransformerPolicy.hf_model_config.do_layer_norm_before
        # FIX: was a bare `except:` (also swallowed KeyboardInterrupt etc.);
        # narrowed to the failures these lookups can actually produce.
        except (ImportError, AttributeError):
            HFOPTLayerPolicy._orig_layer_class = None

    def get_hidden_heads(self):
        """Return (embed_dim, num_heads) of the client attention module."""
        return self.client_module.self_attn.embed_dim, \
                self.client_module.self_attn.num_heads

    def attention(self):
        """Return fused qkv weight/bias and out_proj weight/bias."""
        qw = self.client_module.self_attn.q_proj.weight
        qb = self.client_module.self_attn.q_proj.bias
        kw = self.client_module.self_attn.k_proj.weight
        kb = self.client_module.self_attn.k_proj.bias
        vw = self.client_module.self_attn.v_proj.weight
        vb = self.client_module.self_attn.v_proj.bias

        # q/k/v checkpoints are separate; fuse along the output dim
        qkvw = Parameter(torch.cat((qw, kw, vw), dim=0), requires_grad=False)
        qkvb = Parameter(torch.cat((qb, kb, vb), dim=0), requires_grad=False)

        return qkvw, \
               qkvb, \
               self.client_module.self_attn.out_proj.weight, \
               self.client_module.self_attn.out_proj.bias

    def mlp(self):
        """Return fc1/fc2 weights and biases."""
        return self.client_module.fc1.weight, \
               self.client_module.fc1.bias, \
               self.client_module.fc2.weight, \
               self.client_module.fc2.bias

    def layernorm(self):
        """Return final then self-attention layernorm weights and biases."""
        return self.client_module.final_layer_norm.weight, \
               self.client_module.final_layer_norm.bias, \
               self.client_module.self_attn_layer_norm.weight, \
               self.client_module.self_attn_layer_norm.bias
deepspeed/module_inject/containers/unet.py
0 → 100644
View file @
67ea635f
'''
Copyright 2022 The Microsoft DeepSpeed Team
'''
import
torch
from
torch.nn.parameter
import
Parameter
from
..policy
import
DSPolicy
from
...model_implementations.diffusers.unet
import
DSUNet
class UNetPolicy(DSPolicy):
    """Injection policy for the diffusers UNet2DConditionModel."""

    def __init__(self):
        super().__init__()
        try:
            import diffusers
            self._orig_layer_class = diffusers.models.unet_2d_condition.UNet2DConditionModel
        except ImportError:
            self._orig_layer_class = None

    def match(self, module):
        """True when `module` is the stock diffusers UNet."""
        return isinstance(module, self._orig_layer_class)

    def match_replaced(self, module):
        """True when `module` has already been swapped for DSUNet."""
        return isinstance(module, DSUNet)

    def apply(self, module, enable_cuda_graph=True):
        # TODO(cmikeh2): Enable cuda graph should be an inference configuration
        return DSUNet(module, enable_cuda_graph=enable_cuda_graph)

    def attention(self, client_module):
        """Extract attention parameters from a diffusers attention block.

        Self-attention (q and k share the input dim) returns a fused qkv
        tensor; cross-attention keeps q, k, v separate.
        """
        query_w = client_module.to_q.weight
        key_w = client_module.to_k.weight
        value_w = client_module.to_v.weight
        out_proj = client_module.to_out[0]

        if query_w.shape[1] == key_w.shape[1]:
            fused_qkv = Parameter(torch.cat((query_w, key_w, value_w), dim=0),
                                  requires_grad=False)
            return fused_qkv, \
                   out_proj.weight, \
                   out_proj.bias, \
                   query_w.shape[-1], \
                   client_module.heads
        else:
            #return None
            #kvw = Parameter(torch.cat((kw, vw), dim=0), requires_grad=False)
            return query_w, \
                   key_w, \
                   value_w, \
                   out_proj.weight, \
                   out_proj.bias, \
                   query_w.shape[-1], \
                   client_module.heads
deepspeed/module_inject/containers/vae.py
0 → 100644
View file @
67ea635f
'''
Copyright 2022 The Microsoft DeepSpeed Team
'''
from
..policy
import
DSPolicy
from
...model_implementations.diffusers.vae
import
DSVAE
class VAEPolicy(DSPolicy):
    """Injection policy for the diffusers AutoencoderKL (VAE)."""

    def __init__(self):
        super().__init__()
        try:
            import diffusers
            if hasattr(diffusers.models.vae, "AutoencoderKL"):
                self._orig_layer_class = diffusers.models.vae.AutoencoderKL
            else:
                # Diffusers >= 0.12.0 changes location of AutoencoderKL
                self._orig_layer_class = diffusers.models.autoencoder_kl.AutoencoderKL
        except ImportError:
            self._orig_layer_class = None

    def match(self, module):
        """True when `module` is the stock diffusers VAE."""
        return isinstance(module, self._orig_layer_class)

    def match_replaced(self, module):
        """True when `module` has already been swapped for DSVAE."""
        return isinstance(module, DSVAE)

    def apply(self, module, enable_cuda_graph=True):
        # TODO(cmikeh2): Enable cuda graph should be an inference configuration
        return DSVAE(module, enable_cuda_graph=enable_cuda_graph)

    # NOTE (lekurile): Should we have a diffusers policy class?
    def attention(self):
        """No attention extraction is defined for the VAE policy."""
        pass
deepspeed/module_inject/inject.py
100644 → 100755
View file @
67ea635f
'''Copyright The Microsoft DeepSpeed Team'''
import
copy
import
torch
from
deepspeed.ops.transformer
import
DeepSpeedTransformerLayer
,
DeepSpeedTransformerConfig
...
...
deepspeed/module_inject/layers.py
0 → 100644
View file @
67ea635f
'''Copyright The Microsoft DeepSpeed Team'''
import
torch
from
deepspeed
import
comm
as
dist
from
torch
import
nn
from
torch.nn
import
functional
as
F
from
torch.nn.parameter
import
Parameter
from
deepspeed.accelerator
import
get_accelerator
class LinearAllreduce(nn.Module):
    """Linear layer whose partial products are summed across a model-parallel
    group (when one is given) before the bias is applied."""

    def __init__(self, weight, bias=None, mp_group=None):
        super(LinearAllreduce, self).__init__()
        self.weight = weight
        self.bias = bias
        self.mp_group = mp_group

    def forward(self, input):
        # x @ W^T over the last two dims of the (possibly sharded) weight
        result = torch.matmul(input, self.weight.transpose(-1, -2))
        if self.mp_group is not None:
            # sum the partial results contributed by each model-parallel rank
            dist.all_reduce(result, group=self.mp_group)
        if self.bias is not None:
            # bias is added only after the reduction so it is counted once
            result += self.bias
        return result
class LinearLayer(nn.Module):
    """Plain linear layer.

    Either wraps an existing `weight` (and optional `bias`), or allocates
    uninitialized parameters of `weight_shape` on the current accelerator.
    """

    def __init__(self, weight_shape=None, dtype=torch.half, weight=None, bias=None):
        super(LinearLayer, self).__init__()
        if weight is None:
            # allocate fresh (uninitialized) parameters on the accelerator
            device = get_accelerator().current_device_name()
            self.weight = Parameter(torch.empty(weight_shape, dtype=dtype, device=device))
            # a bias is only allocated when a bias value was passed in
            self.bias = Parameter(torch.empty(weight_shape[0], dtype=dtype, device=device)) \
                if bias is not None else None
        else:
            self.weight = weight
            self.bias = bias

    def forward(self, input):
        result = torch.matmul(input, self.weight.transpose(-1, -2))
        if self.bias is not None:
            result += self.bias
        return result
class Normalize(nn.Module):
    """LayerNorm wrapper that places the norm on the current accelerator and
    exposes its weight/bias as attributes of this module."""

    def __init__(self, dim, dtype=torch.float, eps=1e-5):
        super(Normalize, self).__init__()
        layer = nn.LayerNorm(dim, eps=eps).to(dtype)
        self.norm = layer.to(get_accelerator().current_device_name())
        # surface the norm's parameters so checkpoint loaders can find them
        self.weight = self.norm.weight
        self.bias = self.norm.bias

    def forward(self, input):
        return self.norm(input)
class EmbeddingLayer(nn.Module):
    """Embedding table allocated (uninitialized) on the current accelerator.

    `weight_shape` is (num_embeddings, embedding_dim).
    """

    def __init__(self, weight_shape, dtype=torch.half):
        super(EmbeddingLayer, self).__init__()
        vocab_size, embed_dim = weight_shape[0], weight_shape[1]
        self.weight = Parameter(
            torch.empty(vocab_size,
                        embed_dim,
                        dtype=dtype,
                        device=get_accelerator().current_device_name()))

    def forward(self, input):
        # plain table lookup of the rows indexed by `input`
        return F.embedding(input, self.weight)
class OPTEmbedding(EmbeddingLayer):
    """
    This module learns positional embeddings up to a fixed maximum size.
    """

    def __init__(self, weight_shape):
        # OPT is set up so that if padding_idx is specified then offset the embedding ids by 2
        # and adjust num_embeddings appropriately. Other models don't have this hack
        self.offset = 2
        super().__init__(weight_shape)

    def forward(self, attention_mask: torch.LongTensor, past_key_values_length: int = 0):
        """`input_ids_shape` is expected to be [bsz x seqlen]."""
        mask = attention_mask.long()

        # position = (number of attended tokens so far) - 1, with padded
        # positions masked to -1 by the multiplication
        positions = (torch.cumsum(mask, dim=1).type_as(mask) * mask).long() - 1

        # drop positions already covered by the KV cache
        if past_key_values_length:
            positions = positions[:, past_key_values_length:]
        else:
            positions = positions[:, 0:]

        return super().forward(positions + self.offset)
deepspeed/module_inject/load_checkpoint.py
0 → 100644
View file @
67ea635f
'''Copyright The Microsoft DeepSpeed Team'''
from
torch
import
nn
from
deepspeed.model_implementations.transformers.ds_bloom
import
DeepSpeedBloomInference
from
deepspeed.model_implementations.transformers.ds_gpt
import
DeepSpeedGPTInference
from
deepspeed.model_implementations.transformers.ds_bert
import
DeepSpeedBERTInference
from
deepspeed.model_implementations.transformers.ds_megatron_gpt
import
DeepSpeedMegatronGPTInference
from
deepspeed.model_implementations.transformers.ds_opt
import
DeepSpeedOPTInference
import
deepspeed.ops.transformer
as
transformer_inference
from
.layers
import
LinearLayer
,
Normalize
,
EmbeddingLayer
,
OPTEmbedding
import
torch
import
gc
from
deepspeed.accelerator
import
get_accelerator
def
load_model_with_checkpoint
(
r_module
,
sd
,
mp_replace
,
ckpt_type
,
ckpt_mp_size
,
weight_quantizer
=
None
,
rank
=
0
,
container
=
None
):
error_msgs
=
[]
def
transpose
(
data
):
with
torch
.
no_grad
():
data
=
data
.
contiguous
()
data1
=
data
.
transpose
(
-
1
,
-
2
).
reshape
(
-
1
)
data
.
reshape
(
-
1
).
copy_
(
data1
)
data1
=
None
return
data
.
reshape
(
data
.
shape
[
-
1
],
data
.
shape
[
-
2
])
def
load
(
module
,
prefix
):
args
=
(
sd
[
0
],
prefix
,
{},
True
,
[],
[],
error_msgs
)
if
hasattr
(
module
,
'weight'
):
module
.
weight
=
mp_replace
.
copy
(
module
.
weight
.
data
,
sd
[
0
][
prefix
+
'weight'
])
if
prefix
+
'bias'
in
sd
[
0
].
keys
():
if
module
.
bias
.
data
.
is_meta
:
# meta tensor cannot be casted or copied to, so we need to replace it with a normal tensor here
module
.
bias
=
torch
.
nn
.
parameter
.
Parameter
(
data
=
torch
.
empty_like
(
module
.
bias
.
data
,
device
=
"cpu"
),
requires_grad
=
module
.
bias
.
data
.
requires_grad
)
module
.
bias
=
mp_replace
.
copy
(
module
.
bias
.
data
,
sd
[
0
][
prefix
+
'bias'
])
args
=
None
gc
.
collect
()
def
load_transformer_layer
(
module
,
prefix
):
if
ckpt_type
==
"tp"
:
def
load_parameters
(
module
,
prefix
):
for
n
,
p
in
module
.
named_parameters
():
if
prefix
+
n
in
sd
[
0
]
and
len
(
n
.
split
(
'.'
))
==
1
:
if
type
(
sd
[
0
][
prefix
+
n
])
is
list
:
tmp_data
,
scale
=
sd
[
0
][
prefix
+
n
]
tmp_data
=
tmp_data
scale
=
scale
.
to
(
get_accelerator
().
current_device_name
())
# set the quantizer number of groups using the checkpoint scale shape
weight_quantizer
.
num_groups
=
scale
.
shape
[
0
]
else
:
tmp_data
=
sd
[
0
][
prefix
+
n
].
to
(
get_accelerator
().
current_device_name
())
scale
=
None
src_shape
=
tmp_data
.
shape
dst_shape
=
p
.
shape
inner_dim
=
1
if
tmp_data
.
dtype
==
torch
.
int8
else
0
outer_dim
=
0
if
tmp_data
.
dtype
==
torch
.
int8
else
1
if
(
len
(
src_shape
)
==
2
and
len
(
dst_shape
)
==
2
):
if
(
src_shape
[
inner_dim
]
==
dst_shape
[
0
]
and
src_shape
[
outer_dim
]
==
dst_shape
[
1
]):
if
tmp_data
.
dtype
!=
torch
.
int8
:
p
=
weight_quantizer
.
quantize
(
transpose
(
tmp_data
)
if
weight_quantizer
.
q_int8
else
tmp_data
)
else
:
p
=
torch
.
nn
.
parameter
.
Parameter
(
tmp_data
,
requires_grad
=
False
)
p
.
scale
=
scale
setattr
(
module
,
n
,
p
)
else
:
dim
=
inner_dim
if
src_shape
[
inner_dim
]
!=
dst_shape
[
0
]
else
outer_dim
dim1
=
0
if
src_shape
[
inner_dim
]
!=
dst_shape
[
0
]
else
1
if
src_shape
[
dim
]
>
dst_shape
[
dim1
]:
weight_partition
=
torch
.
split
(
tmp_data
,
dst_shape
[
dim1
],
dim
=
dim
)[
rank
].
to
(
get_accelerator
().
current_device_name
())
assert
tmp_data
.
dtype
!=
torch
.
int8
or
scale
.
numel
()
>
weight_quantizer
.
num_groups
*
(
rank
+
1
),
\
'''ERROR: We require the quantization scales for larger TP-size when loading INT8 checkpoint!
\
Please use the FP16 checkpoint to generate INT8 checkpoint with the sharding parameters!'''
scale
=
scale
.
view
(
-
1
)[
weight_quantizer
.
num_groups
*
(
rank
+
1
):].
reshape
(
weight_quantizer
.
num_groups
,
-
1
).
contiguous
()
else
:
assert
tmp_data
.
dtype
!=
torch
.
int8
,
\
'''Merging of the checkpoints are not supported when using INT8 checkpoint!
\
Please use a as many GPUs as TP-size for the checkpoint'''
all_data
=
[
sd
[
j
][
prefix
+
n
]
if
type
(
sd
[
j
][
prefix
+
n
])
is
list
else
sd
[
j
][
prefix
+
n
].
to
(
get_accelerator
().
current_device_name
())
for
j
in
range
(
len
(
sd
))
]
# Check if the weight tensor is for the QKV parameter
if
src_shape
[
1
]
==
(
3
*
src_shape
[
0
])
//
ckpt_mp_size
:
qkv_size
=
src_shape
[
outer_dim
]
//
3
src_split
=
[
torch
.
split
(
src
[
0
].
data
,
qkv_size
,
dim
=
outer_dim
)
for
src
in
all_data
]
weight_partition
=
torch
.
cat
([
torch
.
cat
([
qkv_s
[
i
]
for
qkv_s
in
src_split
],
axis
=
outer_dim
)
for
i
in
range
(
len
(
src_split
[
0
]))
],
dim
=
dim
)
else
:
weight_partition
=
torch
.
cat
([
ad
[
0
].
to
(
get_accelerator
().
current_device_name
())
if
type
(
ad
)
is
list
else
ad
for
ad
in
all_data
],
dim
=
dim
)
if
tmp_data
.
dtype
==
torch
.
int8
:
scale
=
torch
.
cat
([
ad
[
1
].
to
(
get_accelerator
().
current_device_name
())
for
ad
in
all_data
],
dim
=
dim
)
if
tmp_data
.
dtype
!=
torch
.
int8
:
weight_partition
=
weight_quantizer
.
quantize
(
transpose
(
weight_partition
),
\
parallel_dim
=
(
0
if
dim
==
1
else
1
))
if
weight_quantizer
.
q_int8
else
\
weight_quantizer
.
quantize
(
weight_partition
)
else
:
weight_partition
=
torch
.
nn
.
parameter
.
Parameter
(
weight_partition
,
requires_grad
=
False
)
weight_partition
.
scale
=
scale
setattr
(
module
,
n
,
weight_partition
)
else
:
if
src_shape
[
0
]
==
dst_shape
[
0
]:
p
.
data
.
copy_
(
tmp_data
)
else
:
if
src_shape
[
0
]
>
dst_shape
[
0
]:
bias_split
=
torch
.
split
(
tmp_data
,
dst_shape
[
-
1
])[
rank
].
to
(
get_accelerator
(
).
current_device_name
()).
contiguous
()
p
.
data
.
copy_
(
bias_split
)
else
:
# Check if the weight tensor is for the QKV parameter
if
src_shape
[
0
]
==
(
3
*
r_module
.
config
.
hidden_size
)
//
ckpt_mp_size
:
qkv_size
=
src_shape
[
0
]
//
3
src_split
=
[
torch
.
split
(
sd
[
j
][
prefix
+
n
],
qkv_size
,
dim
=
0
)
for
j
in
range
(
len
(
sd
))
]
p
.
data
.
copy_
(
torch
.
cat
(
[
torch
.
cat
([
qkv_s
[
i
]
for
qkv_s
in
src_split
],
axis
=
0
)
for
i
in
range
(
len
(
src_split
[
0
]))
],
dim
=
0
).
to
(
get_accelerator
(
).
current_device_name
()).
contiguous
())
else
:
p
.
data
.
copy_
(
torch
.
cat
(
[
sd
[
j
][
prefix
+
n
]
for
j
in
range
(
len
(
sd
))
],
dim
=
0
).
to
(
get_accelerator
(
).
current_device_name
()).
contiguous
())
load_parameters
(
module
,
prefix
)
for
n
,
child
in
module
.
named_children
():
load_parameters
(
child
,
prefix
+
n
+
'.'
)
else
:
container
.
load_params
(
module
,
sd
[
0
],
weight_quantizer
,
mp_replace
,
prefix
)
# Resolve the OPT positional-embedding class if a sufficiently new
# `transformers` is installed; otherwise fall back to None so the
# layer-policy table below simply never matches it.
try:
    import transformers
    OPTLearnedPositionalEmbedding = transformers.models.opt.modeling_opt.OPTLearnedPositionalEmbedding
except (ImportError, AttributeError):
    # ImportError: transformers not installed.
    # AttributeError: transformers too old to ship the OPT model.
    # (Was a bare `except:`, which would also swallow KeyboardInterrupt.)
    OPTLearnedPositionalEmbedding = None
# Loader dispatch table: plain leaf modules are handled by `load`,
# fused transformer inference blocks by `load_transformer_layer`.
_leaf_module_classes = (
    nn.Linear,
    nn.Embedding,
    nn.LayerNorm,
    EmbeddingLayer,
    LinearLayer,
    Normalize,
    OPTLearnedPositionalEmbedding,
    OPTEmbedding,
)
_transformer_block_classes = (
    transformer_inference.DeepSpeedTransformerInference,
    DeepSpeedBloomInference,
    DeepSpeedGPTInference,
    DeepSpeedBERTInference,
    DeepSpeedMegatronGPTInference,
    DeepSpeedOPTInference,
)
layer_policies = {cls: load for cls in _leaf_module_classes}
layer_policies.update({cls: load_transformer_layer for cls in _transformer_block_classes})
# Maps a ZeRO parameter's ds_id to the concrete weight tensor that was
# materialized for it, so modules sharing a parameter can alias it.
all_ds_ids = {}

def load_module_recursive(module, prefix='', level=0):
    """Walk the module tree, materializing placeholder (meta / ZeRO)
    submodules and dispatching each recognized leaf to its loader from
    ``layer_policies``.

    Args:
        module: current (sub)module being visited.
        prefix: dotted state-dict key prefix accumulated so far.
        level: recursion depth; 0 is the model root.
    """
    for name, child in module.named_children():
        if child.__class__ in layer_policies:
            checking_key = prefix + name + '.'
            if not any(checking_key in item for item in sd[0].keys()):
                # No checkpoint entry for this submodule. If its weight
                # carries a ds_id we have already materialized, alias the
                # existing tensor instead of loading anything.
                # (Removed unused local `prefix1 = all_ds_ids[...]` —
                # the lookup result was never read.)
                if hasattr(child, 'weight') and \
                    (hasattr(child.weight, 'ds_id')
                     and child.weight.ds_id in all_ds_ids):
                    if child.__class__ is nn.Linear:
                        child = LinearLayer(weight=all_ds_ids[child.weight.ds_id])
                        setattr(module, name, child)
                continue
            child_params = list(child.parameters())
            if len(child_params) > 0 and (child_params[0].numel() == 0
                                          or child_params[0].is_meta):
                # Parameters are placeholders (meta tensor or empty ZeRO
                # partition): swap the module for a concrete implementation
                # with real storage of the recorded shape.
                if child.weight.is_meta:
                    ds_shape = child.weight.shape
                else:
                    ds_shape = child.weight.ds_shape
                if child.__class__ is nn.LayerNorm:
                    child = Normalize(dim=ds_shape[-1],
                                      dtype=child.weight.dtype,
                                      eps=child.eps)
                    setattr(module, name, child)
                elif child.__class__ is nn.Linear:
                    child = LinearLayer(weight_shape=child.weight.shape,
                                        bias=child.bias)
                    setattr(module, name, child)
                elif child.__class__ is OPTLearnedPositionalEmbedding:
                    child = OPTEmbedding(weight_shape=ds_shape)
                    setattr(module, name, child)
                else:
                    ds_id = None
                    if hasattr(child.weight, 'ds_id'):
                        ds_id = child.weight.ds_id
                    child = EmbeddingLayer(weight_shape=ds_shape,
                                           dtype=child.weight.dtype)
                    if ds_id is not None:
                        # Remember the materialized weight so later modules
                        # sharing this ds_id can alias it (see above).
                        all_ds_ids[ds_id] = child.weight
                    setattr(module, name, child)
            layer_policies[child.__class__](child, prefix + name + '.')
        else:
            # Not a recognized leaf: recurse. For pipeline-parallel ('pp')
            # checkpoints whose policy sets use_load_prefix, keep the root
            # prefix unchanged at the first level instead of extending it
            # with the child name.
            load_module_recursive(
                child,
                prefix if (level == 0 and ckpt_type == 'pp') and container.policy.use_load_prefix else
                prefix + name + '.',
                level + 1)
# Materialize and load every weight in the replaced module hierarchy.
load_module_recursive(r_module)

# Weight tying: if the LM head was never loaded (still a meta tensor),
# point it at the token-embedding weight found by name.
embedding_weight = None
for n, p in r_module.named_parameters():
    if "word_embeddings." in n or "embed_tokens." in n or "wte." in n:
        embedding_weight = p
# hasattr guard added: not every architecture exposes `lm_head`, and the
# original unconditional attribute access would raise for those models.
if embedding_weight is not None and hasattr(r_module, 'lm_head') and \
        r_module.lm_head.weight.is_meta:
    r_module.lm_head.weight = embedding_weight

# Release our reference to the checkpoint shards so their (potentially
# huge) tensors can be reclaimed. NOTE: the original also looped
# `del sd_` over the list, but deleting the loop variable only unbinds
# a local name and frees nothing — clearing the list reference is what
# actually releases the state dicts.
sd = None
gc.collect()
Prev
1
…
7
8
9
10
11
12
13
14
15
…
17
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment