OpenDAS / deepspeed · Commits

Commit 5bcc463d
Authored May 29, 2023 by aiss
update v0.9.2
Parent: ac5fbab4
Changes: showing 20 of the 554 changed files, with 748 additions and 350 deletions (+748, -350).
deepspeed/module_inject/containers/base.py (+304, -58)
deepspeed/module_inject/containers/base_moe.py (+23, -34)
deepspeed/module_inject/containers/bert.py (+21, -5)
deepspeed/module_inject/containers/bloom.py (+34, -40)
deepspeed/module_inject/containers/clip.py (+14, -2)
deepspeed/module_inject/containers/distil_bert.py (+16, -5)
deepspeed/module_inject/containers/features/__init__.py (+4, -1)
deepspeed/module_inject/containers/features/megatron.py (+61, -16)
deepspeed/module_inject/containers/features/meta_tensor.py (+10, -7)
deepspeed/module_inject/containers/gpt2.py (+14, -3)
deepspeed/module_inject/containers/gptj.py (+40, -28)
deepspeed/module_inject/containers/gptneo.py (+41, -28)
deepspeed/module_inject/containers/gptneox.py (+34, -23)
deepspeed/module_inject/containers/megatron_gpt.py (+15, -6)
deepspeed/module_inject/containers/megatron_gpt_moe.py (+5, -1)
deepspeed/module_inject/containers/opt.py (+50, -36)
deepspeed/module_inject/containers/unet.py (+6, -3)
deepspeed/module_inject/containers/vae.py (+6, -3)
deepspeed/module_inject/inject.py (+17, -29)
deepspeed/module_inject/layers.py (+33, -22)
deepspeed/module_inject/containers/base.py  (view file @ 5bcc463d)

-'''Copyright The Microsoft DeepSpeed Team'''
+# Copyright (c) Microsoft Corporation.
+# SPDX-License-Identifier: Apache-2.0
+
+# DeepSpeed Team

 # Create a container object to save model-specific tensors using the policy file above.
 from abc import ABC
...
@@ -15,6 +18,7 @@ class BaseConvolutionContainer(ABC):

 class BaseTransformerContainer(ABC):

     def __init__(self, policy, config, model_config, layer_id, child):
         self.policy = policy
         self.config = config
...
@@ -30,28 +34,22 @@ class BaseTransformerContainer(ABC):
         self.hidden_size = None
         self.num_attention_heads = None
         self.mp_size = self.config.tensor_parallel.tp_size
-        self.pre_layer_norm = self.policy.pre_attn_norm
+        self.pre_layer_norm = self.model_config.do_layer_norm_before if \
+            hasattr(self.model_config, 'do_layer_norm_before') else self.policy.pre_attn_norm
         self.fp16 = False
         self.attn_linear_layer = self.policy.linear_layer
         self.mlp_linear_layer = self.policy.linear_layer
-        self.layer_norm_eps = self.model_config.layer_norm_eps if \
-            hasattr(self.model_config, 'layer_norm_eps') else (self.model_config.layer_norm_epsilon if \
-            hasattr(self.model_config, 'layer_norm_epsilon') else self.model_config.layernorm_epsilon if \
-            hasattr(self.model_config, 'layernorm_epsilon') else 1.0e-12)
         self.return_tuple = self.config.return_tuple
         self.triangular_masking = True
         self.local_attention = ((self.model_config.attention_layers[self.layer_id] == "local")
                                 if hasattr(self.model_config, 'attention_layers') else False)
         self.window_size = getattr(self.model_config, "window_size", 1)
         self.mlp_act_func_type = self.policy.mlp_act_func_type
         self.training_mp_size = self.config.training_mp_size
         self.bigscience_bloom = False
         self.max_out_tokens = self.config.max_out_tokens
+        self.min_out_tokens = self.config.min_out_tokens
         self.scale_attn_by_inverse_layer_idx = getattr(self.config, "scale_attn_by_inverse_layer_idx", False)
         self.use_mup = self.policy.use_mup
         self.return_single_tuple = False
         self.rotary_dim = self.model_config.rotary_dim if hasattr(self.model_config, 'rotary_dim') \
...
@@ -75,6 +73,8 @@ class BaseTransformerContainer(ABC):
         self.input_nw = None
         self.input_nb = None
+        self.mp_group = None

     def create_ds_model_config(self):
         self.set_hidden_heads(*self.policy.get_hidden_heads())
         assert self.num_attention_heads % self.mp_size == 0, \
...
@@ -84,11 +84,11 @@ class BaseTransformerContainer(ABC):
         self.ds_model_config = DeepSpeedInferenceConfig(
             hidden_size=self.hidden_size,
             heads=self.num_attention_heads,
-            layer_norm_eps=self.layer_norm_eps,
+            layer_norm_eps=self.layernorm_epsilon,
             fp16=self.fp16,
             pre_layer_norm=self.pre_layer_norm,
             mp_size=self.mp_size,
-            q_int8=self.quantize,
+            q_int8=self.quantize if hasattr(self, 'quantize') else False,
             return_tuple=self.return_tuple,
             triangular_masking=self.triangular_masking,
             local_attention=self.local_attention,
...
@@ -99,18 +99,24 @@ class BaseTransformerContainer(ABC):
             training_mp_size=self.training_mp_size,
             bigscience_bloom=self.bigscience_bloom,
             max_out_tokens=self.max_out_tokens,
+            min_out_tokens=self.min_out_tokens,
             scale_attn_by_inverse_layer_idx=self.scale_attn_by_inverse_layer_idx,
             use_mup=self.use_mup,
             return_single_tuple=self.return_single_tuple,
-        )
+            set_empty_params=self.config.set_empty_params,
+            transposed_mode=self.config.transposed_mode)
         return self.ds_model_config

-    def initialize_tensors(self):
+    def initialize_tensors(self, enable_training=False):
         # Set the tensors from policy (user module) to container (DS module)
-        self.set_attention(*self.policy.attention())
+        self.set_attention(*self.policy.attention(enable_training=enable_training))
         self.set_mlp(*self.policy.mlp())
         self.set_layernorm(*self.policy.layernorm())
+        self.set_lora_params(self.policy.get_lora_params())
+        self.q_k_v = self.policy.get_q_k_v()
+        if self.q_k_v is not None:
+            self.set_q_k_v(*self.q_k_v)

     def convert_to_required_dtype(self, dtype):
         # Note: converting tensors to fp16 requires that we do it in-place using self.__dict__ and not make a list/dict copy
...
@@ -138,9 +144,10 @@ class BaseTransformerContainer(ABC):
         self.quantize = quantize
         self.quantizer = quantizer

-    def set_hidden_heads(self, hidden_size, num_attention_heads):
+    def set_hidden_heads(self, hidden_size, num_attention_heads, epsilon):
         self.hidden_size = hidden_size
         self.num_attention_heads = num_attention_heads
+        self.layernorm_epsilon = epsilon

     def set_attention(self, qkvw, qkvb, dense_w, dense_b):
         self.qkvw = qkvw
...
@@ -148,6 +155,17 @@ class BaseTransformerContainer(ABC):
         self.dense_w = dense_w
         self.dense_b = dense_b

+    def set_lora_params(self, lora_params):
+        self.lora_params = lora_params
+
+    def set_q_k_v(self, qw, qb, kw, kb, vw, vb):
+        self.qw = qw
+        self.qb = qb
+        self.kw = kw
+        self.kb = kb
+        self.vw = vw
+        self.vb = vb
+
     def set_mlp(self, _h4h_w, _h4h_b, _4hh_w, _4hh_b):
         self._h4h_w = _h4h_w
         self._h4h_b = _h4h_b
...
@@ -168,63 +186,184 @@ class BaseTransformerContainer(ABC):
             self.mlp_quantization()

     def attention_quantization(self):
         self.module.attention.attn_qkvw = self.quantizer.quantize(self.module.attention.attn_qkvw)
         self.module.attention.attn_ow = self.quantizer.quantize(self.module.attention.attn_ow)

     def mlp_quantization(self):
         self.module.mlp.inter_w = self.quantizer.quantize(self.module.mlp.inter_w)
         self.module.mlp.output_w = self.quantizer.quantize(self.module.mlp.output_w)

-    def apply_tensor_parallelism(self, mp_replace):
+    def apply_tensor_parallelism(self, mp_replace=None, mp_group=None, tp_size=None):
+        reversed_dim = False
+        if mp_replace is None:
+            from deepspeed.module_inject import ReplaceWithTensorSlicing
+            mp_replace = ReplaceWithTensorSlicing(mp_group=mp_group, mp_size=tp_size, out_dim=0, in_dim=1)
+            reversed_dim = True
         # setup the new Attention module
-        self.attention_qkv_mp(mp_replace)
-        self.attention_o_mp(mp_replace)
+        if self.module.attention.attn_qkvw is None:
+            self.attention_q_k_v_mp(mp_replace, reversed_dim=reversed_dim)
+        else:
+            self.attention_qkv_mp(mp_replace, reversed_dim=reversed_dim)
+        self.attention_o_mp(mp_replace, reversed_dim=reversed_dim)

         # setup the new MLP module
-        self.mlp_inter_mp(mp_replace)
-        self.mlp_output_mp(mp_replace)
+        self.mlp_inter_mp(mp_replace, reversed_dim=reversed_dim)
+        self.mlp_output_mp(mp_replace, reversed_dim=reversed_dim)

         # Apply weight quantization
-        self.apply_weight_quantization()
+        #self.apply_weight_quantization()

-    def attention_qkv_mp(self, mp_replace):
-        self.module.attention.attn_qkvw = mp_replace.qkv_copy(self.module.attention.attn_qkvw, self.qkvw)
-        self.module.attention.attn_qkvb = mp_replace.qkv_copy(self.module.attention.attn_qkvb, self.qkvb)
-
-    def attention_o_mp(self, mp_replace):
-        self.module.attention.attn_ow = mp_replace.copy(self.module.attention.attn_ow, self.dense_w)
-        self.module.attention.attn_ob = mp_replace.copy(self.module.attention.attn_ob, self.dense_b)
-
-    def mlp_inter_mp(self, mp_replace):
-        self.module.mlp.inter_w = mp_replace.copy(self.module.mlp.inter_w, self._h4h_w)
-        self.module.mlp.inter_b = mp_replace.copy(self.module.mlp.inter_b, self._h4h_b)
-
-    def mlp_output_mp(self, mp_replace):
-        self.module.mlp.output_w = mp_replace.copy(self.module.mlp.output_w, self._4hh_w)
-        self.module.mlp.output_b = mp_replace.copy(self.module.mlp.output_b, self._4hh_b)
+    def attention_qkv_mp(self, mp_replace, reversed_dim=False):
+        if reversed_dim:
+            self.module.attention.attn_qkvw = mp_replace.qkv_copy(
+                self.module.attention.attn_qkvw[:self.qkvw.shape[0] // mp_replace.mp_size],
+                self.qkvw, int8=reversed_dim)
+            self.module.attention.attn_qkvb = mp_replace.qkv_copy(
+                self.module.attention.attn_qkvb[:self.qkvw.shape[0] // mp_replace.mp_size],
+                self.qkvb, int8=reversed_dim)
+        else:
+            self.module.attention.attn_qkvw = mp_replace.qkv_copy(self.module.attention.attn_qkvw,
+                                                                  self.qkvw, int8=reversed_dim)
+            self.module.attention.attn_qkvb = mp_replace.qkv_copy(self.module.attention.attn_qkvb,
+                                                                  self.qkvb, int8=reversed_dim)
+
+    def attention_q_k_v_mp(self, mp_replace, reversed_dim=False):
+        self.module.attention.attn_qw = mp_replace.copy(
+            self.module.attention.attn_qw[:self.qw.shape[0] // mp_replace.mp_size],
+            self.qw, int8=reversed_dim, allocat_tensor=reversed_dim)
+        self.module.attention.attn_kw = mp_replace.copy(
+            self.module.attention.attn_kw[:self.qw.shape[0] // mp_replace.mp_size],
+            self.kw, int8=reversed_dim, allocat_tensor=reversed_dim)
+        self.module.attention.attn_vw = mp_replace.copy(
+            self.module.attention.attn_vw[:self.qw.shape[0] // mp_replace.mp_size],
+            self.vw, int8=reversed_dim, allocat_tensor=reversed_dim)
+        self.module.attention.attn_qb = mp_replace.copy(
+            self.module.attention.attn_qb[:self.qw.shape[0] // mp_replace.mp_size],
+            self.qb, int8=reversed_dim,
+            allocat_tensor=reversed_dim) if self.module.attention.attn_qb is not None else None
+        self.module.attention.attn_kb = mp_replace.copy(
+            self.module.attention.attn_kb[:self.qw.shape[0] // mp_replace.mp_size],
+            self.kb, int8=reversed_dim,
+            allocat_tensor=reversed_dim) if self.module.attention.attn_kb is not None else None
+        self.module.attention.attn_vb = mp_replace.copy(
+            self.module.attention.attn_vb[:self.qw.shape[0] // mp_replace.mp_size],
+            self.vb, int8=reversed_dim,
+            allocat_tensor=reversed_dim) if self.module.attention.attn_vb is not None else None
+
+    def attention_o_mp(self, mp_replace, reversed_dim=False):
+        if reversed_dim:
+            self.module.attention.attn_ow = mp_replace.copy(
+                self.module.attention.attn_ow[:, :self.dense_w.shape[1] // mp_replace.mp_size],
+                self.dense_w, int8=reversed_dim, allocat_tensor=reversed_dim)
+        else:
+            self.module.attention.attn_ow = mp_replace.copy(self.module.attention.attn_ow,
+                                                            self.dense_w, int8=reversed_dim)
+        self.module.attention.attn_ob = mp_replace.copy(self.module.attention.attn_ob, self.dense_b,
+                                                        int8=reversed_dim, allocat_tensor=reversed_dim)
+
+    def mlp_inter_mp(self, mp_replace, reversed_dim=False):
+        if reversed_dim:
+            self.module.mlp.inter_w = mp_replace.copy(
+                self.module.mlp.inter_w[:self._h4h_w.shape[0] // mp_replace.mp_size],
+                self._h4h_w, int8=reversed_dim, allocat_tensor=reversed_dim)
+            self.module.mlp.inter_b = mp_replace.copy(
+                self.module.mlp.inter_b[:self._h4h_w.shape[0] // mp_replace.mp_size],
+                self._h4h_b, int8=reversed_dim,
+                allocat_tensor=reversed_dim) if self.module.mlp.inter_b is not None else None
+        else:
+            self.module.mlp.inter_w = mp_replace.copy(self.module.mlp.inter_w, self._h4h_w, int8=reversed_dim)
+            self.module.mlp.inter_b = mp_replace.copy(self.module.mlp.inter_b, self._h4h_b, int8=reversed_dim)
+
+    def mlp_output_mp(self, mp_replace, reversed_dim=False):
+        if reversed_dim:
+            self.module.mlp.output_w = mp_replace.copy(
+                self.module.mlp.output_w[:, :self._4hh_w.shape[1] // mp_replace.mp_size],
+                self._4hh_w, int8=reversed_dim, allocat_tensor=reversed_dim)
+        else:
+            self.module.mlp.output_w = mp_replace.copy(self.module.mlp.output_w, self._4hh_w, int8=reversed_dim)
+        self.module.mlp.output_b = mp_replace.copy(self.module.mlp.output_b, self._4hh_b,
+                                                   int8=reversed_dim, allocat_tensor=reversed_dim)

     def release_qkv(self):
         del self.module.attention.attn_qkvw
         del self.module.attention.attn_qkvb
         self.module.attention.attn_qkvw = self.qkvw
         self.module.attention.attn_qkvb = self.qkvb
+
+        if self.module.attention.attn_qw is not None:
+            qkv_data = [self.module.attention.attn_qw.data, \
+                        self.module.attention.attn_qb.data if self.module.attention.attn_qb is not None else None, \
+                        self.module.attention.attn_kw.data, \
+                        self.module.attention.attn_kb.data if self.module.attention.attn_kb is not None else None, \
+                        self.module.attention.attn_vw.data, \
+                        self.module.attention.attn_vb.data if self.module.attention.attn_vb is not None else None]
+            for data in qkv_data:
+                del data
+            self.module.attention.attn_qw = self.qw
+            self.module.attention.attn_qb = self.qb
+            self.module.attention.attn_kw = self.kw
+            self.module.attention.attn_kb = self.kb
+            self.module.attention.attn_vw = self.vw
+            self.module.attention.attn_vb = self.vb

     def release_memory(self):
         self.release_qkv()
         del self.module.attention.attn_ow
         del self.module.attention.attn_ob
         self.module.attention.attn_ow = self.dense_w
         self.module.attention.attn_ob = self.dense_b
         del self.module.mlp.inter_w
         del self.module.mlp.inter_b
         del self.module.mlp.output_w
         del self.module.mlp.output_b
         self.module.mlp.inter_w = self._h4h_w
         self.module.mlp.inter_b = self._h4h_b
         self.module.mlp.output_w = self._4hh_w
         self.module.mlp.output_b = self._4hh_b

     def copy_data_to_new_module(self):
-        self.module.mlp.attn_nw.data.copy_(self.attn_nw.to(get_accelerator().current_device_name()))
-        self.module.mlp.attn_nb.data.copy_(self.attn_nb.to(get_accelerator().current_device_name()))
+        if self.attn_nw is None:
+            self.module.mlp.attn_nw = self.attn_nw
+            self.module.mlp.attn_nb = self.attn_nb
+        else:
+            self.module.mlp.attn_nw.data.copy_(self.attn_nw.to(get_accelerator().current_device_name()))
+            self.module.mlp.attn_nb.data.copy_(self.attn_nb.to(get_accelerator().current_device_name()))
         self.module.norm_w.data.copy_(self.input_nw.to(get_accelerator().current_device_name()))
         self.module.norm_b.data.copy_(self.input_nb.to(get_accelerator().current_device_name()))

+    def align_merged_qkv(self):
+        if hasattr(self, '_align_merged_qkv'):
+            self._align_merged_qkv()
+
+    def partition_merged_qkv(self):
+        if hasattr(self, '_partition_merged_qkv'):
+            self._partition_merged_qkv()
+
     def transpose(self):
         self.transpose_attention()
...
@@ -246,3 +385,110 @@ class BaseTransformerContainer(ABC):
             data = data.reshape(data.shape[-1], data.shape[-2])
         data.to(get_accelerator().current_device_name())
         return data
+
+    def reset_qkv_experimental(self):
+        if self.module.attention.attn_qkvw is None:
+            self.module.attention.attn_qkvw = torch.empty(self.qw.shape[0] * 3,
+                                                          self.qw.shape[0],
+                                                          dtype=self.qw.dtype,
+                                                          device=self.qw.device)
+            self.module.attention.attn_qkvb = torch.empty(self.qw.shape[0] * 3,
+                                                          dtype=self.qw.dtype,
+                                                          device=self.qw.device)
+        self.module.attention.attn_qkvw.data[:self.qw.shape[0]] = self.qw.data
+        self.module.attention.attn_qkvb.data[:self.qw.shape[0]] = self.qb.data
+        self.module.attention.attn_qkvw.data[self.qw.shape[0]:2 * self.qw.shape[0]] = self.kw.data
+        self.module.attention.attn_qkvb.data[self.qw.shape[0]:2 * self.qw.shape[0]] = self.kb.data
+        self.module.attention.attn_qkvw.data[2 * self.qw.shape[0]:] = self.vw.data
+        self.module.attention.attn_qkvb.data[2 * self.qw.shape[0]:] = self.vb.data
+
+        qkv_data = [self.qw.data, \
+                    self.qb.data, \
+                    self.kw.data, \
+                    self.kb.data, \
+                    self.vw.data, \
+                    self.vb.data]
+
+        self.qw.data = self.module.attention.attn_qkvw.data[:self.qw.shape[0]]
+        self.qb.data = self.module.attention.attn_qkvb.data[:self.qw.shape[0]]
+        self.kw.data = self.module.attention.attn_qkvw.data[self.qw.shape[0]:2 * self.qw.shape[0]]
+        self.kb.data = self.module.attention.attn_qkvb.data[self.qw.shape[0]:2 * self.qw.shape[0]]
+        self.vw.data = self.module.attention.attn_qkvw.data[2 * self.qw.shape[0]:]
+        self.vb.data = self.module.attention.attn_qkvb.data[2 * self.qw.shape[0]:]
+
+        for data in qkv_data:
+            del data
+
+    def reset_qkv(self):
+        self.qkvw.data[:self.qw.shape[0]] = self.qw.data
+        self.qkvw.data[self.qw.shape[0]:2 * self.qw.shape[0]] = self.kw.data
+        self.qkvw.data[2 * self.qw.shape[0]:] = self.vw.data
+        if self.qkvb is not None:
+            self.qkvb.data[:self.qw.shape[0]] = self.qb.data
+            self.qkvb.data[self.qw.shape[0]:2 * self.qw.shape[0]] = self.kb.data
+            self.qkvb.data[2 * self.qw.shape[0]:] = self.vb.data
+
+        qkv_data = [self.qw.data, \
+                    self.qb.data if self.qb is not None else None, \
+                    self.kw.data, \
+                    self.kb.data if self.kb is not None else None, \
+                    self.vw.data, \
+                    self.vb.data if self.vb is not None else None]
+
+        self.qw.data = self.qkvw.data[:self.qw.shape[0]]
+        self.kw.data = self.qkvw.data[self.qw.shape[0]:2 * self.qw.shape[0]]
+        self.vw.data = self.qkvw.data[2 * self.qw.shape[0]:]
+        if self.qkvb is not None:
+            self.qb.data = self.qkvb.data[:self.qw.shape[0]]
+            self.kb.data = self.qkvb.data[self.qw.shape[0]:2 * self.qw.shape[0]]
+            self.vb.data = self.qkvb.data[2 * self.qw.shape[0]:]
+
+        for data in qkv_data:
+            del data
+
+    def set_params_wo_copy(self, Z3_enabled=False):
+        self.module.mlp.attn_nw = self.attn_nw
+        self.module.mlp.attn_nb = self.attn_nb
+        self.module.norm_w = self.input_nw
+        self.module.norm_b = self.input_nb
+        self.module.mlp.inter_w = self._h4h_w
+        self.module.mlp.inter_b = self._h4h_b
+        self.module.mlp.output_w = self._4hh_w
+        self.module.mlp.output_b = self._4hh_b
+        self.module.attention.attn_ow = self.dense_w
+        self.module.attention.attn_ob = self.dense_b
+        if not Z3_enabled or self.q_k_v is None:
+            self.module.attention.attn_qkvw = self.qkvw
+            self.module.attention.attn_qkvb = self.qkvb
+        if self.q_k_v is not None:
+            if Z3_enabled:
+                self.module.attention.attn_qw = self.qw
+                self.module.attention.attn_qb = self.qb
+                self.module.attention.attn_kw = self.kw
+                self.module.attention.attn_kb = self.kb
+                self.module.attention.attn_vw = self.vw
+                self.module.attention.attn_vb = self.vb
+            else:
+                self.qw.data = self.qkvw[:self.qw.shape[0], :]
+                self.kw.data = self.qkvw[self.qw.shape[0]:2 * self.qw.shape[0], :]
+                self.vw.data = self.qkvw[self.qw.shape[0] * 2:, :]
+                if self.qkvb is not None:
+                    self.qb.data = self.qkvb[:self.qw.shape[0]]
+                    self.kb.data = self.qkvb[self.qw.shape[0]:2 * self.qw.shape[0]]
+                    self.vb.data = self.qkvb[self.qw.shape[0] * 2:]
+
+    def get_lora_params(self):
+        return self.lora_params
+
+    def get_all_params(self):
+        if self.q_k_v is not None:
+            return [
+                self.attn_nw, self.attn_nb, self.input_nw, self.input_nb, self._h4h_w, self._h4h_b, self._4hh_w,
+                self._4hh_b, self.qw, self.qb, self.kw, self.kb, self.vw, self.vb, self.dense_w, self.dense_b
+            ]
+        else:
+            return [
+                self.attn_nw, self.attn_nb, self.input_nw, self.input_nb, self._h4h_w, self._h4h_b, self._4hh_w,
+                self._4hh_b, self.qkvw, self.qkvb, self.dense_w, self.dense_b
+            ]
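
Note: the reversed_dim paths above slice the destination tensor to a single tensor-parallel shard before copying, via `[:shape[0] // mp_size]`. A minimal, self-contained sketch of that slicing arithmetic, separate from the commit; the shapes, mp_size, and rank loop below are illustrative assumptions:

    import torch

    # Fused QKV weight with the output dimension first, as in the container code.
    mp_size, hidden = 2, 8
    qkvw = torch.randn(3 * hidden, hidden)
    shard_rows = qkvw.shape[0] // mp_size      # rows each rank keeps

    for rank in range(mp_size):
        # Rank 0's share is exactly the [:shape[0] // mp_size] slice used above.
        shard = qkvw[rank * shard_rows:(rank + 1) * shard_rows]
        assert shard.shape == (shard_rows, hidden)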
deepspeed/module_inject/containers/base_moe.py  (view file @ 5bcc463d)

-'''Copyright The Microsoft DeepSpeed Team'''
+# Copyright (c) Microsoft Corporation.
+# SPDX-License-Identifier: Apache-2.0
+
+# DeepSpeed Team

 # Create a container object to save model-specific tensors using the policy file above.
 from .base import *
...
@@ -8,6 +11,7 @@ from deepspeed.accelerator import get_accelerator

 class BaseTransformerMoEContainer(BaseTransformerContainer):

     def __init__(self, **kwargs):
         # Call the init function of the parent class to initialize the tensors and configs from parent class
         super().__init__(**kwargs)
...
@@ -16,9 +20,7 @@ class BaseTransformerMoEContainer(BaseTransformerContainer):
         self.ep_world_size = dist.get_world_size()
         self.local_ep_size = 1 if self.num_experts < self.ep_world_size else self.num_experts // self.ep_world_size

         self.layer_norm_eps = self.config.layer_norm_eps if hasattr(self.config, 'layer_norm_eps') else 1e-12,

         # MoE models will have a list of mlp related tensors
         self._h4h_w = []
...
@@ -102,40 +104,27 @@ class BaseTransformerMoEContainer(BaseTransformerContainer):
         gpu_index = dist.get_rank()
         for ep_index in range(self.local_ep_size):
             # mlp inter
             self.module.mlp[ep_index].inter_w.data = self._h4h_w[gpu_index * self.local_ep_size + ep_index].to(
                 get_accelerator().current_device_name())
             self.module.mlp[ep_index].inter_b.data = self._h4h_b[gpu_index * self.local_ep_size + ep_index].to(
                 get_accelerator().current_device_name())
             # mlp output
             self.module.mlp[ep_index].output_w.data = self._4hh_w[gpu_index * self.local_ep_size + ep_index].to(
                 get_accelerator().current_device_name())
             self.module.mlp[ep_index].output_b.data = self._4hh_b[gpu_index * self.local_ep_size + ep_index].to(
                 get_accelerator().current_device_name())

     def copy_data_to_new_module(self):
         self.module.attn_nw.data = self.attn_nw.to(get_accelerator().current_device_name())
         self.module.attn_nb.data = self.attn_nb.to(get_accelerator().current_device_name())
         self.module.norm_w.data.copy_(self.input_nw.to(get_accelerator().current_device_name()))
         self.module.norm_b.data.copy_(self.input_nb.to(get_accelerator().current_device_name()))

         if self.config.moe.type == 'residual':
             self.module.res_mlp.inter_w.data = self._res_h4h_w.to(get_accelerator().current_device_name())
             self.module.res_mlp.inter_b.data = self._res_h4h_b.to(get_accelerator().current_device_name())
             self.module.res_mlp.output_w.data = self._res_4hh_w.to(get_accelerator().current_device_name())
             self.module.res_mlp.output_b.data = self._res_4hh_b.to(get_accelerator().current_device_name())
             self.module.res_coef.data = self._res_coef.to(get_accelerator().current_device_name())
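
Note: the expert-placement loop above assigns each rank a contiguous block of experts, with local slot ep_index mapping to global expert `gpu_index * local_ep_size + ep_index`. A small sketch of that index arithmetic, separate from the commit; the expert and world-size counts are illustrative assumptions:

    # 8 experts over 4 ranks -> each rank hosts 2 consecutive experts.
    num_experts, ep_world_size = 8, 4
    local_ep_size = 1 if num_experts < ep_world_size else num_experts // ep_world_size

    for gpu_index in range(ep_world_size):
        owned = [gpu_index * local_ep_size + ep_index for ep_index in range(local_ep_size)]
        print(f"rank {gpu_index} hosts experts {owned}")   # rank 0 -> [0, 1], rank 1 -> [2, 3], ...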
deepspeed/module_inject/containers/bert.py  (view file @ 5bcc463d)

-'''Copyright The Microsoft DeepSpeed Team'''
+# Copyright (c) Microsoft Corporation.
+# SPDX-License-Identifier: Apache-2.0
+
+# DeepSpeed Team

 from .base import *
 from deepspeed.model_implementations.transformers.ds_bert import DeepSpeedBERTInference
...
@@ -8,6 +11,7 @@ from ..policy import TransformerPolicy

 class DS_BERTContainer(BaseTransformerContainer):

     def __init__(self, **kwargs):
         super().__init__(**kwargs)
...
@@ -23,6 +27,7 @@ class DS_BERTContainer(BaseTransformerContainer):

 class HFBertLayerPolicy(TransformerPolicy):

     def __init__(self, client_module, inference=False):
         super().__init__(inference, pre_attn_norm=False)
         self.client_module = client_module
...
@@ -39,10 +44,18 @@ class HFBertLayerPolicy(TransformerPolicy):
             HFBertLayerPolicy._orig_layer_class = None

     def get_hidden_heads(self):
+        if self.pre_attn_norm:
+            attention_layernorm = self.client_module.PostAttentionLayerNorm
+        else:
+            attention_layernorm = self.client_module.attention.output.LayerNorm
         return self.client_module.attention.self.query.weight.shape[1], \
-               self.client_module.attention.self.num_attention_heads
+               self.client_module.attention.self.num_attention_heads, \
+               attention_layernorm.eps

-    def attention(self):
+    def get_q_k_v(self):
+        return None
+
+    def attention(self, enable_training=False):
         qw = self.client_module.attention.self.query.weight
         qb = self.client_module.attention.self.query.bias
         kw = self.client_module.attention.self.key.weight
...
@@ -50,8 +63,8 @@ class HFBertLayerPolicy(TransformerPolicy):
         vw = self.client_module.attention.self.value.weight
         vb = self.client_module.attention.self.value.bias

-        qkvw = Parameter(torch.cat((qw, kw, vw), dim=0), requires_grad=False)
-        qkvb = Parameter(torch.cat((qb, kb, vb), dim=0), requires_grad=False)
+        qkvw = Parameter(torch.cat((qw, kw, vw), dim=0), requires_grad=enable_training)
+        qkvb = Parameter(torch.cat((qb, kb, vb), dim=0), requires_grad=enable_training)

         return qkvw, \
                qkvb, \
...
@@ -79,3 +92,6 @@ class HFBertLayerPolicy(TransformerPolicy):
                attention_layernorm.bias, \
                transformer_layernorm.weight, \
                transformer_layernorm.bias

+    def get_lora_params(self):
+        return []
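
Note: the attention() change replaces the hard-coded requires_grad=False with the new enable_training flag, so a fused QKV parameter built at injection time can still receive gradients during training. A self-contained sketch of that construction, separate from the commit; the sizes are illustrative assumptions:

    import torch
    from torch.nn.parameter import Parameter

    hidden = 4
    qw, kw, vw = (torch.randn(hidden, hidden) for _ in range(3))

    for enable_training in (False, True):
        # Concatenate the three projections along the output dimension, as the policy does.
        qkvw = Parameter(torch.cat((qw, kw, vw), dim=0), requires_grad=enable_training)
        print(qkvw.shape, qkvw.requires_grad)   # torch.Size([12, 4]) False, then True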
deepspeed/module_inject/containers/bloom.py  (view file @ 5bcc463d)

-'''Copyright The Microsoft DeepSpeed Team'''
+# Copyright (c) Microsoft Corporation.
+# SPDX-License-Identifier: Apache-2.0
+
+# DeepSpeed Team

 from .base import *
 from .features.meta_tensor import MetaTensorContainer
...
@@ -7,10 +10,13 @@ from ..policy import TransformerPolicy
 from ..policy import transformer_param_names
 from ..policy import maybe_copy
+from ..policy import maybe_get_lora
+
+supported_models = {None}

 class DS_BloomContainer(MetaTensorContainer, BaseTransformerContainer):

     def __init__(self, **kwargs):
         super().__init__(**kwargs)
...
@@ -24,13 +30,9 @@ class DS_BloomContainer(MetaTensorContainer, BaseTransformerContainer):
         self.module.config.scale_attention = self.scale_attention
         return self.module

-    def attention_qkv_mp(self, mp_replace):
+    def attention_qkv_mp(self, mp_replace, reversed_dim=False):
         self.module.attention.attn_qkvw = mp_replace.copy(self.module.attention.attn_qkvw, self.qkvw)
         self.module.attention.attn_qkvb = mp_replace.copy(self.module.attention.attn_qkvb, self.qkvb)

     def load_params(self, module, sd, weight_quantizer, mp_replace, prefix):
         param_names = (
...
@@ -58,58 +60,39 @@ class DS_BloomContainer(MetaTensorContainer, BaseTransformerContainer):
                                megatron_v2=self.policy.is_megatron_v2,
                                split_qkv=self.policy.split_qkv)
         for i in range(2, 4):
             maybe_copy(module.attention, sd, weight_quantizer, mp_replace, transformer_param_names[i],
                        prefix + param_names[i])
         for i in range(4, 10):
             maybe_copy(module.mlp, sd, weight_quantizer, mp_replace, transformer_param_names[i],
                        prefix + param_names[i])
         for i in range(10, 12):
             maybe_copy(module, sd, weight_quantizer, mp_replace, transformer_param_names[i],
                        prefix + param_names[i])

 class BLOOMLayerPolicy(TransformerPolicy):
     _orig_layer_class = None

     def __init__(self, client_module, inference=True, use_load_prefix=True, split_qkv=False):
         super().__init__(inference, linear_layer=True, use_load_prefix=use_load_prefix, split_qkv=split_qkv)
         self.client_module = client_module
         try:
             import transformers
             BLOOMLayerPolicy._orig_layer_class = transformers.models.bloom.modeling_bloom.BloomBlock
+            global supported_models
+            supported_models.update({transformers.models.bloom.modeling_bloom.BloomModel})
         except Exception as e:
             print(f"WARNING! Setting BLOOMLayerPolicy._orig_layer_class to None due to Exception: {e}")
             BLOOMLayerPolicy._orig_layer_class = None

     def get_hidden_heads(self):
         return self.client_module.self_attention.hidden_size, \
-               self.client_module.self_attention.num_heads
+               self.client_module.self_attention.num_heads, \
+               self.client_module.input_layernorm.eps

-    def attention(self):
+    def get_q_k_v(self):
+        return None
+
+    def attention(self, enable_training=False):
         return self.client_module.self_attention.query_key_value.weight, \
                self.client_module.self_attention.query_key_value.bias, \
                self.client_module.self_attention.dense.weight, \
...
@@ -126,3 +109,14 @@ class BLOOMLayerPolicy(TransformerPolicy):
                self.client_module.post_attention_layernorm.bias, \
                self.client_module.input_layernorm.weight, \
                self.client_module.input_layernorm.bias

+    def get_lora_params(self):
+        all_lora_params = []
+        for p in [self.client_module.mlp.dense_h_to_4h, \
+                  self.client_module.mlp.dense_4h_to_h, \
+                  self.client_module.self_attention.query_key_value, \
+                  self.client_module.self_attention.dense]:
+            all_lora_params.append(maybe_get_lora(p))
+        return all_lora_params
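
Note: get_lora_params() walks a fixed list of projection modules and records whatever maybe_get_lora finds on each. maybe_get_lora itself lives in ..policy and is not shown in this commit; the stub below is only an assumed stand-in that illustrates the duck-typed lookup pattern, not the real helper:

    import torch.nn as nn

    def maybe_get_lora_stub(module):
        # Hypothetical stand-in for ..policy.maybe_get_lora: return the module's LoRA
        # state if it carries one, an empty list otherwise. The real helper may differ.
        return getattr(module, 'lora_right_weight', [])

    qkv, dense = nn.Linear(4, 12), nn.Linear(4, 4)
    all_lora_params = [maybe_get_lora_stub(p) for p in (qkv, dense)]
    print(all_lora_params)   # [[], []] when no LoRA wrapper is attached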
deepspeed/module_inject/containers/clip.py  (view file @ 5bcc463d)

-'''Copyright The Microsoft DeepSpeed Team'''
+# Copyright (c) Microsoft Corporation.
+# SPDX-License-Identifier: Apache-2.0
+
+# DeepSpeed Team

 from .base import *
 from deepspeed.model_implementations.transformers.ds_gpt import DeepSpeedGPTInference
...
@@ -8,6 +11,7 @@ from ..policy import TransformerPolicy

 class DS_CLIPContainer(BaseTransformerContainer):

     def __init__(self, **kwargs):
         super().__init__(**kwargs)
...
@@ -21,6 +25,7 @@ class DS_CLIPContainer(BaseTransformerContainer):

 class HFCLIPLayerPolicy(TransformerPolicy):

     def __init__(self, client_module, inference=False):
         super().__init__(inference, pre_attn_norm=True, scale_attention=True)
         self.client_module = client_module
...
@@ -35,7 +40,11 @@ class HFCLIPLayerPolicy(TransformerPolicy):
     def get_hidden_heads(self):
         return self.client_module.self_attn.q_proj.weight.shape[1], \
-               self.client_module.self_attn.num_heads
+               self.client_module.self_attn.num_heads, \
+               self.client_module.layer_norm1.eps

+    def get_q_k_v(self):
+        return None
+
     def attention(self):
         qw = self.client_module.self_attn.q_proj.weight
...
@@ -64,3 +73,6 @@ class HFCLIPLayerPolicy(TransformerPolicy):
                self.client_module.layer_norm2.bias, \
                self.client_module.layer_norm1.weight, \
                self.client_module.layer_norm1.bias

+    def get_lora_params(self):
+        return []
deepspeed/module_inject/containers/distil_bert.py  (view file @ 5bcc463d)

-'''Copyright The Microsoft DeepSpeed Team'''
+# Copyright (c) Microsoft Corporation.
+# SPDX-License-Identifier: Apache-2.0
+
+# DeepSpeed Team

 from .base import *
 from deepspeed.model_implementations.transformers.ds_bert import DeepSpeedBERTInference
...
@@ -8,6 +11,7 @@ from ..policy import TransformerPolicy

 class DS_DistilBERTContainer(BaseTransformerContainer):

     def __init__(self, **kwargs):
         super().__init__(**kwargs)
...
@@ -41,9 +45,13 @@ class HFDistilBertLayerPolicy(TransformerPolicy):
     def get_hidden_heads(self):
         return self.client_module.attention.q_lin.weight.shape[1], \
-               self.client_module.attention.n_heads
+               self.client_module.attention.n_heads, \
+               self.client_module.sa_layer_norm.eps

-    def attention(self):
+    def get_q_k_v(self):
+        return None
+
+    def attention(self, enable_training=False):
         qw = self.client_module.attention.q_lin.weight
         qb = self.client_module.attention.q_lin.bias
         kw = self.client_module.attention.k_lin.weight
...
@@ -51,8 +59,8 @@ class HFDistilBertLayerPolicy(TransformerPolicy):
         vw = self.client_module.attention.v_lin.weight
         vb = self.client_module.attention.v_lin.bias

-        qkvw = Parameter(torch.cat((qw, kw, vw), dim=0))
-        qkvb = Parameter(torch.cat((qb, kb, vb), dim=0))
+        qkvw = Parameter(torch.cat((qw, kw, vw), dim=0), requires_grad=enable_training)
+        qkvb = Parameter(torch.cat((qb, kb, vb), dim=0), requires_grad=enable_training)

         return qkvw, \
                qkvb, \
...
@@ -73,3 +81,6 @@ class HFDistilBertLayerPolicy(TransformerPolicy):
                attention_layernorm.bias, \
                transformer_layernorm.weight, \
                transformer_layernorm.bias

+    def get_lora_params(self):
+        return []
deepspeed/module_inject/containers/features/__init__.py  (view file @ 5bcc463d)

-'''Copyright The Microsoft DeepSpeed Team'''
+# Copyright (c) Microsoft Corporation.
+# SPDX-License-Identifier: Apache-2.0
+
+# DeepSpeed Team

 from .megatron import MegatronContainer
 from .meta_tensor import MetaTensorContainer
deepspeed/module_inject/containers/features/megatron.py  (view file @ 5bcc463d)

-'''Copyright The Microsoft DeepSpeed Team'''
+# Copyright (c) Microsoft Corporation.
+# SPDX-License-Identifier: Apache-2.0
+
+# DeepSpeed Team

 import torch
 from abc import ABC

 class MegatronContainer(ABC):

     def __init__(self, **kwargs):
         super().__init__(**kwargs)
         self.megatron_v2 = self.policy.is_megatron_v2

-    def transpose_qkv_alignment(self, x):
+    def _align_qkv_transposed(self, x):
         attention_head_size = x.shape[-1] // self.num_attention_heads
         new_x_shape = x.size()[:-1] + (self.num_attention_heads, attention_head_size)
         x_1 = x.view(*new_x_shape)
         (q, k, v) = torch.split(x_1, (x_1.shape[-1] // 3), dim=(x_1.dim() - 1))
         if len(q.shape) > 2:
             return torch.cat((q.reshape(q.shape[0], -1), k.reshape(q.shape[0], -1), v.reshape(q.shape[0], -1)),
                              dim=-1).reshape(x.shape)
         else:
             return torch.cat((q.reshape(-1), k.reshape(-1), v.reshape(-1)), dim=-1).reshape(x.shape)

+    def _align_qkv(self, x):
+        attention_head_size = x.shape[0] // self.num_attention_heads
+        new_x_shape = (self.num_attention_heads, attention_head_size) + x.size()[1:]
+        x_1 = x.view(*new_x_shape)
+        div_dim = len(x_1.size()) - 2 if len(x.shape) == 2 else -1
+        (q, k, v) = torch.split(x_1, (x_1.shape[div_dim] // 3), dim=div_dim)
+        if len(q.shape) > 2:
+            x.data.copy_(
+                torch.cat((q.reshape(-1, q.shape[-1]), k.reshape(-1, q.shape[-1]), v.reshape(-1, q.shape[-1])),
+                          dim=0).reshape(x.shape))
+        else:
+            x.data.copy_(torch.cat((q.reshape(-1), k.reshape(-1), v.reshape(-1)), dim=-1).reshape(x.shape))
+
+    def _align_merged_qkv(self):
+        if hasattr(self.qkvw, 'ds_id'):
+            from deepspeed.runtime.zero import GatheredParameters
+            from deepspeed.runtime.zero.partition_parameters import ZeroParamStatus
+            param_list = [self.qkvw, self.qkvb]
+            non_active_params = [param for param in param_list if (hasattr(param, 'ds_id') and \
+                                 param.ds_status == ZeroParamStatus.NOT_AVAILABLE)]
+            with GatheredParameters(non_active_params):
+                self._align_qkv(self.qkvw)
+                self._align_qkv(self.qkvb)
+        else:
+            self._align_qkv(self.qkvw)
+            self._align_qkv(self.qkvb)
+
+    def _partition_qkv(self, x):
+        q_k_v = torch.split(x, (x.shape[0] // 3), dim=0)
+        attention_head_size = q_k_v[0].shape[0] // self.num_attention_heads
+        new_x_shape = (self.num_attention_heads, attention_head_size) + x.size()[1:]
+        q, k, v = [data.view(*new_x_shape) for data in q_k_v]
+        if len(q.shape) > 2:
+            x.data.copy_(torch.cat((q, k, v), dim=-2).reshape(-1, q.shape[-1]))
+        else:
+            x.data.copy_(torch.cat((q, k, v), dim=-1).reshape(-1))
+
+    def _partition_merged_qkv(self):
+        if hasattr(self.qkvw, 'ds_id'):
+            from deepspeed.runtime.zero import GatheredParameters
+            from deepspeed.runtime.zero.partition_parameters import ZeroParamStatus
+            param_list = [self.qkvw, self.qkvb]
+            non_active_params = [param for param in param_list if (hasattr(param, 'ds_id') and \
+                                 param.ds_status == ZeroParamStatus.NOT_AVAILABLE)]
+            with GatheredParameters(non_active_params):
+                self._partition_qkv(self.qkvw)
+                self._partition_qkv(self.qkvb)
+        else:
+            self._partition_qkv(self.qkvw)
+            self._partition_qkv(self.qkvb)

     def transpose(self):
         super().transpose()
         if self.megatron_v2:
-            self.qkvw = torch.nn.parameter.Parameter(self.transpose_qkv_alignment(self.qkvw).contiguous())
-            self.qkvb = torch.nn.parameter.Parameter(self.transpose_qkv_alignment(self.qkvb).contiguous())
+            self.qkvw = torch.nn.parameter.Parameter(self._align_qkv_transposed(self.qkvw).contiguous())
+            self.qkvb = torch.nn.parameter.Parameter(self._align_qkv_transposed(self.qkvb).contiguous())
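
Note: _align_qkv converts a Megatron-style fused QKV tensor, in which each head's q/k/v chunks are interleaved along the output dimension, into the grouped [all-Q; all-K; all-V] layout DeepSpeed-Inference expects; _partition_qkv is the inverse. A small numeric sketch of the same view/split/cat steps, separate from the commit; the head count and sizes are illustrative assumptions:

    import torch

    num_heads, head_size, hidden = 2, 2, 4        # hidden == num_heads * head_size
    # Fused QKV weight: per head, q/k/v rows are interleaved along dim 0.
    x = torch.arange(3 * hidden * hidden, dtype=torch.float32).reshape(3 * hidden, hidden)

    x_1 = x.view(num_heads, (3 * hidden) // num_heads, hidden)   # group rows by head
    q, k, v = torch.split(x_1, x_1.shape[1] // 3, dim=1)         # per-head q/k/v chunks
    aligned = torch.cat((q.reshape(-1, hidden), k.reshape(-1, hidden), v.reshape(-1, hidden)), dim=0)
    print(aligned.shape)   # torch.Size([12, 4]): all Q rows, then all K rows, then all V rows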
deepspeed/module_inject/containers/features/meta_tensor.py  (view file @ 5bcc463d)

-'''Copyright The Microsoft DeepSpeed Team'''
+# Copyright (c) Microsoft Corporation.
+# SPDX-License-Identifier: Apache-2.0
+
+# DeepSpeed Team

 from abc import ABC, abstractmethod

 class MetaTensorContainer(ABC):

     def __init__(self, **kwargs):
         super().__init__(**kwargs)
         self.is_meta = False
         self.ckpt_load_enabled = True

-    def initialize_tensors(self):
-        super().initialize_tensors()
+    def initialize_tensors(self, enable_training=False):
+        super().initialize_tensors(enable_training=enable_training)
         self.is_meta = self.qkvw.is_meta

-    def apply_tensor_parallelism(self, mp_replace):
+    def apply_tensor_parallelism(self, mp_replace=None, mp_group=None, tp_size=None):
         if self.is_meta:
             if self.qkvb is None:
                 self.module.attention.attn_qkvb = None
             if self.dense_b is None:
                 self.module.attention.attn_ob = None
         else:
-            super().apply_tensor_parallelism(mp_replace)
+            super().apply_tensor_parallelism(mp_replace, mp_group, tp_size)

     def copy_data_to_new_module(self):
         if self.is_meta:
...
@@ -53,6 +57,5 @@ class MetaTensorContainer(ABC):
           of q, k, and v are stored together and needs to split in the
           DeepSpeed-Inference API.
        """
         raise NotImplementedError("A load_params() function must be defined in the model container \
             when inheriting the MetaTensorContainer feature")
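
Note: is_meta keys off PyTorch's meta device: a meta tensor carries shape and dtype but no storage, which lets the container build the module skeleton first and defer real weight copies to load_params(). A minimal sketch, separate from the commit; the shape is an illustrative assumption:

    import torch

    qkvw = torch.empty(12, 4, device='meta')
    print(qkvw.is_meta)              # True: only metadata, no data yet
    print(qkvw.shape, qkvw.dtype)    # shape/dtype are still available for planning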
deepspeed/module_inject/containers/gpt2.py  (view file @ 5bcc463d)

-'''Copyright The Microsoft DeepSpeed Team'''
+# Copyright (c) Microsoft Corporation.
+# SPDX-License-Identifier: Apache-2.0
+
+# DeepSpeed Team

 from .base import *
 from deepspeed.model_implementations.transformers.ds_gpt import DeepSpeedGPTInference
...
@@ -6,6 +9,7 @@ from ..policy import TransformerPolicy

 class DS_GPT2Container(BaseTransformerContainer):

     def __init__(self, **kwargs):
         super().__init__(**kwargs)
...
@@ -33,9 +37,13 @@ class HFGPT2LayerPolicy(TransformerPolicy):
     def get_hidden_heads(self):
         return self.client_module.attn.embed_dim, \
-               self.client_module.attn.num_heads
+               self.client_module.attn.num_heads, \
+               self.client_module.ln_1.eps

-    def attention(self):
+    def get_q_k_v(self):
+        return None
+
+    def attention(self, enable_training=False):
         return self.client_module.attn.c_attn.weight, \
                self.client_module.attn.c_attn.bias, \
                self.client_module.attn.c_proj.weight, \
...
@@ -52,3 +60,6 @@ class HFGPT2LayerPolicy(TransformerPolicy):
                self.client_module.ln_2.bias, \
                self.client_module.ln_1.weight, \
                self.client_module.ln_1.bias

+    def get_lora_params(self):
+        return []
deepspeed/module_inject/containers/gptj.py  (view file @ 5bcc463d)

-'''Copyright The Microsoft DeepSpeed Team'''
+# Copyright (c) Microsoft Corporation.
+# SPDX-License-Identifier: Apache-2.0
+
+# DeepSpeed Team

 from .base import *
 from .features.meta_tensor import MetaTensorContainer
...
@@ -10,8 +13,11 @@ from ..policy import transformer_param_names
 from ..policy import maybe_copy
 from ..policy import maybe_copy_qkv
+from ..policy import maybe_get_lora

 class DS_GPTJContainer(MetaTensorContainer, BaseTransformerContainer):

     def __init__(self, **kwargs):
         super().__init__(**kwargs)
...
@@ -36,36 +42,20 @@ class DS_GPTJContainer(MetaTensorContainer, BaseTransformerContainer):
                        'ln_1.weight', \
                        'ln_1.bias')

         maybe_copy_qkv(module.attention, sd, weight_quantizer, mp_replace, 'attn_qkvw',
                        [prefix + param_names[0], prefix + param_names[1], prefix + param_names[2]],
                        split_qkv=self.policy.split_qkv)
         for i in range(3, 4):
             maybe_copy(module.attention, sd, weight_quantizer, mp_replace, transformer_param_names[i - 1],
                        prefix + param_names[i])
         for i in range(4, 8):
             maybe_copy(module.mlp, sd, weight_quantizer, mp_replace, transformer_param_names[i],
                        prefix + param_names[i])
         for i in range(8, 10):
             maybe_copy(module, sd, weight_quantizer, mp_replace, transformer_param_names[i + 2],
                        prefix + param_names[i])
...
@@ -82,15 +72,24 @@ class HFGPTJLayerPolicy(TransformerPolicy):
             HFGPTJLayerPolicy._orig_layer_class = None

     def get_hidden_heads(self):
-        return self.client_module.attn.q_proj.weight.shape[1], \
-               self.client_module.attn.num_attention_heads
+        return self.client_module.attn.embed_dim, \
+               self.client_module.attn.num_attention_heads, \
+               self.client_module.ln_1.eps

-    def attention(self):
+    def get_q_k_v(self):
+        return self.client_module.attn.q_proj.weight, \
+               None, \
+               self.client_module.attn.k_proj.weight, \
+               None, \
+               self.client_module.attn.v_proj.weight, \
+               None
+
+    def attention(self, enable_training=False):
         qw = self.client_module.attn.q_proj.weight
         kw = self.client_module.attn.k_proj.weight
         vw = self.client_module.attn.v_proj.weight

-        qkvw = Parameter(torch.cat((qw, kw, vw), dim=0), requires_grad=False)
+        qkvw = Parameter(torch.cat((qw, kw, vw), dim=0), requires_grad=enable_training)

         return qkvw, \
                None, \
...
@@ -108,3 +107,16 @@ class HFGPTJLayerPolicy(TransformerPolicy):
                None, \
                self.client_module.ln_1.weight, \
                self.client_module.ln_1.bias

+    def get_lora_params(self):
+        all_lora_params = []
+        for p in [self.client_module.mlp.fc_in, \
+                  self.client_module.mlp.fc_out, \
+                  self.client_module.attn.q_proj, \
+                  self.client_module.attn.k_proj, \
+                  self.client_module.attn.v_proj, \
+                  self.client_module.attn.out_proj, \
+                  ]:
+            all_lora_params.append(maybe_get_lora(p))
+        return all_lora_params
deepspeed/module_inject/containers/gptneo.py  (view file @ 5bcc463d)

-'''Copyright The Microsoft DeepSpeed Team'''
+# Copyright (c) Microsoft Corporation.
+# SPDX-License-Identifier: Apache-2.0
+
+# DeepSpeed Team

 from .base import *
 from .features.meta_tensor import MetaTensorContainer
...
@@ -10,8 +13,11 @@ from ..policy import transformer_param_names
 from ..policy import maybe_copy
 from ..policy import maybe_copy_qkv
+from ..policy import maybe_get_lora

 class DS_GPTNEOContainer(MetaTensorContainer, BaseTransformerContainer):

     def __init__(self, **kwargs):
         super().__init__(**kwargs)
...
@@ -39,40 +45,25 @@ class DS_GPTNEOContainer(MetaTensorContainer, BaseTransformerContainer):
                        'ln_1.weight', \
                        'ln_1.bias')

         maybe_copy_qkv(module.attention, sd, weight_quantizer, mp_replace, 'attn_qkvw',
                        [prefix + param_names[0], prefix + param_names[1], prefix + param_names[2]],
                        split_qkv=self.policy.split_qkv)
         for i in range(3, 5):
             maybe_copy(module.attention, sd, weight_quantizer, mp_replace, transformer_param_names[i - 1],
                        prefix + param_names[i])
         for i in range(5, 11):
             maybe_copy(module.mlp, sd, weight_quantizer, mp_replace, transformer_param_names[i - 1],
                        prefix + param_names[i])
         for i in range(11, 13):
             maybe_copy(module, sd, weight_quantizer, mp_replace, transformer_param_names[i - 1],
                        prefix + param_names[i])

 class HFGPTNEOLayerPolicy(TransformerPolicy):

     def __init__(self, client_module, inference=True):
         super().__init__(inference, scale_attention=False)
         self.client_module = client_module
...
@@ -83,15 +74,24 @@ class HFGPTNEOLayerPolicy(TransformerPolicy):
             HFGPTNEOLayerPolicy._orig_layer_class = None

     def get_hidden_heads(self):
-        return self.client_module.attn.attention.q_proj.weight.shape[1], \
-               self.client_module.attn.attention.num_heads
+        return self.client_module.attn.attention.embed_dim, \
+               self.client_module.attn.attention.num_heads, \
+               self.client_module.ln_1.eps

-    def attention(self):
+    def get_q_k_v(self):
+        return self.client_module.attn.attention.q_proj.weight, \
+               None, \
+               self.client_module.attn.attention.k_proj.weight, \
+               None, \
+               self.client_module.attn.attention.v_proj.weight, \
+               None
+
+    def attention(self, enable_training=False):
         qw = self.client_module.attn.attention.q_proj.weight
         kw = self.client_module.attn.attention.k_proj.weight
         vw = self.client_module.attn.attention.v_proj.weight

-        qkvw = Parameter(torch.cat((qw, kw, vw), dim=0), requires_grad=False)
+        qkvw = Parameter(torch.cat((qw, kw, vw), dim=0), requires_grad=enable_training)

         return qkvw, \
                None, \
...
@@ -109,3 +109,16 @@ class HFGPTNEOLayerPolicy(TransformerPolicy):
                self.client_module.ln_2.bias, \
                self.client_module.ln_1.weight, \
                self.client_module.ln_1.bias

+    def get_lora_params(self):
+        all_lora_params = []
+        for p in [self.client_module.mlp.c_fc, \
+                  self.client_module.mlp.c_proj, \
+                  self.client_module.attn.attention.q_proj, \
+                  self.client_module.attn.attention.k_proj, \
+                  self.client_module.attn.attention.v_proj, \
+                  self.client_module.attn.attention.out_proj, \
+                  ]:
+            all_lora_params.append(maybe_get_lora(p))
+        return all_lora_params
deepspeed/module_inject/containers/gptneox.py  (view file @ 5bcc463d)

-'''Copyright The Microsoft DeepSpeed Team'''
+# Copyright (c) Microsoft Corporation.
+# SPDX-License-Identifier: Apache-2.0
+
+# DeepSpeed Team

 from .base import *
 from .features.meta_tensor import MetaTensorContainer
...
@@ -10,10 +13,11 @@ from ..policy import transformer_param_names
 from ..policy import maybe_copy
 from packaging import version as pkg_version
+from ..policy import maybe_get_lora

 class DS_GPTNEOXContainer(MetaTensorContainer, MegatronContainer, BaseTransformerContainer):

     def __init__(self, **kwargs):
         super().__init__(**kwargs)
...
@@ -57,26 +61,13 @@ class DS_GPTNEOXContainer(MetaTensorContainer, MegatronContainer, BaseTransformerContainer):
                            split_qkv=self.policy.split_qkv,
                            heads=self.policy.client_module.attention.num_attention_heads)
         for i in range(2, 4):
             maybe_copy(module.attention, sd, weight_quantizer, mp_replace, transformer_param_names[i],
                        prefix + param_names[i])
         for i in range(4, 10):
             maybe_copy(module.mlp, sd, weight_quantizer, mp_replace, transformer_param_names[i],
                        prefix + param_names[i])
         for i in range(10, 12):
             maybe_copy(module, sd, weight_quantizer, mp_replace, transformer_param_names[i],
                        prefix + param_names[i])

 class GPTNEOXLayerPolicy(TransformerPolicy):
...
@@ -102,10 +93,14 @@ class GPTNEOXLayerPolicy(TransformerPolicy):
         else:
             attention = self.client_module.self_attention

-        return self.client_module.attention.query_key_value.weight.shape[1], \
-               self.client_module.attention.num_attention_heads
+        return self.client_module.attention.hidden_size, \
+               self.client_module.attention.num_attention_heads, \
+               self.client_module.input_layernorm.eps

+    def get_q_k_v(self):
+        return None
+
-    def attention(self):
+    def attention(self, enable_training=False):
         if GPTNEOXLayerPolicy.version == 0:
             attention = self.client_module.attention
         else:
...
@@ -127,3 +122,19 @@ class GPTNEOXLayerPolicy(TransformerPolicy):
                self.client_module.post_attention_layernorm.bias, \
                self.client_module.input_layernorm.weight, \
                self.client_module.input_layernorm.bias

+    def get_lora_params(self):
+        if GPTNEOXLayerPolicy.version == 0:
+            attention = self.client_module.attention
+        else:
+            attention = self.client_module.self_attention
+
+        all_lora_params = []
+        for p in [self.client_module.mlp.dense_h_to_4h, \
+                  self.client_module.mlp.dense_4h_to_h, \
+                  attention.query_key_value, \
+                  attention.dense]:
+            all_lora_params.append(maybe_get_lora(p))
+        return all_lora_params
deepspeed/module_inject/containers/megatron_gpt.py
View file @
5bcc463d
'''Copyright The Microsoft DeepSpeed Team'''
# Copyright (c) Microsoft Corporation.
# SPDX-License-Identifier: Apache-2.0
# DeepSpeed Team
from
.base
import
*
from
.features.megatron
import
MegatronContainer
...
...
@@ -9,6 +12,7 @@ from packaging import version as pkg_version
 class DS_MegatronGPTContainer(MegatronContainer, BaseTransformerContainer):
+
     def __init__(self, **kwargs):
         super().__init__(**kwargs)
...
...
@@ -36,9 +40,7 @@ class MegatronLayerPolicy(TransformerPolicy):
     use_mup = False

     def __init__(self, client_module, inference=True):
-        super().__init__(inference,
-                         megatron_v2=MegatronLayerPolicy.megatron_v2,
-                         use_mup=MegatronLayerPolicy.use_mup)
+        super().__init__(inference, megatron_v2=MegatronLayerPolicy.megatron_v2, use_mup=MegatronLayerPolicy.use_mup)
         self.client_module = client_module
         # we use megatron version to differentiate between the old and new
         # megatron-lm source code
...
...
@@ -54,9 +56,13 @@ class MegatronLayerPolicy(TransformerPolicy):
     def get_hidden_heads(self):
         return self.client_module.attention.query_key_value.weight.shape[1], \
-               self.client_module.attention.num_attention_heads
+               self.client_module.attention.num_attention_heads, \
+               self.client_module.input_layernorm.eps
+
+    def get_q_k_v(self):
+        return None

-    def attention(self):
+    def attention(self, enable_training=False):
         if self.inference:
             if MegatronLayerPolicy.version == 0:
                 attention = self.client_module.attention
...
...
@@ -104,3 +110,6 @@ class MegatronLayerPolicy(TransformerPolicy):
                 self.client_module.post_attention_layernorm.bias, \
                 self.client_module.input_layernorm.weight, \
                 self.client_module.input_layernorm.bias
+
+    def get_lora_params(self):
+        return []
deepspeed/module_inject/containers/megatron_gpt_moe.py View file @ 5bcc463d
-'''Copyright The Microsoft DeepSpeed Team'''
+# Copyright (c) Microsoft Corporation.
+# SPDX-License-Identifier: Apache-2.0
+
+# DeepSpeed Team
from .base import *
from .base_moe import *
...
...
@@ -10,6 +13,7 @@ from packaging import version as pkg_version
 class DS_MegatronGPTMoEContainer(MegatronContainer, BaseTransformerMoEContainer):
+
     def __init__(self, policy, config, model_config, layer_id):
         super().__init__(policy, config, model_config, layer_id)
...
...
deepspeed/module_inject/containers/opt.py View file @ 5bcc463d
-'''Copyright The Microsoft DeepSpeed Team'''
+# Copyright (c) Microsoft Corporation.
+# SPDX-License-Identifier: Apache-2.0
+
+# DeepSpeed Team
from .base import *
from .features.meta_tensor import MetaTensorContainer
...
...
@@ -9,10 +12,12 @@ from ..policy import TransformerPolicy
 from ..policy import transformer_param_names
 from ..policy import maybe_copy
 from ..policy import maybe_copy_qkv
+from ..policy import maybe_get_lora
 from deepspeed.utils.types import ActivationFuncType


 class DS_OPTContainer(MetaTensorContainer, BaseTransformerContainer):
+
     def __init__(self, **kwargs):
         super().__init__(**kwargs)
...
...
@@ -50,32 +55,16 @@ class DS_OPTContainer(MetaTensorContainer, BaseTransformerContainer):
                           weight_quantizer,
                           mp_replace,
                           transformer_param_names[i // 3],
                           [prefix + param_names[i], prefix + param_names[i + 1], prefix + param_names[i + 2]],
                           split_qkv=self.policy.split_qkv)
        for i in range(6, 8):
            maybe_copy(module.attention, sd, weight_quantizer, mp_replace, transformer_param_names[i - 4],
                       prefix + param_names[i])
        for i in range(8, 14):
            maybe_copy(module.mlp, sd, weight_quantizer, mp_replace, transformer_param_names[i - 4],
                       prefix + param_names[i])
        for i in range(14, 16):
            maybe_copy(module, sd, weight_quantizer, mp_replace, transformer_param_names[i - 4],
                       prefix + param_names[i])
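OPT stores q, k and v as three separate checkpoint tensors, so the QKV loop walks param_names in strides of three and hands maybe_copy_qkv a list of three source names per fused destination. Under the same simplifications as above (no quantization, no model-parallel partitioning, stacked rather than interleaved layout), the fusion step is essentially:

    import torch

    # Hypothetical stand-in for maybe_copy_qkv; src_names is
    # [q_name, k_name, v_name] from the checkpoint state dict.
    def copy_qkv_sketch(dst_module, sd, dst_name, src_names):
        # Stack the three projections' rows into one fused
        # [3 * out_features, in_features] tensor.
        fused = torch.cat([sd[n] for n in src_names], dim=0)
        getattr(dst_module, dst_name).data.copy_(fused)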
...
...
@@ -83,27 +72,40 @@ class HFOPTLayerPolicy(TransformerPolicy):
     _orig_layer_class = None

     def __init__(self, client_module, inference=True, use_load_prefix=True):
-        super().__init__(inference,
-                         linear_layer=True,
-                         mlp_act_func_type=ActivationFuncType.ReLU,
-                         pre_attn_norm=True,
-                         use_load_prefix=use_load_prefix)
+        super().__init__(inference, linear_layer=True, pre_attn_norm=True, use_load_prefix=use_load_prefix)
         self.client_module = client_module
         try:
             import transformers
             HFOPTLayerPolicy._orig_layer_class = transformers.models.opt.modeling_opt.OPTDecoderLayer
             if isinstance(TransformerPolicy.hf_model_config, transformers.models.opt.configuration_opt.OPTConfig):
                 self.pre_attn_norm = TransformerPolicy.hf_model_config.do_layer_norm_before
         except:
             HFOPTLayerPolicy._orig_layer_class = None
+
+        if hasattr(TransformerPolicy, "hf_model_config") and hasattr(TransformerPolicy.hf_model_config,
+                                                                     "activation_function"):
+            if TransformerPolicy.hf_model_config.activation_function == "relu":
+                self.mlp_act_func_type = ActivationFuncType.ReLU
+            elif TransformerPolicy.hf_model_config.activation_function in ["gelu", "gelu_new"]:
+                self.mlp_act_func_type = ActivationFuncType.GELU
+            else:
+                raise ValueError("Unsupported activation function: {}".format(
+                    TransformerPolicy.hf_model_config.activation_function))
+        else:
+            self.mlp_act_func_type = ActivationFuncType.ReLU  # default
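Deriving mlp_act_func_type from the Hugging Face config instead of hard-coding ReLU is what lets one policy serve both ReLU OPT checkpoints and GELU variants. The selection logic can be exercised standalone; SimpleNamespace stands in for a real OPTConfig here:

    from types import SimpleNamespace
    from deepspeed.utils.types import ActivationFuncType

    cfg = SimpleNamespace(activation_function="gelu_new")  # stand-in config
    if cfg.activation_function == "relu":
        act = ActivationFuncType.ReLU
    elif cfg.activation_function in ["gelu", "gelu_new"]:
        act = ActivationFuncType.GELU
    assert act == ActivationFuncType.GELU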
     def get_hidden_heads(self):
         return self.client_module.self_attn.embed_dim, \
-               self.client_module.self_attn.num_heads
+               self.client_module.self_attn.num_heads, \
+               self.client_module.self_attn_layer_norm.eps

-    def attention(self):
+    def get_q_k_v(self):
+        return self.client_module.self_attn.q_proj.weight, \
+               self.client_module.self_attn.q_proj.bias, \
+               self.client_module.self_attn.k_proj.weight, \
+               self.client_module.self_attn.k_proj.bias, \
+               self.client_module.self_attn.v_proj.weight, \
+               self.client_module.self_attn.v_proj.bias
+
+    def attention(self, enable_training=False):
         qw = self.client_module.self_attn.q_proj.weight
         qb = self.client_module.self_attn.q_proj.bias
...
...
@@ -113,9 +115,8 @@ class HFOPTLayerPolicy(TransformerPolicy):
         vw = self.client_module.self_attn.v_proj.weight
         vb = self.client_module.self_attn.v_proj.bias

-        qkvw = Parameter(torch.cat((qw, kw, vw), dim=0), requires_grad=False)
-        qkvb = Parameter(torch.cat((qb, kb, vb), dim=0), requires_grad=False)
+        qkvw = Parameter(torch.cat((qw, kw, vw), dim=0), requires_grad=enable_training)
+        qkvb = Parameter(torch.cat((qb, kb, vb), dim=0), requires_grad=enable_training)

         return qkvw, \
                qkvb, \
                self.client_module.self_attn.out_proj.weight, \
...
...
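The only change in the fusion itself is requires_grad=enable_training replacing the hard-coded False, so the fused tensors stay frozen for inference but remain trainable when the policy is reused for fine-tuning. The concatenation is plain torch.cat along the output dimension, e.g. with toy shapes:

    import torch
    from torch.nn.parameter import Parameter

    # Three [out, in] projections fused into one [3 * out, in] weight.
    qw, kw, vw = torch.randn(4, 8), torch.randn(4, 8), torch.randn(4, 8)
    qkvw = Parameter(torch.cat((qw, kw, vw), dim=0), requires_grad=False)
    assert qkvw.shape == (12, 8)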
@@ -132,3 +133,16 @@ class HFOPTLayerPolicy(TransformerPolicy):
                self.client_module.final_layer_norm.bias, \
                self.client_module.self_attn_layer_norm.weight, \
                self.client_module.self_attn_layer_norm.bias
+
+    def get_lora_params(self):
+        all_lora_params = []
+        for p in [self.client_module.fc1, \
+                  self.client_module.fc2, \
+                  self.client_module.self_attn.q_proj, \
+                  self.client_module.self_attn.k_proj, \
+                  self.client_module.self_attn.v_proj, \
+                  self.client_module.self_attn.out_proj, \
+                  ]:
+            all_lora_params.append(maybe_get_lora(p))
+        return all_lora_params
deepspeed/module_inject/containers/unet.py View file @ 5bcc463d
-'''
-Copyright 2022 The Microsoft DeepSpeed Team
-'''
+# Copyright (c) Microsoft Corporation.
+# SPDX-License-Identifier: Apache-2.0
+
+# DeepSpeed Team
import torch
from torch.nn.parameter import Parameter
...
...
@@ -9,6 +11,7 @@ from ...model_implementations.diffusers.unet import DSUNet
 class UNetPolicy(DSPolicy):
+
     def __init__(self):
         super().__init__()
         try:
...
...
deepspeed/module_inject/containers/vae.py View file @ 5bcc463d
-'''
-Copyright 2022 The Microsoft DeepSpeed Team
-'''
+# Copyright (c) Microsoft Corporation.
+# SPDX-License-Identifier: Apache-2.0
+
+# DeepSpeed Team
from ..policy import DSPolicy
from ...model_implementations.diffusers.vae import DSVAE
class VAEPolicy(DSPolicy):

    def __init__(self):
        super().__init__()
        try:
...
...
deepspeed/module_inject/inject.py View file @ 5bcc463d
-'''Copyright The Microsoft DeepSpeed Team'''
+# Copyright (c) Microsoft Corporation.
+# SPDX-License-Identifier: Apache-2.0
+
+# DeepSpeed Team
import copy
import torch
from deepspeed.ops.transformer import DeepSpeedTransformerLayer, DeepSpeedTransformerConfig
def module_inject(layer_obj, model, config, micro_batch_size, max_seq_length, seed, preln, fp16=True):
    for name, child in model.named_children():
        if isinstance(child, layer_obj):
            print('REPLACING BertLayer')

            cuda_config = DeepSpeedTransformerConfig(batch_size=micro_batch_size,
                                                     max_seq_length=max_seq_length,
                                                     hidden_size=config.hidden_size,
                                                     heads=config.num_attention_heads,
                                                     attn_dropout_ratio=config.attention_probs_dropout_prob,
                                                     hidden_dropout_ratio=config.hidden_dropout_prob,
                                                     num_hidden_layers=config.num_hidden_layers,
                                                     initializer_range=config.initializer_range,
                                                     seed=seed,
                                                     fp16=fp16,
                                                     pre_layer_norm=preln)

            new_module = DeepSpeedTransformerLayer(cuda_config)
...
...
@@ -71,14 +66,7 @@ def module_inject(layer_obj,
            setattr(model, name, copy.deepcopy(new_module))
        else:
            module_inject(layer_obj, child, config, micro_batch_size, max_seq_length, seed, preln, fp16)

    return model
...
...
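module_inject recurses through named_children and swaps every instance of layer_obj for a DeepSpeedTransformerLayer configured from the model's own config. A usage sketch against a Hugging Face BERT model (assumes transformers is installed; the parameter values are arbitrary):

    from transformers import BertModel
    from transformers.models.bert.modeling_bert import BertLayer
    from deepspeed.module_inject.inject import module_inject

    # Replace every BertLayer in-place with a fused DeepSpeed kernel layer.
    model = BertModel.from_pretrained("bert-base-uncased")
    model = module_inject(BertLayer, model, model.config,
                          micro_batch_size=8, max_seq_length=128,
                          seed=42, preln=False, fp16=True)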
deepspeed/module_inject/layers.py View file @ 5bcc463d
-'''Copyright The Microsoft DeepSpeed Team'''
+# Copyright (c) Microsoft Corporation.
+# SPDX-License-Identifier: Apache-2.0
+
+# DeepSpeed Team
import torch
from deepspeed import comm as dist
...
...
@@ -10,6 +13,7 @@ from deepspeed.accelerator import get_accelerator
 class LinearAllreduce(nn.Module):
+
     def __init__(self, weight, bias=None, mp_group=None):
         super(LinearAllreduce, self).__init__()
         self.weight = weight
...
...
@@ -26,6 +30,7 @@ class LinearAllreduce(nn.Module):
 class LinearLayer(nn.Module):
+
     def __init__(self, weight_shape=None, dtype=torch.half, weight=None, bias=None):
         super(LinearLayer, self).__init__()
         if weight is not None:
...
...
@@ -33,9 +38,7 @@ class LinearLayer(nn.Module):
            self.bias = bias
        else:
            self.weight = Parameter(
                torch.empty(weight_shape, dtype=dtype, device=get_accelerator().current_device_name()))
            self.bias = Parameter(torch.empty(weight_shape[0],
...
...
@@ -51,26 +54,35 @@ class LinearLayer(nn.Module):
 class Normalize(nn.Module):
-    def __init__(self, dim, dtype=torch.float, eps=1e-5):
+
+    def __init__(self, dim=None, dtype=torch.float, eps=1e-5, weight=None, bias=None):
         super(Normalize, self).__init__()
-        self.norm = nn.LayerNorm(dim, eps=eps).to(dtype).to(get_accelerator().current_device_name())
-        self.weight = self.norm.weight
-        self.bias = self.norm.bias
+        if weight is not None:
+            self.weight = weight
+            self.bias = bias
+        else:
+            self.norm = nn.LayerNorm(dim, eps=eps).to(dtype).to(get_accelerator().current_device_name())
+            self.weight = self.norm.weight
+            self.bias = self.norm.bias
+
+        self.eps = eps

     def forward(self, input):
-        return self.norm(input)
+        return nn.functional.layer_norm(input, input.shape[-1:], self.weight, self.bias, eps=self.eps)
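Both Normalize construction paths now funnel into the same nn.functional.layer_norm call: one allocates a fresh nn.LayerNorm and borrows its parameters, the other runs directly against preloaded tensors. The equivalence is easy to check with plain torch (shapes arbitrary):

    import torch
    import torch.nn as nn

    x = torch.randn(2, 16)
    ln = nn.LayerNorm(16, eps=1e-5)
    out_module = ln(x)                          # fresh-allocation path
    out_functional = nn.functional.layer_norm(  # preloaded-weight path
        x, x.shape[-1:], ln.weight, ln.bias, eps=1e-5)
    assert torch.allclose(out_module, out_functional)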
 class EmbeddingLayer(nn.Module):
-    def __init__(self, weight_shape, dtype=torch.half):
+
+    def __init__(self, weight_shape=None, dtype=torch.half, weight=None, bias=None):
         super(EmbeddingLayer, self).__init__()
-        self.weight = Parameter(
-            torch.empty(weight_shape[0], weight_shape[1], dtype=dtype,
-                        device=get_accelerator().current_device_name()))
+        if weight is None:
+            self.weight = Parameter(
+                torch.empty(weight_shape[0], weight_shape[1], dtype=dtype,
+                            device=get_accelerator().current_device_name()))
+        else:
+            self.weight = weight

     def forward(self, input):
         return F.embedding(input, self.weight)
...
...
@@ -80,20 +92,19 @@ class OPTEmbedding(EmbeddingLayer):
     """
     This module learns positional embeddings up to a fixed maximum size.
     """
-    def __init__(self, weight_shape):
+
+    def __init__(self, weight_shape=None, weight=None, bias=None):
         # OPT is set up so that if padding_idx is specified then offset the embedding ids by 2
         # and adjust num_embeddings appropriately. Other models don't have this hack
         self.offset = 2
-        super().__init__(weight_shape)
+        super().__init__(weight_shape, weight=weight)

     def forward(self, attention_mask: torch.LongTensor, past_key_values_length: int = 0):
         """`input_ids_shape` is expected to be [bsz x seqlen]."""
         attention_mask = attention_mask.long()

         # create positions depending on attention_mask
         positions = (torch.cumsum(attention_mask, dim=1).type_as(attention_mask) * attention_mask).long() - 1

         # cut positions if `past_key_values_length` is > 0
         positions = positions[:, past_key_values_length:]
...
...
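OPTEmbedding derives position ids from the attention mask rather than from raw sequence indices: the cumulative sum over the mask, multiplied by the mask and shifted by one, assigns every padding slot position -1 and numbers real tokens from 0 (the embedding lookup then adds self.offset). A worked example:

    import torch

    mask = torch.tensor([[0, 0, 1, 1, 1]])  # two left-pad slots, three tokens
    positions = (torch.cumsum(mask, dim=1).type_as(mask) * mask).long() - 1
    print(positions)  # tensor([[-1, -1,  0,  1,  2]])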