Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
chenpangpang
transformers
Commits
ffbf989f
Unverified
Commit
ffbf989f
authored
Sep 20, 2023
by
Sourab Mangrulkar
Committed by
GitHub
Sep 20, 2023
Browse files
DeepSpeed ZeRO-3 handling when resizing embedding layers (#26259)
* fix failing deepspeed slow tests * fixes
parent
39df4eca
Changes
1
Show whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
53 additions
and
60 deletions
+53
-60
src/transformers/modeling_utils.py
src/transformers/modeling_utils.py
+53
-60
No files found.
src/transformers/modeling_utils.py
View file @
ffbf989f
...
...
@@ -1550,7 +1550,7 @@ class PreTrainedModel(nn.Module, ModuleUtilsMixin, GenerationMixin, PushToHubMix
else
:
old_num_tokens
,
old_embedding_dim
=
old_embeddings
.
weight
.
size
()
if
old_num_tokens
==
new_num_tokens
:
if
old_num_tokens
==
new_num_tokens
and
not
is_deepspeed_zero3_enabled
()
:
return
old_embeddings
if
not
isinstance
(
old_embeddings
,
nn
.
Embedding
):
...
...
@@ -1560,13 +1560,12 @@ class PreTrainedModel(nn.Module, ModuleUtilsMixin, GenerationMixin, PushToHubMix
f
"
{
nn
.
Embedding
}
."
)
# numbers of tokens to copy
n
=
min
(
old_num_tokens
,
new_num_tokens
)
if
is_deepspeed_zero3_enabled
():
import
deepspeed
with
deepspeed
.
zero
.
Init
(
config_dict_or_path
=
deepspeed_config
()):
# Build new embeddings
# When using DeepSpeed ZeRO-3, we shouldn't create new embeddings with DeepSpeed init
# because the shape of the new embedding layer is used across various modeling files
# as well as to update config vocab size. Shape will be 0 when using DeepSpeed init leading
# to errors when training.
new_embeddings
=
nn
.
Embedding
(
new_num_tokens
,
old_embedding_dim
,
...
...
@@ -1574,26 +1573,21 @@ class PreTrainedModel(nn.Module, ModuleUtilsMixin, GenerationMixin, PushToHubMix
dtype
=
old_embeddings
.
weight
.
dtype
,
)
params
=
[
old_embeddings
.
weight
,
new_embeddings
.
weight
]
with
deepspeed
.
zero
.
GatheredParameters
(
params
,
modifier_rank
=
0
):
# initialize all new embeddings (in particular added tokens)
self
.
_init_weights
(
new_embeddings
)
# Copy token embeddings from the previous weights
new_embeddings
.
weight
.
data
[:
n
,
:]
=
old_embeddings
.
weight
.
data
[:
n
,
:]
else
:
# Build new embeddings
new_embeddings
=
nn
.
Embedding
(
new_num_tokens
,
old_embedding_dim
,
device
=
old_embeddings
.
weight
.
device
,
dtype
=
old_embeddings
.
weight
.
dtype
,
)
# initialize all new embeddings (in particular added tokens)
self
.
_init_weights
(
new_embedding
s
)
# numbers of tokens to copy
n
=
min
(
old_num_tokens
,
new_num_token
s
)
# Copy token embeddings from the previous weights
if
is_deepspeed_zero3_enabled
():
import
deepspeed
params
=
[
old_embeddings
.
weight
,
new_embeddings
.
weight
]
with
deepspeed
.
zero
.
GatheredParameters
(
params
,
modifier_rank
=
0
):
new_embeddings
.
weight
.
data
[:
n
,
:]
=
old_embeddings
.
weight
.
data
[:
n
,
:]
else
:
new_embeddings
.
weight
.
data
[:
n
,
:]
=
old_embeddings
.
weight
.
data
[:
n
,
:]
return
new_embeddings
...
...
@@ -1636,7 +1630,7 @@ class PreTrainedModel(nn.Module, ModuleUtilsMixin, GenerationMixin, PushToHubMix
old_lm_head
.
weight
.
size
()
if
not
transposed
else
old_lm_head
.
weight
.
t
().
size
()
)
if
old_num_tokens
==
new_num_tokens
:
if
old_num_tokens
==
new_num_tokens
and
not
is_deepspeed_zero3_enabled
()
:
return
old_lm_head
if
not
isinstance
(
old_lm_head
,
nn
.
Linear
):
...
...
@@ -1650,39 +1644,40 @@ class PreTrainedModel(nn.Module, ModuleUtilsMixin, GenerationMixin, PushToHubMix
new_lm_head_shape
=
(
old_lm_head_dim
,
new_num_tokens
)
if
not
transposed
else
(
new_num_tokens
,
old_lm_head_dim
)
has_new_lm_head_bias
=
old_lm_head
.
bias
is
not
None
num_tokens_to_copy
=
min
(
old_num_tokens
,
new_num_tokens
)
# XXX: put the long block of code in a wrapper
if
is_deepspeed_zero3_enabled
():
import
deepspeed
with
deepspeed
.
zero
.
Init
(
config_dict_or_path
=
deepspeed_config
()):
# When using DeepSpeed ZeRO-3, we shouldn't create new embeddings with DeepSpeed init
# because the shape of the new embedding layer is used across various modeling files
# as well as to update config vocab size. Shape will be 0 when using DeepSpeed init leading
# to errors when training.
new_lm_head
=
nn
.
Linear
(
*
new_lm_head_shape
,
bias
=
has_new_lm_head_bias
,
device
=
old_lm_head
.
weight
.
device
,
dtype
=
old_lm_head
.
weight
.
dtype
,
)
params
=
[
old_lm_head
.
weight
,
old_lm_head
.
bias
,
new_lm_head
.
weight
,
new_lm_head
.
bias
]
with
deepspeed
.
zero
.
GatheredParameters
(
params
,
modifier_rank
=
0
):
# initialize new lm head (in particular added tokens)
self
.
_init_weights
(
new_lm_head
)
# Copy old lm head weights to new lm head
if
not
transposed
:
new_lm_head
.
weight
.
data
[:
num_tokens_to_copy
,
:]
=
old_lm_head
.
weight
.
data
[:
num_tokens_to_copy
,
:]
else
:
new_lm_head
.
weight
.
data
[:,
:
num_tokens_to_copy
]
=
old_lm_head
.
weight
.
data
[:,
:
num_tokens_to_copy
]
# Copy bias weights to new lm head
if
has_new_lm_head_bias
:
new_lm_head
.
bias
.
data
[:
num_tokens_to_copy
]
=
old_lm_head
.
bias
.
data
[:
num_tokens_to_copy
]
num_tokens_to_copy
=
min
(
old_num_tokens
,
new_num_tokens
)
if
is_deepspeed_zero3_enabled
():
import
deepspeed
params
=
[
old_lm_head
.
weight
,
old_lm_head
.
bias
,
new_lm_head
.
weight
,
new_lm_head
.
bias
]
with
deepspeed
.
zero
.
GatheredParameters
(
params
,
modifier_rank
=
0
):
self
.
_copy_lm_head_original_to_resized
(
new_lm_head
,
old_lm_head
,
num_tokens_to_copy
,
transposed
,
has_new_lm_head_bias
)
else
:
new_lm_head
=
nn
.
Linear
(
*
new_lm_head_shape
,
bias
=
has_new_lm_head_bias
,
device
=
old_lm_head
.
weight
.
device
,
dtype
=
old_lm_head
.
weight
.
dtype
,
self
.
_copy_lm_head_original_to_resized
(
new_lm_head
,
old_lm_head
,
num_tokens_to_copy
,
transposed
,
has_new_lm_head_bias
)
self
.
_init_weights
(
new_lm_head
)
return
new_lm_head
def
_copy_lm_head_original_to_resized
(
self
,
new_lm_head
,
old_lm_head
,
num_tokens_to_copy
,
transposed
,
has_new_lm_head_bias
):
# Copy old lm head weights to new lm head
if
not
transposed
:
new_lm_head
.
weight
.
data
[:
num_tokens_to_copy
,
:]
=
old_lm_head
.
weight
.
data
[:
num_tokens_to_copy
,
:]
...
...
@@ -1693,8 +1688,6 @@ class PreTrainedModel(nn.Module, ModuleUtilsMixin, GenerationMixin, PushToHubMix
if
has_new_lm_head_bias
:
new_lm_head
.
bias
.
data
[:
num_tokens_to_copy
]
=
old_lm_head
.
bias
.
data
[:
num_tokens_to_copy
]
return
new_lm_head
def
resize_position_embeddings
(
self
,
new_num_position_embeddings
:
int
):
raise
NotImplementedError
(
f
"`resize_position_embeddings` is not implemented for
{
self
.
__class__
}
`. To implement it, you should "
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment