Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
chenpangpang
transformers
Commits
06886d5a
Unverified
Commit
06886d5a
authored
Nov 03, 2022
by
Sylvain Gugger
Committed by
GitHub
Nov 03, 2022
Browse files
Only resize embeddings when necessary (#20043)
* Only resize embeddings when necessary * Add comment
parent
9080607b
Changes
17
Hide whitespace changes
Inline
Side-by-side
Showing
17 changed files
with
87 additions
and
17 deletions
+87
-17
examples/pytorch/language-modeling/run_clm.py
examples/pytorch/language-modeling/run_clm.py
+5
-1
examples/pytorch/language-modeling/run_clm_no_trainer.py
examples/pytorch/language-modeling/run_clm_no_trainer.py
+5
-1
examples/pytorch/language-modeling/run_mlm.py
examples/pytorch/language-modeling/run_mlm.py
+5
-1
examples/pytorch/language-modeling/run_mlm_no_trainer.py
examples/pytorch/language-modeling/run_mlm_no_trainer.py
+5
-1
examples/pytorch/language-modeling/run_plm.py
examples/pytorch/language-modeling/run_plm.py
+5
-1
examples/pytorch/multiple-choice/run_swag_no_trainer.py
examples/pytorch/multiple-choice/run_swag_no_trainer.py
+5
-1
examples/pytorch/question-answering/run_seq2seq_qa.py
examples/pytorch/question-answering/run_seq2seq_qa.py
+5
-1
examples/pytorch/summarization/run_summarization.py
examples/pytorch/summarization/run_summarization.py
+5
-1
examples/pytorch/summarization/run_summarization_no_trainer.py
...les/pytorch/summarization/run_summarization_no_trainer.py
+5
-1
examples/pytorch/token-classification/run_ner_no_trainer.py
examples/pytorch/token-classification/run_ner_no_trainer.py
+7
-1
examples/pytorch/translation/run_translation.py
examples/pytorch/translation/run_translation.py
+5
-1
examples/pytorch/translation/run_translation_no_trainer.py
examples/pytorch/translation/run_translation_no_trainer.py
+5
-1
examples/tensorflow/language-modeling/run_clm.py
examples/tensorflow/language-modeling/run_clm.py
+5
-1
examples/tensorflow/language-modeling/run_mlm.py
examples/tensorflow/language-modeling/run_mlm.py
+5
-1
examples/tensorflow/summarization/run_summarization.py
examples/tensorflow/summarization/run_summarization.py
+5
-1
examples/tensorflow/token-classification/run_ner.py
examples/tensorflow/token-classification/run_ner.py
+5
-1
examples/tensorflow/translation/run_translation.py
examples/tensorflow/translation/run_translation.py
+5
-1
No files found.
examples/pytorch/language-modeling/run_clm.py
View file @
06886d5a
...
@@ -387,7 +387,11 @@ def main():
...
@@ -387,7 +387,11 @@ def main():
n_params
=
sum
(
dict
((
p
.
data_ptr
(),
p
.
numel
())
for
p
in
model
.
parameters
()).
values
())
n_params
=
sum
(
dict
((
p
.
data_ptr
(),
p
.
numel
())
for
p
in
model
.
parameters
()).
values
())
logger
.
info
(
f
"Training new model from scratch - Total size=
{
n_params
/
2
**
20
:.
2
f
}
M params"
)
logger
.
info
(
f
"Training new model from scratch - Total size=
{
n_params
/
2
**
20
:.
2
f
}
M params"
)
model
.
resize_token_embeddings
(
len
(
tokenizer
))
# We resize the embeddings only when necessary to avoid index errors. If you are creating a model from scratch
# on a small vocab and want a smaller embedding size, remove this test.
embedding_size
=
model
.
get_input_embeddings
().
weight
.
shape
[
0
]
if
len
(
tokenizer
)
>
embedding_size
:
model
.
resize_token_embeddings
(
len
(
tokenizer
))
# Preprocessing the datasets.
# Preprocessing the datasets.
# First we tokenize all the texts.
# First we tokenize all the texts.
...
...
examples/pytorch/language-modeling/run_clm_no_trainer.py
View file @
06886d5a
...
@@ -378,7 +378,11 @@ def main():
...
@@ -378,7 +378,11 @@ def main():
logger
.
info
(
"Training new model from scratch"
)
logger
.
info
(
"Training new model from scratch"
)
model
=
AutoModelForCausalLM
.
from_config
(
config
)
model
=
AutoModelForCausalLM
.
from_config
(
config
)
model
.
resize_token_embeddings
(
len
(
tokenizer
))
# We resize the embeddings only when necessary to avoid index errors. If you are creating a model from scratch
# on a small vocab and want a smaller embedding size, remove this test.
embedding_size
=
model
.
get_input_embeddings
().
weight
.
shape
[
0
]
if
len
(
tokenizer
)
>
embedding_size
:
model
.
resize_token_embeddings
(
len
(
tokenizer
))
# Preprocessing the datasets.
# Preprocessing the datasets.
# First we tokenize all the texts.
# First we tokenize all the texts.
...
...
examples/pytorch/language-modeling/run_mlm.py
View file @
06886d5a
...
@@ -389,7 +389,11 @@ def main():
...
@@ -389,7 +389,11 @@ def main():
logger
.
info
(
"Training new model from scratch"
)
logger
.
info
(
"Training new model from scratch"
)
model
=
AutoModelForMaskedLM
.
from_config
(
config
)
model
=
AutoModelForMaskedLM
.
from_config
(
config
)
model
.
resize_token_embeddings
(
len
(
tokenizer
))
# We resize the embeddings only when necessary to avoid index errors. If you are creating a model from scratch
# on a small vocab and want a smaller embedding size, remove this test.
embedding_size
=
model
.
get_input_embeddings
().
weight
.
shape
[
0
]
if
len
(
tokenizer
)
>
embedding_size
:
model
.
resize_token_embeddings
(
len
(
tokenizer
))
# Preprocessing the datasets.
# Preprocessing the datasets.
# First we tokenize all the texts.
# First we tokenize all the texts.
...
...
examples/pytorch/language-modeling/run_mlm_no_trainer.py
View file @
06886d5a
...
@@ -383,7 +383,11 @@ def main():
...
@@ -383,7 +383,11 @@ def main():
logger
.
info
(
"Training new model from scratch"
)
logger
.
info
(
"Training new model from scratch"
)
model
=
AutoModelForMaskedLM
.
from_config
(
config
)
model
=
AutoModelForMaskedLM
.
from_config
(
config
)
model
.
resize_token_embeddings
(
len
(
tokenizer
))
# We resize the embeddings only when necessary to avoid index errors. If you are creating a model from scratch
# on a small vocab and want a smaller embedding size, remove this test.
embedding_size
=
model
.
get_input_embeddings
().
weight
.
shape
[
0
]
if
len
(
tokenizer
)
>
embedding_size
:
model
.
resize_token_embeddings
(
len
(
tokenizer
))
# Preprocessing the datasets.
# Preprocessing the datasets.
# First we tokenize all the texts.
# First we tokenize all the texts.
...
...
examples/pytorch/language-modeling/run_plm.py
View file @
06886d5a
...
@@ -376,7 +376,11 @@ def main():
...
@@ -376,7 +376,11 @@ def main():
logger
.
info
(
"Training new model from scratch"
)
logger
.
info
(
"Training new model from scratch"
)
model
=
XLNetLMHeadModel
(
config
)
model
=
XLNetLMHeadModel
(
config
)
model
.
resize_token_embeddings
(
len
(
tokenizer
))
# We resize the embeddings only when necessary to avoid index errors. If you are creating a model from scratch
# on a small vocab and want a smaller embedding size, remove this test.
embedding_size
=
model
.
get_input_embeddings
().
weight
.
shape
[
0
]
if
len
(
tokenizer
)
>
embedding_size
:
model
.
resize_token_embeddings
(
len
(
tokenizer
))
# Preprocessing the datasets.
# Preprocessing the datasets.
# First we tokenize all the texts.
# First we tokenize all the texts.
...
...
examples/pytorch/multiple-choice/run_swag_no_trainer.py
View file @
06886d5a
...
@@ -398,7 +398,11 @@ def main():
...
@@ -398,7 +398,11 @@ def main():
logger
.
info
(
"Training new model from scratch"
)
logger
.
info
(
"Training new model from scratch"
)
model
=
AutoModelForMultipleChoice
.
from_config
(
config
)
model
=
AutoModelForMultipleChoice
.
from_config
(
config
)
model
.
resize_token_embeddings
(
len
(
tokenizer
))
# We resize the embeddings only when necessary to avoid index errors. If you are creating a model from scratch
# on a small vocab and want a smaller embedding size, remove this test.
embedding_size
=
model
.
get_input_embeddings
().
weight
.
shape
[
0
]
if
len
(
tokenizer
)
>
embedding_size
:
model
.
resize_token_embeddings
(
len
(
tokenizer
))
# Preprocessing the datasets.
# Preprocessing the datasets.
# First we tokenize all the texts.
# First we tokenize all the texts.
...
...
examples/pytorch/question-answering/run_seq2seq_qa.py
View file @
06886d5a
...
@@ -380,7 +380,11 @@ def main():
...
@@ -380,7 +380,11 @@ def main():
use_auth_token
=
True
if
model_args
.
use_auth_token
else
None
,
use_auth_token
=
True
if
model_args
.
use_auth_token
else
None
,
)
)
model
.
resize_token_embeddings
(
len
(
tokenizer
))
# We resize the embeddings only when necessary to avoid index errors. If you are creating a model from scratch
# on a small vocab and want a smaller embedding size, remove this test.
embedding_size
=
model
.
get_input_embeddings
().
weight
.
shape
[
0
]
if
len
(
tokenizer
)
>
embedding_size
:
model
.
resize_token_embeddings
(
len
(
tokenizer
))
if
model
.
config
.
decoder_start_token_id
is
None
:
if
model
.
config
.
decoder_start_token_id
is
None
:
raise
ValueError
(
"Make sure that `config.decoder_start_token_id` is correctly defined"
)
raise
ValueError
(
"Make sure that `config.decoder_start_token_id` is correctly defined"
)
...
...
examples/pytorch/summarization/run_summarization.py
View file @
06886d5a
...
@@ -422,7 +422,11 @@ def main():
...
@@ -422,7 +422,11 @@ def main():
use_auth_token
=
True
if
model_args
.
use_auth_token
else
None
,
use_auth_token
=
True
if
model_args
.
use_auth_token
else
None
,
)
)
model
.
resize_token_embeddings
(
len
(
tokenizer
))
# We resize the embeddings only when necessary to avoid index errors. If you are creating a model from scratch
# on a small vocab and want a smaller embedding size, remove this test.
embedding_size
=
model
.
get_input_embeddings
().
weight
.
shape
[
0
]
if
len
(
tokenizer
)
>
embedding_size
:
model
.
resize_token_embeddings
(
len
(
tokenizer
))
if
model
.
config
.
decoder_start_token_id
is
None
and
isinstance
(
tokenizer
,
(
MBartTokenizer
,
MBartTokenizerFast
)):
if
model
.
config
.
decoder_start_token_id
is
None
and
isinstance
(
tokenizer
,
(
MBartTokenizer
,
MBartTokenizerFast
)):
if
isinstance
(
tokenizer
,
MBartTokenizer
):
if
isinstance
(
tokenizer
,
MBartTokenizer
):
...
...
examples/pytorch/summarization/run_summarization_no_trainer.py
View file @
06886d5a
...
@@ -439,7 +439,11 @@ def main():
...
@@ -439,7 +439,11 @@ def main():
logger
.
info
(
"Training new model from scratch"
)
logger
.
info
(
"Training new model from scratch"
)
model
=
AutoModelForSeq2SeqLM
.
from_config
(
config
)
model
=
AutoModelForSeq2SeqLM
.
from_config
(
config
)
model
.
resize_token_embeddings
(
len
(
tokenizer
))
# We resize the embeddings only when necessary to avoid index errors. If you are creating a model from scratch
# on a small vocab and want a smaller embedding size, remove this test.
embedding_size
=
model
.
get_input_embeddings
().
weight
.
shape
[
0
]
if
len
(
tokenizer
)
>
embedding_size
:
model
.
resize_token_embeddings
(
len
(
tokenizer
))
if
model
.
config
.
decoder_start_token_id
is
None
:
if
model
.
config
.
decoder_start_token_id
is
None
:
raise
ValueError
(
"Make sure that `config.decoder_start_token_id` is correctly defined"
)
raise
ValueError
(
"Make sure that `config.decoder_start_token_id` is correctly defined"
)
...
...
examples/pytorch/token-classification/run_ner_no_trainer.py
View file @
06886d5a
...
@@ -414,7 +414,13 @@ def main():
...
@@ -414,7 +414,13 @@ def main():
logger
.
info
(
"Training new model from scratch"
)
logger
.
info
(
"Training new model from scratch"
)
model
=
AutoModelForTokenClassification
.
from_config
(
config
)
model
=
AutoModelForTokenClassification
.
from_config
(
config
)
model
.
resize_token_embeddings
(
len
(
tokenizer
))
# We resize the embeddings only when necessary to avoid index errors. If you are creating a model from scratch
# on a small vocab and want a smaller embedding size, remove this test.
embedding_size
=
model
.
get_input_embeddings
().
weight
.
shape
[
0
]
if
len
(
tokenizer
)
>
embedding_size
:
embedding_size
=
model
.
get_input_embeddings
().
weight
.
shape
[
0
]
if
len
(
tokenizer
)
>
embedding_size
:
model
.
resize_token_embeddings
(
len
(
tokenizer
))
# Model has labels -> use them.
# Model has labels -> use them.
if
model
.
config
.
label2id
!=
PretrainedConfig
(
num_labels
=
num_labels
).
label2id
:
if
model
.
config
.
label2id
!=
PretrainedConfig
(
num_labels
=
num_labels
).
label2id
:
...
...
examples/pytorch/translation/run_translation.py
View file @
06886d5a
...
@@ -380,7 +380,11 @@ def main():
...
@@ -380,7 +380,11 @@ def main():
use_auth_token
=
True
if
model_args
.
use_auth_token
else
None
,
use_auth_token
=
True
if
model_args
.
use_auth_token
else
None
,
)
)
model
.
resize_token_embeddings
(
len
(
tokenizer
))
# We resize the embeddings only when necessary to avoid index errors. If you are creating a model from scratch
# on a small vocab and want a smaller embedding size, remove this test.
embedding_size
=
model
.
get_input_embeddings
().
weight
.
shape
[
0
]
if
len
(
tokenizer
)
>
embedding_size
:
model
.
resize_token_embeddings
(
len
(
tokenizer
))
# Set decoder_start_token_id
# Set decoder_start_token_id
if
model
.
config
.
decoder_start_token_id
is
None
and
isinstance
(
tokenizer
,
(
MBartTokenizer
,
MBartTokenizerFast
)):
if
model
.
config
.
decoder_start_token_id
is
None
and
isinstance
(
tokenizer
,
(
MBartTokenizer
,
MBartTokenizerFast
)):
...
...
examples/pytorch/translation/run_translation_no_trainer.py
View file @
06886d5a
...
@@ -411,7 +411,11 @@ def main():
...
@@ -411,7 +411,11 @@ def main():
logger
.
info
(
"Training new model from scratch"
)
logger
.
info
(
"Training new model from scratch"
)
model
=
AutoModelForSeq2SeqLM
.
from_config
(
config
)
model
=
AutoModelForSeq2SeqLM
.
from_config
(
config
)
model
.
resize_token_embeddings
(
len
(
tokenizer
))
# We resize the embeddings only when necessary to avoid index errors. If you are creating a model from scratch
# on a small vocab and want a smaller embedding size, remove this test.
embedding_size
=
model
.
get_input_embeddings
().
weight
.
shape
[
0
]
if
len
(
tokenizer
)
>
embedding_size
:
model
.
resize_token_embeddings
(
len
(
tokenizer
))
# Set decoder_start_token_id
# Set decoder_start_token_id
if
model
.
config
.
decoder_start_token_id
is
None
and
isinstance
(
tokenizer
,
(
MBartTokenizer
,
MBartTokenizerFast
)):
if
model
.
config
.
decoder_start_token_id
is
None
and
isinstance
(
tokenizer
,
(
MBartTokenizer
,
MBartTokenizerFast
)):
...
...
examples/tensorflow/language-modeling/run_clm.py
View file @
06886d5a
...
@@ -473,7 +473,11 @@ def main():
...
@@ -473,7 +473,11 @@ def main():
logger
.
info
(
"Training new model from scratch"
)
logger
.
info
(
"Training new model from scratch"
)
model
=
TFAutoModelForCausalLM
.
from_config
(
config
)
model
=
TFAutoModelForCausalLM
.
from_config
(
config
)
model
.
resize_token_embeddings
(
len
(
tokenizer
))
# We resize the embeddings only when necessary to avoid index errors. If you are creating a model from scratch
# on a small vocab and want a smaller embedding size, remove this test.
embedding_size
=
model
.
get_input_embeddings
().
weight
.
shape
[
0
]
if
len
(
tokenizer
)
>
embedding_size
:
model
.
resize_token_embeddings
(
len
(
tokenizer
))
# endregion
# endregion
# region TF Dataset preparation
# region TF Dataset preparation
...
...
examples/tensorflow/language-modeling/run_mlm.py
View file @
06886d5a
...
@@ -489,7 +489,11 @@ def main():
...
@@ -489,7 +489,11 @@ def main():
logger
.
info
(
"Training new model from scratch"
)
logger
.
info
(
"Training new model from scratch"
)
model
=
TFAutoModelForMaskedLM
.
from_config
(
config
)
model
=
TFAutoModelForMaskedLM
.
from_config
(
config
)
model
.
resize_token_embeddings
(
len
(
tokenizer
))
# We resize the embeddings only when necessary to avoid index errors. If you are creating a model from scratch
# on a small vocab and want a smaller embedding size, remove this test.
embedding_size
=
model
.
get_input_embeddings
().
weight
.
shape
[
0
]
if
len
(
tokenizer
)
>
embedding_size
:
model
.
resize_token_embeddings
(
len
(
tokenizer
))
# endregion
# endregion
# region TF Dataset preparation
# region TF Dataset preparation
...
...
examples/tensorflow/summarization/run_summarization.py
View file @
06886d5a
...
@@ -516,7 +516,11 @@ def main():
...
@@ -516,7 +516,11 @@ def main():
use_auth_token
=
True
if
model_args
.
use_auth_token
else
None
,
use_auth_token
=
True
if
model_args
.
use_auth_token
else
None
,
)
)
model
.
resize_token_embeddings
(
len
(
tokenizer
))
# We resize the embeddings only when necessary to avoid index errors. If you are creating a model from scratch
# on a small vocab and want a smaller embedding size, remove this test.
embedding_size
=
model
.
get_input_embeddings
().
weight
.
shape
[
0
]
if
len
(
tokenizer
)
>
embedding_size
:
model
.
resize_token_embeddings
(
len
(
tokenizer
))
# endregion
# endregion
# region Prepare TF Dataset objects
# region Prepare TF Dataset objects
...
...
examples/tensorflow/token-classification/run_ner.py
View file @
06886d5a
...
@@ -385,7 +385,11 @@ def main():
...
@@ -385,7 +385,11 @@ def main():
logger
.
info
(
"Training new model from scratch"
)
logger
.
info
(
"Training new model from scratch"
)
model
=
TFAutoModelForTokenClassification
.
from_config
(
config
)
model
=
TFAutoModelForTokenClassification
.
from_config
(
config
)
model
.
resize_token_embeddings
(
len
(
tokenizer
))
# We resize the embeddings only when necessary to avoid index errors. If you are creating a model from scratch
# on a small vocab and want a smaller embedding size, remove this test.
embedding_size
=
model
.
get_input_embeddings
().
weight
.
shape
[
0
]
if
len
(
tokenizer
)
>
embedding_size
:
model
.
resize_token_embeddings
(
len
(
tokenizer
))
# endregion
# endregion
# region Create TF datasets
# region Create TF datasets
...
...
examples/tensorflow/translation/run_translation.py
View file @
06886d5a
...
@@ -469,7 +469,11 @@ def main():
...
@@ -469,7 +469,11 @@ def main():
use_auth_token
=
True
if
model_args
.
use_auth_token
else
None
,
use_auth_token
=
True
if
model_args
.
use_auth_token
else
None
,
)
)
model
.
resize_token_embeddings
(
len
(
tokenizer
))
# We resize the embeddings only when necessary to avoid index errors. If you are creating a model from scratch
# on a small vocab and want a smaller embedding size, remove this test.
embedding_size
=
model
.
get_input_embeddings
().
weight
.
shape
[
0
]
if
len
(
tokenizer
)
>
embedding_size
:
model
.
resize_token_embeddings
(
len
(
tokenizer
))
if
isinstance
(
tokenizer
,
tuple
(
MULTILINGUAL_TOKENIZERS
)):
if
isinstance
(
tokenizer
,
tuple
(
MULTILINGUAL_TOKENIZERS
)):
model
.
config
.
forced_bos_token_id
=
forced_bos_token_id
model
.
config
.
forced_bos_token_id
=
forced_bos_token_id
# endregion
# endregion
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment