chenpangpang / transformers · Commits

Commit b0f05e0c (Unverified)
Authored Oct 09, 2020 by Stas Bekman; committed by GitHub on Oct 09, 2020
Parent: bc00b37a

[pegasus] Faster tokenizer tests (#7672)

Showing 8 changed files with 51 additions and 29 deletions (+51 -29)
scripts/pegasus/build_test_sample_spm_no_bos.py   +20  -0
src/transformers/testing_utils.py                 +13  -3
src/transformers/tokenization_pegasus.py          +5   -10
src/transformers/tokenization_reformer.py         +2   -1
src/transformers/tokenization_utils.py            +1   -1
tests/fixtures/test_sentencepiece_no_bos.model    +0   -0
tests/test_tokenization_pegasus.py                +8   -11
tests/test_tokenization_t5.py                     +2   -3
scripts/pegasus/build_test_sample_spm_no_bos.py  (new file, mode 100755)

#!/usr/bin/env python
# this script builds a small sample spm file tests/fixtures/test_sentencepiece_no_bos.model, with features needed by pegasus

# 1. pip install sentencepiece
#
# 2. wget https://raw.githubusercontent.com/google/sentencepiece/master/data/botchan.txt

# 3. build
import sentencepiece as spm

# pegasus:
# 1. no bos
# 2. eos_id is 1
# 3. unk_id is 2
# build a sample spm file accordingly
spm.SentencePieceTrainer.train('--input=botchan.txt --model_prefix=test_sentencepiece_no_bos --bos_id=-1 --unk_id=2 --eos_id=1 --vocab_size=1000')

# 4. now update the fixture
# mv test_sentencepiece_no_bos.model ../../tests/fixtures/
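Not part of the commit, but as a quick sanity check the freshly trained model can be inspected with the sentencepiece API to confirm the Pegasus-style special-token layout requested by the training command above (file name and vocab size taken from that command):

import sentencepiece as spm

# load the freshly trained model (before moving it into tests/fixtures/)
sp = spm.SentencePieceProcessor()
sp.load("test_sentencepiece_no_bos.model")

assert sp.bos_id() == -1   # no BOS token, as pegasus requires
assert sp.eos_id() == 1    # EOS id is 1
assert sp.unk_id() == 2    # UNK id is 2
print("pieces:", sp.get_piece_size())  # should be 1000, per --vocab_size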
src/transformers/testing_utils.py

@@ -184,13 +184,23 @@ def require_faiss(test_case):
     return test_case
 
 
-def get_tests_dir():
+def get_tests_dir(append_path=None):
     """
-    returns the full path to the `tests` dir, so that the tests can be invoked from anywhere
+    Args:
+        append_path: optional path to append to the tests dir path
+
+    Return:
+        The full path to the `tests` dir, so that the tests can be invoked from anywhere.
+        Optionally `append_path` is joined after the `tests` dir if the former is provided.
     """
     # this function caller's __file__
     caller__file__ = inspect.stack()[1][1]
-    return os.path.abspath(os.path.dirname(caller__file__))
+    tests_dir = os.path.abspath(os.path.dirname(caller__file__))
+    if append_path:
+        return os.path.join(tests_dir, append_path)
+    else:
+        return tests_dir
 
 
 #
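For reference, a minimal sketch of how the extended helper is meant to be called, matching the usage in the updated tests further down:

from transformers.testing_utils import get_tests_dir

# with no argument: absolute path of the directory containing the calling test file (the tests/ dir)
tests_dir = get_tests_dir()

# with append_path: a path joined under the tests/ dir, e.g. a fixture file
sample_vocab = get_tests_dir("fixtures/test_sentencepiece_no_bos.model")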
src/transformers/tokenization_pegasus.py

@@ -49,7 +49,7 @@ class PegasusTokenizer(ReformerTokenizer):
     def __init__(self, *args, **kwargs):
         super().__init__(*args, **kwargs)
-        # Dont use reserved words added_token_encoder, added_tokens_decoder because of
+        # Don't use reserved words added_token_encoder, added_tokens_decoder because of
         # AssertionError: Non-consecutive added token '1' found. in from_pretrained
         assert len(self.added_tokens_decoder) == 0
         self.encoder: Dict[int, str] = {0: self.pad_token, 1: self.eos_token}

@@ -58,7 +58,7 @@ class PegasusTokenizer(ReformerTokenizer):
         self.decoder: Dict[str, int] = {v: k for k, v in self.encoder.items()}
 
     def _convert_token_to_id(self, token: str) -> int:
-        """ Converts a token (str) in an id using the vocab. """
+        """ Converts a token (str) to an id using the vocab. """
         if token in self.decoder:
             return self.decoder[token]
         elif token in self.added_tokens_decoder:

@@ -67,7 +67,7 @@ class PegasusTokenizer(ReformerTokenizer):
             return sp_id + self.offset
 
     def _convert_id_to_token(self, index: int) -> str:
-        """Converts an index (integer) in a token (str) using the vocab."""
+        """Converts an index (integer) to a token (str) using the vocab."""
         if index in self.encoder:
             return self.encoder[index]
         elif index in self.added_tokens_encoder:

@@ -81,11 +81,6 @@ class PegasusTokenizer(ReformerTokenizer):
     def vocab_size(self) -> int:
         return len(self.sp_model) + self.offset
 
-    def get_vocab(self) -> Dict[str, int]:
-        vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size)}
-        vocab.update(self.added_tokens_encoder)
-        return vocab
-
     def num_special_tokens_to_add(self, pair=False):
         """Just EOS"""
         return 1

@@ -109,12 +104,12 @@ class PegasusTokenizer(ReformerTokenizer):
     def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None) -> List[int]:
         """
-        Build model inputs from a sequence or a pair of sequence for sequence classification tasks
+        Build model inputs from a sequence or a pair of sequences for sequence classification tasks
         by concatenating and adding special tokens.
         A Pegasus sequence has the following format, where ``X`` represents the sequence:
 
         - single sequence: ``X </s>``
-        - pair of sequences: ``A B </s>``
-          (not intended use)
+        - pair of sequences: ``A B </s>`` (not intended use)
 
         BOS is never used.
         Pairs of sequences are not the expected use case, but they will be handled without a separator.
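An illustrative sketch, not the library's code, of the lookup order that `_convert_token_to_id` / `_convert_id_to_token` implement per the diff above: the hand-built `encoder`/`decoder` dicts resolve the reserved low ids (pad=0, eos=1) first, and every other id is the underlying sentencepiece id shifted by `self.offset`. The helper names, the omission of the added-tokens branch, and the offset value below are assumptions for illustration only:

# illustrative sketch of the pegasus id layout (not library code)
PAD, EOS = "<pad>", "</s>"
encoder = {0: PAD, 1: EOS}                    # reserved low ids, per the __init__ diff
decoder = {v: k for k, v in encoder.items()}
OFFSET = 103                                  # assumed value; shifts sentencepiece ids past the reserved block

def token_to_id(token, sp_piece_to_id):
    """Reserved tokens first, then the sentencepiece id plus the offset."""
    if token in decoder:
        return decoder[token]
    return sp_piece_to_id(token) + OFFSET

def id_to_token(index, sp_id_to_piece):
    """Reverse direction: reserved ids first, then shift back into sentencepiece space."""
    if index in encoder:
        return encoder[index]
    return sp_id_to_piece(index - OFFSET)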
src/transformers/tokenization_reformer.py

@@ -17,6 +17,7 @@
 import os
 from shutil import copyfile
+from typing import Dict
 
 from .tokenization_utils import PreTrainedTokenizer
 from .tokenization_utils_fast import PreTrainedTokenizerFast

@@ -119,7 +120,7 @@ class ReformerTokenizer(PreTrainedTokenizer):
     def vocab_size(self):
         return self.sp_model.get_piece_size()
 
-    def get_vocab(self):
+    def get_vocab(self) -> Dict[str, int]:
         vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size)}
         vocab.update(self.added_tokens_encoder)
         return vocab
src/transformers/tokenization_utils.py

@@ -186,7 +186,7 @@ class PreTrainedTokenizer(PreTrainedTokenizerBase):
             num_added_toks = tokenizer.add_tokens(['new_tok1', 'my_new-tok2'])
             print('We have added', num_added_toks, 'tokens')
-            # Notice: resize_token_embeddings expect to receive the full size of the new vocabulary, i.e. the length of the tokenizer.
+            # Note: resize_token_embeddings expects to receive the full size of the new vocabulary, i.e. the length of the tokenizer.
             model.resize_token_embeddings(len(tokenizer))
         """
         new_tokens = [str(tok) for tok in new_tokens]
tests/fixtures/test_sentencepiece_no_bos.model  (new file, mode 100644)

Binary file added.
tests/test_tokenization_pegasus.py

 import unittest
-from pathlib import Path
 
 from transformers.file_utils import cached_property
-from transformers.testing_utils import require_torch
+from transformers.testing_utils import get_tests_dir, require_torch
 from transformers.tokenization_pegasus import PegasusTokenizer, PegasusTokenizerFast
 
 from .test_tokenization_common import TokenizerTesterMixin
 
 
+SAMPLE_VOCAB = get_tests_dir("fixtures/test_sentencepiece_no_bos.model")
+
 
 class PegasusTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
     tokenizer_class = PegasusTokenizer

@@ -17,11 +19,9 @@ class PegasusTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
     def setUp(self):
         super().setUp()
-        save_dir = Path(self.tmpdirname)
-        spm_file = PegasusTokenizer.vocab_files_names["vocab_file"]
-        if not (save_dir / spm_file).exists():
-            tokenizer = self.pegasus_large_tokenizer
-            tokenizer.save_pretrained(self.tmpdirname)
+        # We have a SentencePiece fixture for testing
+        tokenizer = PegasusTokenizer(SAMPLE_VOCAB)
+        tokenizer.save_pretrained(self.tmpdirname)
 
     @cached_property
     def pegasus_large_tokenizer(self):

@@ -32,10 +32,7 @@ class PegasusTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
         pass
 
     def get_tokenizer(self, **kwargs) -> PegasusTokenizer:
-        if not kwargs:
-            return self.pegasus_large_tokenizer
-        else:
-            return PegasusTokenizer.from_pretrained(self.tmpdirname, **kwargs)
+        return PegasusTokenizer.from_pretrained(self.tmpdirname, **kwargs)
 
     def get_input_output_texts(self, tokenizer):
         return ("This is a test", "This is a test")
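To make the speed-up concrete, the fixture lets a full `PegasusTokenizer` be built locally instead of fetching the google/pegasus-large vocabulary over the network. A rough sketch of that usage, with exact ids and the round-trip result depending on the sample fixture:

from transformers.testing_utils import get_tests_dir
from transformers.tokenization_pegasus import PegasusTokenizer

# build a tokenizer directly from the local SentencePiece fixture (no download needed)
tok = PegasusTokenizer(get_tests_dir("fixtures/test_sentencepiece_no_bos.model"))

ids = tok("This is a test")["input_ids"]           # EOS (</s>, id 1) is appended automatically
print(ids)
print(tok.decode(ids, skip_special_tokens=True))   # expected to round-trip back to "This is a test"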
tests/test_tokenization_t5.py

@@ -14,19 +14,18 @@
 # limitations under the License.
-import os
 import unittest
 
 from transformers import BatchEncoding
 from transformers.file_utils import cached_property
-from transformers.testing_utils import _torch_available
+from transformers.testing_utils import _torch_available, get_tests_dir
 from transformers.tokenization_t5 import T5Tokenizer, T5TokenizerFast
 from transformers.tokenization_xlnet import SPIECE_UNDERLINE
 
 from .test_tokenization_common import TokenizerTesterMixin
 
 
-SAMPLE_VOCAB = os.path.join(os.path.dirname(os.path.abspath(__file__)), "fixtures/test_sentencepiece.model")
+SAMPLE_VOCAB = get_tests_dir("fixtures/test_sentencepiece.model")
 
 FRAMEWORK = "pt" if _torch_available else "tf"