Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
chenpangpang
transformers
Commits
7d709e55
Commit
7d709e55
authored
Oct 22, 2019
by
Lysandre
Browse files
Remove
parent
44286b94
Changes
10
Show whitespace changes
Inline
Side-by-side
Showing
10 changed files
with
41 additions
and
39 deletions
+41
-39
examples/benchmarks.py
examples/benchmarks.py
+2
-2
examples/distillation/scripts/binarized_data.py
examples/distillation/scripts/binarized_data.py
+1
-1
examples/run_generation.py
examples/run_generation.py
+1
-1
transformers/tests/tokenization_bert_test.py
transformers/tests/tokenization_bert_test.py
+2
-2
transformers/tests/tokenization_distilbert_test.py
transformers/tests/tokenization_distilbert_test.py
+2
-2
transformers/tests/tokenization_roberta_test.py
transformers/tests/tokenization_roberta_test.py
+4
-4
transformers/tests/tokenization_tests_commons.py
transformers/tests/tokenization_tests_commons.py
+15
-13
transformers/tests/tokenization_xlm_test.py
transformers/tests/tokenization_xlm_test.py
+2
-2
transformers/tests/tokenization_xlnet_test.py
transformers/tests/tokenization_xlnet_test.py
+2
-2
transformers/tokenization_utils.py
transformers/tokenization_utils.py
+10
-10
No files found.
examples/benchmarks.py
View file @
7d709e55
...
...
@@ -309,7 +309,7 @@ def _compute_pytorch(model_names, dictionary, average_over, device, torchscript)
model
=
AutoModel
.
from_pretrained
(
model_name
,
config
=
config
)
tokenizer
=
AutoTokenizer
.
from_pretrained
(
model_name
)
tokenized_sequence
=
tokenizer
.
encode
(
input_text
)
tokenized_sequence
=
tokenizer
.
encode
(
input_text
,
add_special_tokens
=
False
)
max_input_size
=
tokenizer
.
max_model_input_sizes
[
model_name
]
batch_sizes
=
[
1
,
2
,
4
,
8
]
...
...
@@ -353,7 +353,7 @@ def _compute_tensorflow(model_names, dictionary, average_over):
model
=
TFAutoModel
.
from_pretrained
(
model_name
,
config
=
config
)
tokenizer
=
AutoTokenizer
.
from_pretrained
(
model_name
)
tokenized_sequence
=
tokenizer
.
encode
(
input_text
)
tokenized_sequence
=
tokenizer
.
encode
(
input_text
,
add_special_tokens
=
False
)
max_input_size
=
tokenizer
.
max_model_input_sizes
[
model_name
]
batch_sizes
=
[
1
,
2
,
4
,
8
]
...
...
examples/distillation/scripts/binarized_data.py
View file @
7d709e55
...
...
@@ -68,7 +68,7 @@ def main():
start
=
time
.
time
()
for
text
in
data
:
text
=
f
'
{
bos
}
{
text
.
strip
()
}
{
sep
}
'
token_ids
=
tokenizer
.
encode
(
text
)
token_ids
=
tokenizer
.
encode
(
text
,
add_special_tokens
=
False
)
rslt
.
append
(
token_ids
)
iter
+=
1
...
...
examples/run_generation.py
View file @
7d709e55
...
...
@@ -223,7 +223,7 @@ def main():
if
args
.
model_type
in
[
"transfo-xl"
,
"xlnet"
]:
# Models with memory likes to have a long prompt for short inputs.
raw_text
=
(
args
.
padding_text
if
args
.
padding_text
else
PADDING_TEXT
)
+
raw_text
context_tokens
=
tokenizer
.
encode
(
raw_text
)
context_tokens
=
tokenizer
.
encode
(
raw_text
,
add_special_tokens
=
False
)
out
=
sample_sequence
(
model
=
model
,
context
=
context_tokens
,
...
...
transformers/tests/tokenization_bert_test.py
View file @
7d709e55
...
...
@@ -128,8 +128,8 @@ class BertTokenizationTest(CommonTestCases.CommonTokenizerTester):
def
test_sequence_builders
(
self
):
tokenizer
=
self
.
tokenizer_class
.
from_pretrained
(
"bert-base-uncased"
)
text
=
tokenizer
.
encode
(
"sequence builders"
)
text_2
=
tokenizer
.
encode
(
"multi-sequence build"
)
text
=
tokenizer
.
encode
(
"sequence builders"
,
add_special_tokens
=
False
)
text_2
=
tokenizer
.
encode
(
"multi-sequence build"
,
add_special_tokens
=
False
)
encoded_sentence
=
tokenizer
.
build_inputs_with_special_tokens
(
text
)
encoded_pair
=
tokenizer
.
build_inputs_with_special_tokens
(
text
,
text_2
)
...
...
transformers/tests/tokenization_distilbert_test.py
View file @
7d709e55
...
...
@@ -33,8 +33,8 @@ class DistilBertTokenizationTest(BertTokenizationTest):
def
test_sequence_builders
(
self
):
tokenizer
=
DistilBertTokenizer
.
from_pretrained
(
"distilbert-base-uncased"
)
text
=
tokenizer
.
encode
(
"sequence builders"
)
text_2
=
tokenizer
.
encode
(
"multi-sequence build"
)
text
=
tokenizer
.
encode
(
"sequence builders"
,
add_special_tokens
=
False
)
text_2
=
tokenizer
.
encode
(
"multi-sequence build"
,
add_special_tokens
=
False
)
encoded_sentence
=
tokenizer
.
build_inputs_with_special_tokens
(
text
)
encoded_pair
=
tokenizer
.
build_inputs_with_special_tokens
(
text
,
text_2
)
...
...
transformers/tests/tokenization_roberta_test.py
View file @
7d709e55
...
...
@@ -70,19 +70,19 @@ class RobertaTokenizationTest(CommonTestCases.CommonTokenizerTester):
tokenizer
=
self
.
get_tokenizer
()
self
.
assertListEqual
(
tokenizer
.
encode
(
'Hello world!'
),
tokenizer
.
encode
(
'Hello world!'
,
add_special_tokens
=
False
),
[
0
,
31414
,
232
,
328
,
2
]
)
self
.
assertListEqual
(
tokenizer
.
encode
(
'Hello world! cécé herlolip 418'
),
tokenizer
.
encode
(
'Hello world! cécé herlolip 418'
,
add_special_tokens
=
False
),
[
0
,
31414
,
232
,
328
,
740
,
1140
,
12695
,
69
,
46078
,
1588
,
2
]
)
def
test_sequence_builders
(
self
):
tokenizer
=
RobertaTokenizer
.
from_pretrained
(
"roberta-base"
)
text
=
tokenizer
.
encode
(
"sequence builders"
)
text_2
=
tokenizer
.
encode
(
"multi-sequence build"
)
text
=
tokenizer
.
encode
(
"sequence builders"
,
add_special_tokens
=
False
)
text_2
=
tokenizer
.
encode
(
"multi-sequence build"
,
add_special_tokens
=
False
)
encoded_text_from_decode
=
tokenizer
.
encode
(
"sequence builders"
,
add_special_tokens
=
True
)
encoded_pair_from_decode
=
tokenizer
.
encode
(
"sequence builders"
,
"multi-sequence build"
,
add_special_tokens
=
True
)
...
...
transformers/tests/tokenization_tests_commons.py
View file @
7d709e55
...
...
@@ -79,13 +79,13 @@ class CommonTestCases:
# Now let's start the test
tokenizer
=
self
.
get_tokenizer
(
max_len
=
42
)
before_tokens
=
tokenizer
.
encode
(
u
"He is very happy, UNwant
\u00E9
d,running"
)
before_tokens
=
tokenizer
.
encode
(
u
"He is very happy, UNwant
\u00E9
d,running"
,
add_special_tokens
=
False
)
with
TemporaryDirectory
()
as
tmpdirname
:
tokenizer
.
save_pretrained
(
tmpdirname
)
tokenizer
=
self
.
tokenizer_class
.
from_pretrained
(
tmpdirname
)
after_tokens
=
tokenizer
.
encode
(
u
"He is very happy, UNwant
\u00E9
d,running"
)
after_tokens
=
tokenizer
.
encode
(
u
"He is very happy, UNwant
\u00E9
d,running"
,
add_special_tokens
=
False
)
self
.
assertListEqual
(
before_tokens
,
after_tokens
)
self
.
assertEqual
(
tokenizer
.
max_len
,
42
)
...
...
@@ -130,7 +130,7 @@ class CommonTestCases:
self
.
assertEqual
(
added_toks
,
len
(
new_toks
))
self
.
assertEqual
(
all_size_2
,
all_size
+
len
(
new_toks
))
tokens
=
tokenizer
.
encode
(
"aaaaa bbbbbb low cccccccccdddddddd l"
)
tokens
=
tokenizer
.
encode
(
"aaaaa bbbbbb low cccccccccdddddddd l"
,
add_special_tokens
=
False
)
out_string
=
tokenizer
.
decode
(
tokens
)
self
.
assertGreaterEqual
(
len
(
tokens
),
4
)
...
...
@@ -148,7 +148,8 @@ class CommonTestCases:
self
.
assertEqual
(
added_toks_2
,
len
(
new_toks_2
))
self
.
assertEqual
(
all_size_3
,
all_size_2
+
len
(
new_toks_2
))
tokens
=
tokenizer
.
encode
(
">>>>|||<||<<|<< aaaaabbbbbb low cccccccccdddddddd <<<<<|||>|>>>>|> l"
)
tokens
=
tokenizer
.
encode
(
">>>>|||<||<<|<< aaaaabbbbbb low cccccccccdddddddd <<<<<|||>|>>>>|> l"
,
add_special_tokens
=
False
)
out_string
=
tokenizer
.
decode
(
tokens
)
self
.
assertGreaterEqual
(
len
(
tokens
),
6
)
...
...
@@ -166,7 +167,7 @@ class CommonTestCases:
tokens
=
tokenizer
.
tokenize
(
input_text
)
ids
=
tokenizer
.
convert_tokens_to_ids
(
tokens
)
ids_2
=
tokenizer
.
encode
(
input_text
)
ids_2
=
tokenizer
.
encode
(
input_text
,
add_special_tokens
=
False
)
self
.
assertListEqual
(
ids
,
ids_2
)
tokens_2
=
tokenizer
.
convert_ids_to_tokens
(
ids
)
...
...
@@ -206,7 +207,7 @@ class CommonTestCases:
seq_0
=
"Test this method."
seq_1
=
"With these inputs."
sequences
=
tokenizer
.
encode
(
seq_0
,
seq_1
)
sequences
=
tokenizer
.
encode
(
seq_0
,
seq_1
,
add_special_tokens
=
False
)
attached_sequences
=
tokenizer
.
encode
(
seq_0
,
seq_1
,
add_special_tokens
=
True
)
# Method is implemented (e.g. not GPT-2)
...
...
@@ -219,7 +220,7 @@ class CommonTestCases:
seq_0
=
"This is a sentence to be encoded."
stride
=
2
sequence
=
tokenizer
.
encode
(
seq_0
)
sequence
=
tokenizer
.
encode
(
seq_0
,
add_special_tokens
=
False
)
num_added_tokens
=
tokenizer
.
num_added_tokens
()
total_length
=
len
(
sequence
)
+
num_added_tokens
information
=
tokenizer
.
encode_plus
(
seq_0
,
max_length
=
total_length
-
2
,
add_special_tokens
=
True
,
stride
=
stride
)
...
...
@@ -239,13 +240,13 @@ class CommonTestCases:
seq_1
=
"This is another sentence to be encoded."
stride
=
2
sequence_0_no_special_tokens
=
tokenizer
.
encode
(
seq_0
)
sequence_1_no_special_tokens
=
tokenizer
.
encode
(
seq_1
)
sequence_0_no_special_tokens
=
tokenizer
.
encode
(
seq_0
,
add_special_tokens
=
False
)
sequence_1_no_special_tokens
=
tokenizer
.
encode
(
seq_1
,
add_special_tokens
=
False
)
sequence
=
tokenizer
.
encode
(
seq_0
,
seq_1
,
add_special_tokens
=
True
)
truncated_second_sequence
=
tokenizer
.
build_inputs_with_special_tokens
(
tokenizer
.
encode
(
seq_0
),
tokenizer
.
encode
(
seq_1
)[:
-
2
]
tokenizer
.
encode
(
seq_0
,
add_special_tokens
=
False
),
tokenizer
.
encode
(
seq_1
,
add_special_tokens
=
False
)[:
-
2
]
)
information
=
tokenizer
.
encode_plus
(
seq_0
,
seq_1
,
max_length
=
len
(
sequence
)
-
2
,
add_special_tokens
=
True
,
...
...
@@ -283,7 +284,7 @@ class CommonTestCases:
sequence_1
=
"This one too please."
# Testing single inputs
encoded_sequence
=
tokenizer
.
encode
(
sequence_0
)
encoded_sequence
=
tokenizer
.
encode
(
sequence_0
,
add_special_tokens
=
False
)
encoded_sequence_dict
=
tokenizer
.
encode_plus
(
sequence_0
,
add_special_tokens
=
True
)
encoded_sequence_w_special
=
encoded_sequence_dict
[
"input_ids"
]
special_tokens_mask
=
encoded_sequence_dict
[
"special_tokens_mask"
]
...
...
@@ -294,7 +295,8 @@ class CommonTestCases:
self
.
assertEqual
(
encoded_sequence
,
filtered_sequence
)
# Testing inputs pairs
encoded_sequence
=
tokenizer
.
encode
(
sequence_0
)
+
tokenizer
.
encode
(
sequence_1
)
encoded_sequence
=
tokenizer
.
encode
(
sequence_0
,
add_special_tokens
=
False
)
+
tokenizer
.
encode
(
sequence_1
,
add_special_tokens
=
False
)
encoded_sequence_dict
=
tokenizer
.
encode_plus
(
sequence_0
,
sequence_1
,
add_special_tokens
=
True
)
encoded_sequence_w_special
=
encoded_sequence_dict
[
"input_ids"
]
special_tokens_mask
=
encoded_sequence_dict
[
"special_tokens_mask"
]
...
...
transformers/tests/tokenization_xlm_test.py
View file @
7d709e55
...
...
@@ -69,8 +69,8 @@ class XLMTokenizationTest(CommonTestCases.CommonTokenizerTester):
def
test_sequence_builders
(
self
):
tokenizer
=
XLMTokenizer
.
from_pretrained
(
"xlm-mlm-en-2048"
)
text
=
tokenizer
.
encode
(
"sequence builders"
)
text_2
=
tokenizer
.
encode
(
"multi-sequence build"
)
text
=
tokenizer
.
encode
(
"sequence builders"
,
add_special_tokens
=
False
)
text_2
=
tokenizer
.
encode
(
"multi-sequence build"
,
add_special_tokens
=
False
)
encoded_sentence
=
tokenizer
.
build_inputs_with_special_tokens
(
text
)
encoded_pair
=
tokenizer
.
build_inputs_with_special_tokens
(
text
,
text_2
)
...
...
transformers/tests/tokenization_xlnet_test.py
View file @
7d709e55
...
...
@@ -92,8 +92,8 @@ class XLNetTokenizationTest(CommonTestCases.CommonTokenizerTester):
def
test_sequence_builders
(
self
):
tokenizer
=
XLNetTokenizer
.
from_pretrained
(
"xlnet-base-cased"
)
text
=
tokenizer
.
encode
(
"sequence builders"
)
text_2
=
tokenizer
.
encode
(
"multi-sequence build"
)
text
=
tokenizer
.
encode
(
"sequence builders"
,
add_special_tokens
=
False
)
text_2
=
tokenizer
.
encode
(
"multi-sequence build"
,
add_special_tokens
=
False
)
encoded_sentence
=
tokenizer
.
build_inputs_with_special_tokens
(
text
)
encoded_pair
=
tokenizer
.
build_inputs_with_special_tokens
(
text
,
text_2
)
...
...
transformers/tokenization_utils.py
View file @
7d709e55
...
...
@@ -691,7 +691,7 @@ class PreTrainedTokenizer(object):
def
encode
(
self
,
text
,
text_pair
=
None
,
add_special_tokens
=
Fals
e
,
add_special_tokens
=
Tru
e
,
max_length
=
None
,
stride
=
0
,
truncation_strategy
=
'longest_first'
,
...
...
@@ -739,7 +739,7 @@ class PreTrainedTokenizer(object):
def
encode_plus
(
self
,
text
,
text_pair
=
None
,
add_special_tokens
=
Fals
e
,
add_special_tokens
=
Tru
e
,
max_length
=
None
,
stride
=
0
,
truncation_strategy
=
'longest_first'
,
...
...
@@ -794,7 +794,7 @@ class PreTrainedTokenizer(object):
truncation_strategy
=
truncation_strategy
,
return_tensors
=
return_tensors
)
def
prepare_for_model
(
self
,
ids
,
pair_ids
=
None
,
max_length
=
None
,
add_special_tokens
=
Fals
e
,
stride
=
0
,
def
prepare_for_model
(
self
,
ids
,
pair_ids
=
None
,
max_length
=
None
,
add_special_tokens
=
Tru
e
,
stride
=
0
,
truncation_strategy
=
'longest_first'
,
return_tensors
=
None
):
"""
Prepares a sequence of input id, or a pair of sequences of inputs ids so that it can be used by the model.
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment