chenpangpang / transformers / Commits / 7d709e55
"git@developer.sourcefind.cn:tianlh/baichuan-pytorch.git" did not exist on "0c7dc007d4d2e0dcf302dbca0baf39e36d5d85fc"
Commit 7d709e55, authored Oct 22, 2019 by Lysandre
Remove

parent 44286b94
Showing 10 changed files with 41 additions and 39 deletions.
examples/benchmarks.py                               +2  -2
examples/distillation/scripts/binarized_data.py      +1  -1
examples/run_generation.py                           +1  -1
transformers/tests/tokenization_bert_test.py         +2  -2
transformers/tests/tokenization_distilbert_test.py   +2  -2
transformers/tests/tokenization_roberta_test.py      +4  -4
transformers/tests/tokenization_tests_commons.py     +15 -13
transformers/tests/tokenization_xlm_test.py          +2  -2
transformers/tests/tokenization_xlnet_test.py        +2  -2
transformers/tokenization_utils.py                   +10 -10
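Note: taken together, these changes flip the default of add_special_tokens from False to True in PreTrainedTokenizer.encode, encode_plus and prepare_for_model (see transformers/tokenization_utils.py at the end of the diff), and update every call site that relied on the old default so it now passes add_special_tokens=False explicitly. A minimal sketch of the behavioural difference, assuming the bert-base-uncased tokenizer used in the tests below (the import and example string are illustrative, not part of the commit):

from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

# After this commit, encode adds the model's special tokens by default
# ([CLS] ... [SEP] for BERT).
with_special = tokenizer.encode("sequence builders")

# Call sites that want the previous behaviour must opt out explicitly,
# which is exactly what the file diffs below do.
without_special = tokenizer.encode("sequence builders", add_special_tokens=False)

assert len(with_special) == len(without_special) + 2  # [CLS] and [SEP]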
examples/benchmarks.py  ·  View file @ 7d709e55

@@ -309,7 +309,7 @@ def _compute_pytorch(model_names, dictionary, average_over, device, torchscript)
         model = AutoModel.from_pretrained(model_name, config=config)
         tokenizer = AutoTokenizer.from_pretrained(model_name)

-        tokenized_sequence = tokenizer.encode(input_text)
+        tokenized_sequence = tokenizer.encode(input_text, add_special_tokens=False)
         max_input_size = tokenizer.max_model_input_sizes[model_name]
         batch_sizes = [1, 2, 4, 8]

@@ -353,7 +353,7 @@ def _compute_tensorflow(model_names, dictionary, average_over):
         model = TFAutoModel.from_pretrained(model_name, config=config)
         tokenizer = AutoTokenizer.from_pretrained(model_name)

-        tokenized_sequence = tokenizer.encode(input_text)
+        tokenized_sequence = tokenizer.encode(input_text, add_special_tokens=False)
         max_input_size = tokenizer.max_model_input_sizes[model_name]
         batch_sizes = [1, 2, 4, 8]
examples/distillation/scripts/binarized_data.py  ·  View file @ 7d709e55

@@ -68,7 +68,7 @@ def main():
     start = time.time()
     for text in data:
         text = f'{bos} {text.strip()} {sep}'
-        token_ids = tokenizer.encode(text)
+        token_ids = tokenizer.encode(text, add_special_tokens=False)
         rslt.append(token_ids)

         iter += 1
examples/run_generation.py  ·  View file @ 7d709e55

@@ -223,7 +223,7 @@ def main():
         if args.model_type in ["transfo-xl", "xlnet"]:
             # Models with memory likes to have a long prompt for short inputs.
             raw_text = (args.padding_text if args.padding_text else PADDING_TEXT) + raw_text
-        context_tokens = tokenizer.encode(raw_text)
+        context_tokens = tokenizer.encode(raw_text, add_special_tokens=False)
         out = sample_sequence(
             model=model,
             context=context_tokens,
transformers/tests/tokenization_bert_test.py  ·  View file @ 7d709e55

@@ -128,8 +128,8 @@ class BertTokenizationTest(CommonTestCases.CommonTokenizerTester):
     def test_sequence_builders(self):
         tokenizer = self.tokenizer_class.from_pretrained("bert-base-uncased")

-        text = tokenizer.encode("sequence builders")
-        text_2 = tokenizer.encode("multi-sequence build")
+        text = tokenizer.encode("sequence builders", add_special_tokens=False)
+        text_2 = tokenizer.encode("multi-sequence build", add_special_tokens=False)

         encoded_sentence = tokenizer.build_inputs_with_special_tokens(text)
         encoded_pair = tokenizer.build_inputs_with_special_tokens(text, text_2)
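Note: this test (and the DistilBERT, RoBERTa, XLM and XLNet variants that follow) keeps its expectations unchanged by first encoding without special tokens and then adding them explicitly via build_inputs_with_special_tokens. Under the new default, that two-step form should match a plain encode call; a small sketch, again assuming bert-base-uncased purely for illustration:

from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

text = tokenizer.encode("sequence builders", add_special_tokens=False)

# Adding the special tokens by hand should reproduce encode's new default output
# (assuming no truncation is involved).
built = tokenizer.build_inputs_with_special_tokens(text)
assert built == tokenizer.encode("sequence builders")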
transformers/tests/tokenization_distilbert_test.py  ·  View file @ 7d709e55

@@ -33,8 +33,8 @@ class DistilBertTokenizationTest(BertTokenizationTest):
     def test_sequence_builders(self):
         tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")

-        text = tokenizer.encode("sequence builders")
-        text_2 = tokenizer.encode("multi-sequence build")
+        text = tokenizer.encode("sequence builders", add_special_tokens=False)
+        text_2 = tokenizer.encode("multi-sequence build", add_special_tokens=False)

         encoded_sentence = tokenizer.build_inputs_with_special_tokens(text)
         encoded_pair = tokenizer.build_inputs_with_special_tokens(text, text_2)
transformers/tests/tokenization_roberta_test.py  ·  View file @ 7d709e55

@@ -70,19 +70,19 @@ class RobertaTokenizationTest(CommonTestCases.CommonTokenizerTester):
         tokenizer = self.get_tokenizer()

         self.assertListEqual(
-            tokenizer.encode('Hello world!'),
+            tokenizer.encode('Hello world!', add_special_tokens=False),
             [0, 31414, 232, 328, 2]
         )
         self.assertListEqual(
-            tokenizer.encode('Hello world! cécé herlolip 418'),
+            tokenizer.encode('Hello world! cécé herlolip 418', add_special_tokens=False),
             [0, 31414, 232, 328, 740, 1140, 12695, 69, 46078, 1588, 2]
         )

     def test_sequence_builders(self):
         tokenizer = RobertaTokenizer.from_pretrained("roberta-base")

-        text = tokenizer.encode("sequence builders")
-        text_2 = tokenizer.encode("multi-sequence build")
+        text = tokenizer.encode("sequence builders", add_special_tokens=False)
+        text_2 = tokenizer.encode("multi-sequence build", add_special_tokens=False)

         encoded_text_from_decode = tokenizer.encode("sequence builders", add_special_tokens=True)
         encoded_pair_from_decode = tokenizer.encode("sequence builders", "multi-sequence build", add_special_tokens=True)
transformers/tests/tokenization_tests_commons.py  ·  View file @ 7d709e55

@@ -79,13 +79,13 @@ class CommonTestCases:
             # Now let's start the test
             tokenizer = self.get_tokenizer(max_len=42)

-            before_tokens = tokenizer.encode(u"He is very happy, UNwant\u00E9d,running")
+            before_tokens = tokenizer.encode(u"He is very happy, UNwant\u00E9d,running", add_special_tokens=False)

             with TemporaryDirectory() as tmpdirname:
                 tokenizer.save_pretrained(tmpdirname)
                 tokenizer = self.tokenizer_class.from_pretrained(tmpdirname)

-                after_tokens = tokenizer.encode(u"He is very happy, UNwant\u00E9d,running")
+                after_tokens = tokenizer.encode(u"He is very happy, UNwant\u00E9d,running", add_special_tokens=False)
                 self.assertListEqual(before_tokens, after_tokens)

                 self.assertEqual(tokenizer.max_len, 42)

@@ -130,7 +130,7 @@ class CommonTestCases:
             self.assertEqual(added_toks, len(new_toks))
             self.assertEqual(all_size_2, all_size + len(new_toks))

-            tokens = tokenizer.encode("aaaaa bbbbbb low cccccccccdddddddd l")
+            tokens = tokenizer.encode("aaaaa bbbbbb low cccccccccdddddddd l", add_special_tokens=False)
             out_string = tokenizer.decode(tokens)

             self.assertGreaterEqual(len(tokens), 4)

@@ -148,7 +148,8 @@ class CommonTestCases:
             self.assertEqual(added_toks_2, len(new_toks_2))
             self.assertEqual(all_size_3, all_size_2 + len(new_toks_2))

-            tokens = tokenizer.encode(">>>>|||<||<<|<< aaaaabbbbbb low cccccccccdddddddd <<<<<|||>|>>>>|> l")
+            tokens = tokenizer.encode(">>>>|||<||<<|<< aaaaabbbbbb low cccccccccdddddddd <<<<<|||>|>>>>|> l",
+                                      add_special_tokens=False)
             out_string = tokenizer.decode(tokens)

             self.assertGreaterEqual(len(tokens), 6)

@@ -166,7 +167,7 @@ class CommonTestCases:
             tokens = tokenizer.tokenize(input_text)
             ids = tokenizer.convert_tokens_to_ids(tokens)
-            ids_2 = tokenizer.encode(input_text)
+            ids_2 = tokenizer.encode(input_text, add_special_tokens=False)
             self.assertListEqual(ids, ids_2)

             tokens_2 = tokenizer.convert_ids_to_tokens(ids)

@@ -206,7 +207,7 @@ class CommonTestCases:
             seq_0 = "Test this method."
             seq_1 = "With these inputs."
-            sequences = tokenizer.encode(seq_0, seq_1)
+            sequences = tokenizer.encode(seq_0, seq_1, add_special_tokens=False)
             attached_sequences = tokenizer.encode(seq_0, seq_1, add_special_tokens=True)

             # Method is implemented (e.g. not GPT-2)

@@ -219,7 +220,7 @@ class CommonTestCases:
             seq_0 = "This is a sentence to be encoded."
             stride = 2

-            sequence = tokenizer.encode(seq_0)
+            sequence = tokenizer.encode(seq_0, add_special_tokens=False)
             num_added_tokens = tokenizer.num_added_tokens()
             total_length = len(sequence) + num_added_tokens
             information = tokenizer.encode_plus(seq_0, max_length=total_length - 2, add_special_tokens=True, stride=stride)

@@ -239,13 +240,13 @@ class CommonTestCases:
             seq_1 = "This is another sentence to be encoded."
             stride = 2

-            sequence_0_no_special_tokens = tokenizer.encode(seq_0)
-            sequence_1_no_special_tokens = tokenizer.encode(seq_1)
+            sequence_0_no_special_tokens = tokenizer.encode(seq_0, add_special_tokens=False)
+            sequence_1_no_special_tokens = tokenizer.encode(seq_1, add_special_tokens=False)

             sequence = tokenizer.encode(seq_0, seq_1, add_special_tokens=True)
             truncated_second_sequence = tokenizer.build_inputs_with_special_tokens(
-                tokenizer.encode(seq_0),
-                tokenizer.encode(seq_1)[:-2]
+                tokenizer.encode(seq_0, add_special_tokens=False),
+                tokenizer.encode(seq_1, add_special_tokens=False)[:-2]
             )

             information = tokenizer.encode_plus(seq_0, seq_1, max_length=len(sequence) - 2, add_special_tokens=True,

@@ -283,7 +284,7 @@ class CommonTestCases:
             sequence_1 = "This one too please."

             # Testing single inputs
-            encoded_sequence = tokenizer.encode(sequence_0)
+            encoded_sequence = tokenizer.encode(sequence_0, add_special_tokens=False)
             encoded_sequence_dict = tokenizer.encode_plus(sequence_0, add_special_tokens=True)
             encoded_sequence_w_special = encoded_sequence_dict["input_ids"]
             special_tokens_mask = encoded_sequence_dict["special_tokens_mask"]

@@ -294,7 +295,8 @@ class CommonTestCases:
             self.assertEqual(encoded_sequence, filtered_sequence)

             # Testing inputs pairs
-            encoded_sequence = tokenizer.encode(sequence_0) + tokenizer.encode(sequence_1)
+            encoded_sequence = tokenizer.encode(sequence_0, add_special_tokens=False) + tokenizer.encode(sequence_1,
+                                                                                                         add_special_tokens=False)
             encoded_sequence_dict = tokenizer.encode_plus(sequence_0, sequence_1, add_special_tokens=True)
             encoded_sequence_w_special = encoded_sequence_dict["input_ids"]
             special_tokens_mask = encoded_sequence_dict["special_tokens_mask"]
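Note: the last two hunks above rely on encode_plus(..., add_special_tokens=True) returning a special_tokens_mask alongside input_ids, which the test uses to strip the special tokens back out and recover the add_special_tokens=False encoding. A hedged sketch of that round trip with an illustrative model name (in later releases the mask typically has to be requested with return_special_tokens_mask=True):

from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

sequence_0 = "Encode this."
encoded_sequence = tokenizer.encode(sequence_0, add_special_tokens=False)

encoded_sequence_dict = tokenizer.encode_plus(sequence_0, add_special_tokens=True)
ids = encoded_sequence_dict["input_ids"]
mask = encoded_sequence_dict["special_tokens_mask"]

# Keep only the positions that are not special tokens.
filtered_sequence = [tok for tok, special in zip(ids, mask) if not special]
assert filtered_sequence == encoded_sequence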
transformers/tests/tokenization_xlm_test.py  ·  View file @ 7d709e55

@@ -69,8 +69,8 @@ class XLMTokenizationTest(CommonTestCases.CommonTokenizerTester):
     def test_sequence_builders(self):
         tokenizer = XLMTokenizer.from_pretrained("xlm-mlm-en-2048")

-        text = tokenizer.encode("sequence builders")
-        text_2 = tokenizer.encode("multi-sequence build")
+        text = tokenizer.encode("sequence builders", add_special_tokens=False)
+        text_2 = tokenizer.encode("multi-sequence build", add_special_tokens=False)

         encoded_sentence = tokenizer.build_inputs_with_special_tokens(text)
         encoded_pair = tokenizer.build_inputs_with_special_tokens(text, text_2)
transformers/tests/tokenization_xlnet_test.py  ·  View file @ 7d709e55

@@ -92,8 +92,8 @@ class XLNetTokenizationTest(CommonTestCases.CommonTokenizerTester):
     def test_sequence_builders(self):
         tokenizer = XLNetTokenizer.from_pretrained("xlnet-base-cased")

-        text = tokenizer.encode("sequence builders")
-        text_2 = tokenizer.encode("multi-sequence build")
+        text = tokenizer.encode("sequence builders", add_special_tokens=False)
+        text_2 = tokenizer.encode("multi-sequence build", add_special_tokens=False)

         encoded_sentence = tokenizer.build_inputs_with_special_tokens(text)
         encoded_pair = tokenizer.build_inputs_with_special_tokens(text, text_2)
transformers/tokenization_utils.py  ·  View file @ 7d709e55

@@ -691,7 +691,7 @@ class PreTrainedTokenizer(object):
     def encode(self,
                text,
                text_pair=None,
-               add_special_tokens=False,
+               add_special_tokens=True,
                max_length=None,
                stride=0,
                truncation_strategy='longest_first',

@@ -739,7 +739,7 @@ class PreTrainedTokenizer(object):
     def encode_plus(self,
                     text,
                     text_pair=None,
-                    add_special_tokens=False,
+                    add_special_tokens=True,
                     max_length=None,
                     stride=0,
                     truncation_strategy='longest_first',

@@ -794,7 +794,7 @@ class PreTrainedTokenizer(object):
                                       truncation_strategy=truncation_strategy,
                                       return_tensors=return_tensors)

-    def prepare_for_model(self, ids, pair_ids=None, max_length=None, add_special_tokens=False, stride=0,
+    def prepare_for_model(self, ids, pair_ids=None, max_length=None, add_special_tokens=True, stride=0,
                           truncation_strategy='longest_first', return_tensors=None):
         """
         Prepares a sequence of input id, or a pair of sequences of inputs ids so that it can be used by the model.
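Note: with all three defaults flipped, encode, encode_plus and prepare_for_model now agree on adding special tokens unless the caller opts out. A rough sketch of that equivalence, assuming bert-base-uncased and the API as of this commit (the dictionary keys and defaults may differ in later versions):

from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

raw_ids = tokenizer.encode("sequence builders", add_special_tokens=False)

a = tokenizer.encode("sequence builders")                    # default is now True
b = tokenizer.encode_plus("sequence builders")["input_ids"]  # default is now True
c = tokenizer.prepare_for_model(raw_ids)["input_ids"]        # default is now True

assert a == b == c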