chenpangpang / transformers · Commits

Commit bf503158
Authored Sep 19, 2019 by LysandreJik
Sentence -> Sequence. Removed output_mask from the special token addition methods.
Parent: 8cba0572
Showing 13 changed files with 49 additions and 76 deletions.
examples/run_lm_finetuning.py                                +1   -1
pytorch_transformers/tests/tokenization_bert_test.py         +2   -2
pytorch_transformers/tests/tokenization_distilbert_test.py   +2   -2
pytorch_transformers/tests/tokenization_roberta_test.py      +2   -2
pytorch_transformers/tests/tokenization_tests_commons.py     +14  -14
pytorch_transformers/tests/tokenization_xlm_test.py          +2   -2
pytorch_transformers/tests/tokenization_xlnet_test.py        +2   -2
pytorch_transformers/tokenization_bert.py                    +4   -9
pytorch_transformers/tokenization_distilbert.py              +2   -2
pytorch_transformers/tokenization_roberta.py                 +3   -9
pytorch_transformers/tokenization_utils.py                   +9   -12
pytorch_transformers/tokenization_xlm.py                     +3   -10
pytorch_transformers/tokenization_xlnet.py                   +3   -9
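Taken together, these changes rename the two special-token helpers on every tokenizer (add_special_tokens_single_sentence becomes add_special_tokens_single_sequence, add_special_tokens_sentences_pair becomes add_special_tokens_sequence_pair) and drop the output_mask argument from most of them. A minimal migration sketch, assuming a pytorch_transformers install at this commit and the bert-base-uncased checkpoint; the variable names are illustrative only:

from pytorch_transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
ids_a = tokenizer.encode("sequence builders")     # token ids, no special tokens yet
ids_b = tokenizer.encode("multi-sequence build")

# Old names (removed by this commit):
#   tokenizer.add_special_tokens_single_sentence(ids_a)
#   tokenizer.add_special_tokens_sentences_pair(ids_a, ids_b, output_mask=True)
# New names:
single = tokenizer.add_special_tokens_single_sequence(ids_a)      # [CLS] A [SEP]
pair = tokenizer.add_special_tokens_sequence_pair(ids_a, ids_b)   # [CLS] A [SEP] B [SEP]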
examples/run_lm_finetuning.py
@@ -75,7 +75,7 @@ class TextDataset(Dataset):
            tokenized_text = tokenizer.convert_tokens_to_ids(tokenizer.tokenize(text))

            while len(tokenized_text) >= block_size:  # Truncate in block of block_size
-               self.examples.append(tokenizer.add_special_tokens_single_sentence(tokenized_text[:block_size]))
+               self.examples.append(tokenizer.add_special_tokens_single_sequence(tokenized_text[:block_size]))
                tokenized_text = tokenized_text[block_size:]
            # Note that we are loosing the last truncated example here for the sake of simplicity (no padding)
            # If your dataset is small, first you should loook for a bigger one :-) and second you
pytorch_transformers/tests/tokenization_bert_test.py
@@ -131,8 +131,8 @@ class BertTokenizationTest(CommonTestCases.CommonTokenizerTester):
        text = tokenizer.encode("sequence builders")
        text_2 = tokenizer.encode("multi-sequence build")

-       encoded_sentence = tokenizer.add_special_tokens_single_sentence(text)
-       encoded_pair = tokenizer.add_special_tokens_sentences_pair(text, text_2)
+       encoded_sentence = tokenizer.add_special_tokens_single_sequence(text)
+       encoded_pair = tokenizer.add_special_tokens_sequence_pair(text, text_2)

        assert encoded_sentence == [101] + text + [102]
        assert encoded_pair == [101] + text + [102] + text_2 + [102]
pytorch_transformers/tests/tokenization_distilbert_test.py
@@ -36,8 +36,8 @@ class DistilBertTokenizationTest(BertTokenizationTest):
        text = tokenizer.encode("sequence builders")
        text_2 = tokenizer.encode("multi-sequence build")

-       encoded_sentence = tokenizer.add_special_tokens_single_sentence(text)
-       encoded_pair = tokenizer.add_special_tokens_sentences_pair(text, text_2)
+       encoded_sentence = tokenizer.add_special_tokens_single_sequence(text)
+       encoded_pair = tokenizer.add_special_tokens_sequence_pair(text, text_2)

        assert encoded_sentence == text
        assert encoded_pair == text + [102] + text_2
pytorch_transformers/tests/tokenization_roberta_test.py
@@ -87,8 +87,8 @@ class RobertaTokenizationTest(CommonTestCases.CommonTokenizerTester):
        encoded_text_from_decode = tokenizer.encode("sequence builders", add_special_tokens=True)
        encoded_pair_from_decode = tokenizer.encode("sequence builders", "multi-sequence build", add_special_tokens=True)

-       encoded_sentence = tokenizer.add_special_tokens_single_sentence(text)
-       encoded_pair = tokenizer.add_special_tokens_sentences_pair(text, text_2)
+       encoded_sentence = tokenizer.add_special_tokens_single_sequence(text)
+       encoded_pair = tokenizer.add_special_tokens_sequence_pair(text, text_2)

        assert encoded_sentence == encoded_text_from_decode
        assert encoded_pair == encoded_pair_from_decode
pytorch_transformers/tests/tokenization_tests_commons.py
@@ -187,18 +187,18 @@ class CommonTestCases:
                    self.assertListEqual(weights_list, weights_list_2)

-   def test_mask_output(self):
-       if sys.version_info <= (3, 0):
-           return
-
-       tokenizer = self.get_tokenizer()
-
-       if tokenizer.add_special_tokens_sentences_pair.__qualname__.split('.')[0] != "PreTrainedTokenizer":
-           seq_0 = "Test this method."
-           seq_1 = "With these inputs."
-           information = tokenizer.encode_plus(seq_0, seq_1, add_special_tokens=True, output_mask=True)
-           sequences, mask = information["sequence"], information["mask"]
-           assert len(sequences) == len(mask)
+   # def test_mask_output(self):
+   #     if sys.version_info <= (3, 0):
+   #         return
+   #
+   #     tokenizer = self.get_tokenizer()
+   #
+   #     if tokenizer.add_special_tokens_sequence_pair.__qualname__.split('.')[0] != "PreTrainedTokenizer":
+   #         seq_0 = "Test this method."
+   #         seq_1 = "With these inputs."
+   #         information = tokenizer.encode_plus(seq_0, seq_1, add_special_tokens=True, output_mask=True)
+   #         sequences, mask = information["sequence"], information["mask"]
+   #         assert len(sequences) == len(mask)

    def test_number_of_added_tokens(self):
        tokenizer = self.get_tokenizer()

@@ -228,7 +228,7 @@ class CommonTestCases:
        assert len(overflowing_tokens) == 2
        assert len(truncated_sequence) == total_length - 2
-       assert truncated_sequence == tokenizer.add_special_tokens_single_sentence(sequence[:-2])
+       assert truncated_sequence == tokenizer.add_special_tokens_single_sequence(sequence[:-2])

    def test_maximum_encoding_length_pair_input(self):
        tokenizer = self.get_tokenizer()

@@ -237,7 +237,7 @@ class CommonTestCases:
        seq_1 = "This is another sentence to be encoded."
        sequence = tokenizer.encode(seq_0, seq_1, add_special_tokens=True)
-       truncated_second_sequence = tokenizer.add_special_tokens_sentences_pair(
+       truncated_second_sequence = tokenizer.add_special_tokens_sequence_pair(
            tokenizer.encode(seq_0),
            tokenizer.encode(seq_1)[:-2]
        )
pytorch_transformers/tests/tokenization_xlm_test.py
@@ -72,8 +72,8 @@ class XLMTokenizationTest(CommonTestCases.CommonTokenizerTester):
        text = tokenizer.encode("sequence builders")
        text_2 = tokenizer.encode("multi-sequence build")

-       encoded_sentence = tokenizer.add_special_tokens_single_sentence(text)
-       encoded_pair = tokenizer.add_special_tokens_sentences_pair(text, text_2)
+       encoded_sentence = tokenizer.add_special_tokens_single_sequence(text)
+       encoded_pair = tokenizer.add_special_tokens_sequence_pair(text, text_2)

        assert encoded_sentence == [1] + text + [1]
        assert encoded_pair == [1] + text + [1] + text_2 + [1]
pytorch_transformers/tests/tokenization_xlnet_test.py
@@ -95,8 +95,8 @@ class XLNetTokenizationTest(CommonTestCases.CommonTokenizerTester):
        text = tokenizer.encode("sequence builders")
        text_2 = tokenizer.encode("multi-sequence build")

-       encoded_sentence = tokenizer.add_special_tokens_single_sentence(text)
-       encoded_pair = tokenizer.add_special_tokens_sentences_pair(text, text_2)
+       encoded_sentence = tokenizer.add_special_tokens_single_sequence(text)
+       encoded_pair = tokenizer.add_special_tokens_sequence_pair(text, text_2)

        assert encoded_sentence == text + [4, 3]
        assert encoded_pair == text + [4] + text_2 + [4, 3]
pytorch_transformers/tokenization_bert.py
@@ -187,26 +187,21 @@ class BertTokenizer(PreTrainedTokenizer):
        out_string = ' '.join(tokens).replace(' ##', '').strip()
        return out_string

-   def add_special_tokens_single_sentence(self, token_ids):
+   def add_special_tokens_single_sequence(self, token_ids):
        """
        Adds special tokens to the a sequence for sequence classification tasks.
        A BERT sequence has the following format: [CLS] X [SEP]
        """
        return [self.cls_token_id] + token_ids + [self.sep_token_id]

-   def add_special_tokens_sentences_pair(self, token_ids_0, token_ids_1, output_mask=False):
+   def add_special_tokens_sequence_pair(self, token_ids_0, token_ids_1):
        """
        Adds special tokens to a sequence pair for sequence classification tasks.
        A BERT sequence pair has the following format: [CLS] A [SEP] B [SEP]
        """
        sep = [self.sep_token_id]
        cls = [self.cls_token_id]
-       if output_mask:
-           return (
-               cls + token_ids_0 + sep + token_ids_1 + sep,
-               [0] * len(cls + token_ids_0 + sep) + [1] * len(token_ids_1 + sep)
-           )
-       else:
-           return cls + token_ids_0 + sep + token_ids_1 + sep
+       return cls + token_ids_0 + sep + token_ids_1 + sep

    def save_vocabulary(self, vocab_path):
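Callers that relied on output_mask=True for the segment mask can rebuild the same list from the ids they already have. A sketch of that workaround, assuming the bert-base-uncased checkpoint; it reproduces the tuple the deleted branch used to return:

from pytorch_transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
ids_a = tokenizer.encode("sequence builders")
ids_b = tokenizer.encode("multi-sequence build")

sep = [tokenizer.sep_token_id]
cls = [tokenizer.cls_token_id]
sequence = tokenizer.add_special_tokens_sequence_pair(ids_a, ids_b)   # [CLS] A [SEP] B [SEP]
mask = [0] * len(cls + ids_a + sep) + [1] * len(ids_b + sep)          # values the removed output_mask branch produced
assert len(sequence) == len(mask)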
pytorch_transformers/tokenization_distilbert.py
@@ -61,10 +61,10 @@ class DistilBertTokenizer(BertTokenizer):
    pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
    max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES

-   def add_special_tokens_single_sentence(self, token_ids):
+   def add_special_tokens_single_sequence(self, token_ids):
        return token_ids

-   def add_special_tokens_sentences_pair(self, token_ids_0, token_ids_1, output_mask=False):
+   def add_special_tokens_sequence_pair(self, token_ids_0, token_ids_1, output_mask=False):
        sep = [self.sep_token_id]
        if output_mask:
            return (
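Note that DistilBertTokenizer keeps its own overrides here (the pair method even retains output_mask): at this commit the single-sequence helper returns the ids unchanged and the pair helper only inserts one [SEP] between the sequences, matching the DistilBERT test assertions above. A small sketch, assuming the distilbert-base-uncased checkpoint and that DistilBertTokenizer is importable from the package root:

from pytorch_transformers import DistilBertTokenizer

tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")
ids_a = tokenizer.encode("sequence builders")
ids_b = tokenizer.encode("multi-sequence build")

# No [CLS]/[SEP] wrapping for a single sequence at this commit:
assert tokenizer.add_special_tokens_single_sequence(ids_a) == ids_a
# Only a single [SEP] between the two sequences of a pair:
assert tokenizer.add_special_tokens_sequence_pair(ids_a, ids_b) == ids_a + [tokenizer.sep_token_id] + ids_b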
pytorch_transformers/tokenization_roberta.py
@@ -81,24 +81,18 @@ class RobertaTokenizer(GPT2Tokenizer):
                                               sep_token=sep_token, cls_token=cls_token,
                                               pad_token=pad_token, mask_token=mask_token, **kwargs)

-   def add_special_tokens_single_sentence(self, token_ids):
+   def add_special_tokens_single_sequence(self, token_ids):
        """
        Adds special tokens to a sequence for sequence classification tasks.
        A RoBERTa sequence has the following format: <s> X </s>
        """
        return [self.cls_token_id] + token_ids + [self.sep_token_id]

-   def add_special_tokens_sentences_pair(self, token_ids_0, token_ids_1, output_mask=False):
+   def add_special_tokens_sequence_pair(self, token_ids_0, token_ids_1):
        """
        Adds special tokens to a sequence pair for sequence classification tasks.
        A RoBERTa sequence pair has the following format: <s> A </s></s> B </s>
        """
        sep = [self.sep_token_id]
        cls = [self.cls_token_id]
-       if output_mask:
-           return (
-               cls + token_ids_0 + sep + sep + token_ids_1 + sep,
-               [0] * len(cls + token_ids_0 + sep + sep) + [1] * len(token_ids_1 + sep)
-           )
-       else:
-           return cls + token_ids_0 + sep + sep + token_ids_1 + sep
+       return cls + token_ids_0 + sep + sep + token_ids_1 + sep
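RoBERTa differs from BERT only in the doubled separator between the two sequences, as the docstring above states (<s> A </s></s> B </s>). A short sketch of the renamed pair method, assuming the roberta-base checkpoint:

from pytorch_transformers import RobertaTokenizer

tokenizer = RobertaTokenizer.from_pretrained("roberta-base")
ids_a = tokenizer.encode("sequence builders")
ids_b = tokenizer.encode("multi-sequence build")

pair = tokenizer.add_special_tokens_sequence_pair(ids_a, ids_b)
# <s> A </s></s> B </s>
assert pair == [tokenizer.cls_token_id] + ids_a + [tokenizer.sep_token_id] * 2 + ids_b + [tokenizer.sep_token_id]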
pytorch_transformers/tokenization_utils.py
@@ -708,7 +708,7 @@ class PreTrainedTokenizer(object):
        if text_pair is None:
            if add_special_tokens:
                sequence_tokens = self.convert_tokens_to_ids(self.tokenize(text, **kwargs))
-               return self.add_special_tokens_single_sentence(sequence_tokens)
+               return self.add_special_tokens_single_sequence(sequence_tokens)
            else:
                ids = self.convert_tokens_to_ids(self.tokenize(text, **kwargs))
                return ids

@@ -717,7 +717,7 @@ class PreTrainedTokenizer(object):
        second_sentence_tokens = [self._convert_token_to_id(token) for token in self.tokenize(text_pair, **kwargs)]

        if add_special_tokens:
-           return self.add_special_tokens_sentences_pair(first_sentence_tokens, second_sentence_tokens)
+           return self.add_special_tokens_sequence_pair(first_sentence_tokens, second_sentence_tokens)
        else:
            logger.warning("No special tokens were added. The two sequences have been concatenated.")
            return first_sentence_tokens + second_sentence_tokens

@@ -747,7 +747,7 @@ class PreTrainedTokenizer(object):
            if max_length:
                information["overflowing_tokens"] = sequence_tokens[max_length - n_added_tokens:]
                sequence_tokens = sequence_tokens[:max_length - n_added_tokens]
-           sequence = self.add_special_tokens_single_sentence(sequence_tokens)
+           sequence = self.add_special_tokens_single_sequence(sequence_tokens)
        else:
            sequence_tokens = self.convert_tokens_to_ids(self.tokenize(text, **kwargs))
            if max_length:

@@ -774,16 +774,13 @@ class PreTrainedTokenizer(object):
                information["overflowing_tokens"] = second_sentence_tokens[max_length - f_len - n_added_tokens:]
                second_sentence_tokens = second_sentence_tokens[:max_length - f_len - n_added_tokens]

-           encoded_sequence = self.add_special_tokens_sentences_pair(first_sentence_tokens,
-                                                                     second_sentence_tokens,
-                                                                     output_mask)
-           if output_mask:
-               sequence, information["mask"] = encoded_sequence
-           else:
-               sequence = encoded_sequence
+           sequence = self.add_special_tokens_sequence_pair(first_sentence_tokens,
+                                                            second_sentence_tokens)
+           # if output_mask:
+           #     sequence, information["mask"] = encoded_sequence

            information["sequence"] = sequence
        else:

@@ -800,11 +797,11 @@ class PreTrainedTokenizer(object):
        return information

-   def add_special_tokens_single_sentence(self, token_ids):
+   def add_special_tokens_single_sequence(self, token_ids):
        logger.warning("This tokenizer does not make use of special tokens. The sequence has been returned with no modification.")
        return token_ids

-   def add_special_tokens_sentences_pair(self, token_ids_0, token_ids_1, output_mask=False):
+   def add_special_tokens_sequence_pair(self, token_ids_0, token_ids_1):
        logger.warning("This tokenizer does not make use of special tokens. The two sequences have been concatenated.")
        return token_ids_0 + token_ids_1
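These base-class changes are what encode and encode_plus go through when add_special_tokens=True, so calling the renamed pair helper by hand should match the one-shot encode; a sketch under that assumption, mirroring the RoBERTa test above but using the bert-base-uncased checkpoint:

from pytorch_transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
seq_0 = "Test this method."
seq_1 = "With these inputs."

via_encode = tokenizer.encode(seq_0, seq_1, add_special_tokens=True)
via_helper = tokenizer.add_special_tokens_sequence_pair(tokenizer.encode(seq_0),
                                                        tokenizer.encode(seq_1))
assert via_encode == via_helper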
pytorch_transformers/tokenization_xlm.py
@@ -754,27 +754,20 @@ class XLMTokenizer(PreTrainedTokenizer):
        out_string = ''.join(tokens).replace('</w>', ' ').strip()
        return out_string

-   def add_special_tokens_single_sentence(self, token_ids):
+   def add_special_tokens_single_sequence(self, token_ids):
        """
        Adds special tokens to a sequence for sequence classification tasks.
        An XLM sequence has the following format: [CLS] X [SEP]
        """
        return [self.cls_token_id] + token_ids + [self.sep_token_id]

-   def add_special_tokens_sentences_pair(self, token_ids_0, token_ids_1, output_mask=False):
+   def add_special_tokens_sequence_pair(self, token_ids_0, token_ids_1):
        """
        Adds special tokens to a sequence pair for sequence classification tasks.
        An XLM sequence pair has the following format: [CLS] A [SEP] B [SEP]
        """
        sep = [self.sep_token_id]
        cls = [self.cls_token_id]
-       if output_mask:
-           return (
-               cls + token_ids_0 + sep + token_ids_1 + sep,
-               [0] * len(cls + token_ids_0 + sep) + [1] * len(token_ids_1 + sep)
-           )
-       else:
-           return cls + token_ids_0 + sep + token_ids_1 + sep
+       return cls + token_ids_0 + sep + token_ids_1 + sep

    def save_vocabulary(self, save_directory):
pytorch_transformers/tokenization_xlnet.py
@@ -181,7 +181,7 @@ class XLNetTokenizer(PreTrainedTokenizer):
        out_string = ''.join(tokens).replace(SPIECE_UNDERLINE, ' ').strip()
        return out_string

-   def add_special_tokens_single_sentence(self, token_ids):
+   def add_special_tokens_single_sequence(self, token_ids):
        """
        Adds special tokens to a sequence pair for sequence classification tasks.
        An XLNet sequence pair has the following format: A [SEP] B [SEP][CLS]

@@ -190,7 +190,7 @@ class XLNetTokenizer(PreTrainedTokenizer):
        cls = [self.cls_token_id]
        return token_ids + sep + cls

-   def add_special_tokens_sentences_pair(self, token_ids_0, token_ids_1, output_mask=False):
+   def add_special_tokens_sequence_pair(self, token_ids_0, token_ids_1):
        """
        Adds special tokens to a sequence for sequence classification tasks.
        An XLNet sequence has the following format: X [SEP][CLS]

@@ -199,12 +199,6 @@ class XLNetTokenizer(PreTrainedTokenizer):
        sep = [self.sep_token_id]
        cls = [self.cls_token_id]
        cls_segment_ids = [2]
-       if output_mask:
-           return (
-               token_ids_0 + sep + token_ids_1 + sep + cls,
-               [0] * len(token_ids_0 + sep) + [1] * len(token_ids_1 + sep) + cls_segment_ids
-           )
-       else:
-           return token_ids_0 + sep + token_ids_1 + sep + cls
+       return token_ids_0 + sep + token_ids_1 + sep + cls

    def save_vocabulary(self, save_directory):
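XLNet is the one tokenizer in this commit that appends its special tokens at the end of the sequence instead of wrapping it. A sketch of the renamed methods, assuming the xlnet-base-cased checkpoint; the expected layouts follow the method bodies shown above:

from pytorch_transformers import XLNetTokenizer

tokenizer = XLNetTokenizer.from_pretrained("xlnet-base-cased")
ids_a = tokenizer.encode("sequence builders")
ids_b = tokenizer.encode("multi-sequence build")

sep = [tokenizer.sep_token_id]
cls = [tokenizer.cls_token_id]
assert tokenizer.add_special_tokens_single_sequence(ids_a) == ids_a + sep + cls                     # X [SEP][CLS]
assert tokenizer.add_special_tokens_sequence_pair(ids_a, ids_b) == ids_a + sep + ids_b + sep + cls  # A [SEP] B [SEP][CLS]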