chenpangpang / transformers · Commits

Commit e391d473, authored Sep 02, 2019 by LysandreJik

Tokenizers' encode function can output binary masks

parent 0d1dad6d
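
In short: encode() on a tokenizer can now return a binary segment mask alongside the token ids when encoding a sentence pair. A minimal usage sketch, assuming the pytorch_transformers package from this repository; the checkpoint name and sentences are illustrative, not part of the commit:

    # Hedged sketch of the new output_mask flag on encode().
    # "bert-base-uncased" and the input sentences are example values only.
    from pytorch_transformers import BertTokenizer

    tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
    ids, mask = tokenizer.encode("First sentence.", "Second sentence.",
                                 add_special_tokens=True, output_mask=True)
    # mask is 0 over "[CLS] First ... [SEP]" and 1 over "Second ... [SEP]"
    assert len(ids) == len(mask)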
Showing 5 changed files with 39 additions and 11 deletions (+39 / -11)
pytorch_transformers/tokenization_bert.py      +8  -2
pytorch_transformers/tokenization_roberta.py   +8  -2
pytorch_transformers/tokenization_utils.py     +5  -3
pytorch_transformers/tokenization_xlm.py       +9  -2
pytorch_transformers/tokenization_xlnet.py     +9  -2
pytorch_transformers/tokenization_bert.py
...
...
@@ -194,14 +194,20 @@ class BertTokenizer(PreTrainedTokenizer):
         """
         return [self.cls_token_id] + token_ids + [self.sep_token_id]
 
-    def add_special_tokens_sentences_pair(self, token_ids_0, token_ids_1):
+    def add_special_tokens_sentences_pair(self, token_ids_0, token_ids_1, output_mask=False):
         """
         Adds special tokens to a sequence pair for sequence classification tasks.
         A BERT sequence pair has the following format: [CLS] A [SEP] B [SEP]
         """
         sep = [self.sep_token_id]
         cls = [self.cls_token_id]
-        return cls + token_ids_0 + sep + token_ids_1 + sep
+        if output_mask:
+            return (cls + token_ids_0 + sep + token_ids_1 + sep,
+                    [0] * len(cls + token_ids_0 + sep) + [1] * len(token_ids_1 + sep))
+        else:
+            return cls + token_ids_0 + sep + token_ids_1 + sep
 
     def save_vocabulary(self, vocab_path):
         """Save the tokenizer vocabulary to a directory or file."""
...
...
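
To make the new return value concrete, here is a small self-contained sketch of the BERT pair layout; only the structure comes from the diff above, and the token ids for A and B are invented:

    # BERT pair layout: [CLS] A [SEP] B [SEP]
    cls, sep = [101], [102]       # bert-base-uncased's [CLS] and [SEP] ids
    a, b = [7592], [2129, 2024]   # invented token ids for segments A and B
    ids = cls + a + sep + b + sep
    mask = [0] * len(cls + a + sep) + [1] * len(b + sep)
    assert ids == [101, 7592, 102, 2129, 2024, 102]
    assert mask == [0, 0, 0, 1, 1, 1]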
pytorch_transformers/tokenization_roberta.py
...
...
@@ -88,11 +88,17 @@ class RobertaTokenizer(GPT2Tokenizer):
         """
         return [self.cls_token_id] + token_ids + [self.sep_token_id]
 
-    def add_special_tokens_sentences_pair(self, token_ids_0, token_ids_1):
+    def add_special_tokens_sentences_pair(self, token_ids_0, token_ids_1, output_mask=False):
         """
         Adds special tokens to a sequence pair for sequence classification tasks.
         A RoBERTa sequence pair has the following format: <s> A </s></s> B </s>
         """
         sep = [self.sep_token_id]
         cls = [self.cls_token_id]
-        return cls + token_ids_0 + sep + sep + token_ids_1 + sep
+        if output_mask:
+            return (cls + token_ids_0 + sep + sep + token_ids_1 + sep,
+                    [0] * len(cls + token_ids_0 + sep) + [1] * len(sep + token_ids_1 + sep))
+        else:
+            return cls + token_ids_0 + sep + sep + token_ids_1 + sep
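
RoBERTa is the one format here with a doubled separator, and the diff counts that second </s> in the B half of the mask. A sketch with invented segment ids (0 and 2 are roberta-base's <s> and </s>):

    # RoBERTa pair layout: <s> A </s></s> B </s>
    cls, sep = [0], [2]
    a, b = [100, 101], [200]      # invented token ids for segments A and B
    ids = cls + a + sep + sep + b + sep
    mask = [0] * len(cls + a + sep) + [1] * len(sep + b + sep)
    assert ids == [0, 100, 101, 2, 2, 200, 2]
    assert mask == [0, 0, 0, 0, 1, 1, 1]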
pytorch_transformers/tokenization_utils.py
...
...
@@ -663,7 +663,7 @@ class PreTrainedTokenizer(object):
     def _convert_token_to_id(self, token):
         raise NotImplementedError
 
-    def encode(self, text, text_pair=None, add_special_tokens=False, **kwargs):
+    def encode(self, text, text_pair=None, add_special_tokens=False, output_mask=False, **kwargs):
         """
         Converts a string in a sequence of ids (integer), using the tokenizer and vocabulary.
...
...
@@ -674,6 +674,8 @@ class PreTrainedTokenizer(object):
             text_pair: Optional second sequence to be encoded.
             add_special_tokens: if set to ``True``, the sequences will be encoded with the special tokens relative
                 to their model.
+            output_mask: if set to ``True``, returns the text pair corresponding mask with 0 for the first sequence,
+                and 1 for the second.
             **kwargs: passed to the `self.tokenize()` method
         """
         if text_pair is None:
...
...
@@ -686,7 +688,7 @@ class PreTrainedTokenizer(object):
         second_sentence_tokens = [self._convert_token_to_id(token) for token in self.tokenize(text_pair, **kwargs)]
 
         if add_special_tokens:
-            return self.add_special_tokens_sentences_pair(first_sentence_tokens, second_sentence_tokens)
+            return self.add_special_tokens_sentences_pair(first_sentence_tokens, second_sentence_tokens, output_mask)
         else:
             return first_sentence_tokens, second_sentence_tokens
...
...
@@ -694,7 +696,7 @@ class PreTrainedTokenizer(object):
         logger.warning("This tokenizer does not make use of special tokens. The sequence has been returned with no modification.")
         return token_ids
 
-    def add_special_tokens_sentences_pair(self, token_ids_0, token_ids_1):
+    def add_special_tokens_sentences_pair(self, token_ids_0, token_ids_1, output_mask=False):
         logger.warning("This tokenizer does not make use of special tokens. The two sequences have been concatenated.")
         return token_ids_0 + token_ids_1
...
...
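
The base-class change only threads the flag through: note that the fallback add_special_tokens_sentences_pair accepts output_mask but still returns a plain concatenation, so tokenizers without special tokens never produce a mask. A sketch of the shapes encode() can now return for a pair (checkpoint and sentences are example values):

    from pytorch_transformers import BertTokenizer

    tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

    # Pair without special tokens: a (first_ids, second_ids) tuple.
    first, second = tokenizer.encode("Sentence A.", "Sentence B.")

    # Pair with special tokens, no mask: a single list of ids.
    ids = tokenizer.encode("Sentence A.", "Sentence B.", add_special_tokens=True)

    # Pair with special tokens and the new flag: (ids, mask).
    ids, mask = tokenizer.encode("Sentence A.", "Sentence B.",
                                 add_special_tokens=True, output_mask=True)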
pytorch_transformers/tokenization_xlm.py
...
...
@@ -761,14 +761,21 @@ class XLMTokenizer(PreTrainedTokenizer):
         """
         return [self.cls_token_id] + token_ids + [self.sep_token_id]
 
-    def add_special_tokens_sentences_pair(self, token_ids_0, token_ids_1):
+    def add_special_tokens_sentences_pair(self, token_ids_0, token_ids_1, output_mask=False):
         """
         Adds special tokens to a sequence pair for sequence classification tasks.
         An XLM sequence pair has the following format: [CLS] A [SEP] B [SEP]
         """
         sep = [self.sep_token_id]
         cls = [self.cls_token_id]
-        return cls + token_ids_0 + sep + token_ids_1 + sep
+        if output_mask:
+            return (cls + token_ids_0 + sep + token_ids_1 + sep,
+                    [0] * len(cls + token_ids_0 + sep) + [1] * len(token_ids_1 + sep))
+        else:
+            return cls + token_ids_0 + sep + token_ids_1 + sep
 
     def save_vocabulary(self, save_directory):
         """Save the tokenizer vocabulary and merge files to a directory."""
...
...
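
XLM's layout matches BERT's; the invariant worth noting in the diff is that the two mask halves exactly partition the full sequence. A quick arithmetic check with invented segment lengths:

    # [CLS] A [SEP] B [SEP]: the mask halves partition the sequence.
    len_a, len_b = 5, 7                        # invented segment lengths
    total_len = 1 + len_a + 1 + len_b + 1      # cls + A + sep + B + sep
    mask_len = (1 + len_a + 1) + (len_b + 1)   # [0]*... + [1]*...
    assert total_len == mask_len == 15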
pytorch_transformers/tokenization_xlnet.py
...
...
@@ -190,14 +190,21 @@ class XLNetTokenizer(PreTrainedTokenizer):
         cls = [self.cls_token_id]
         return token_ids + sep + cls
 
-    def add_special_tokens_sentences_pair(self, token_ids_0, token_ids_1):
+    def add_special_tokens_sentences_pair(self, token_ids_0, token_ids_1, output_mask=False):
         """
         Adds special tokens to a sequence for sequence classification tasks.
         An XLNet sequence has the following format: X [SEP][CLS]
         """
         sep = [self.sep_token_id]
         cls = [self.cls_token_id]
-        return token_ids_0 + sep + token_ids_1 + sep + cls
+        if output_mask:
+            return (token_ids_0 + sep + token_ids_1 + sep + cls,
+                    [0] * len(token_ids_0 + sep) + [1] * len(token_ids_1 + sep + cls))
+        else:
+            return token_ids_0 + sep + token_ids_1 + sep + cls
 
     def save_vocabulary(self, save_directory):
         """ Save the sentencepiece vocabulary (copy original file) and special tokens file
...
...
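
XLNet places its special tokens at the end, so the trailing [SEP] and [CLS] both land in the 1 half of the mask. A sketch with invented ids (the sep and cls values are placeholders, not taken from a real vocabulary):

    # XLNet pair layout: A [SEP] B [SEP] [CLS]
    sep, cls = [4], [3]            # placeholder special-token ids
    a, b = [10, 11], [20, 21, 22]  # invented token ids for segments A and B
    ids = a + sep + b + sep + cls
    mask = [0] * len(a + sep) + [1] * len(b + sep + cls)
    assert ids == [10, 11, 4, 20, 21, 22, 4, 3]
    assert mask == [0, 0, 0, 1, 1, 1, 1, 1]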