Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
chenpangpang
transformers
Commits
d340e232
Commit
d340e232
authored
Sep 24, 2019
by
LysandreJik
Browse files
create_mask_from_sequences -> create_token_type_ids_from_sequences
parent
c832f43a
Changes
6
Hide whitespace changes
Inline
Side-by-side
Showing
6 changed files
with
6 additions
and
17 deletions
+6
-17
pytorch_transformers/tokenization_bert.py
pytorch_transformers/tokenization_bert.py
+1
-1
pytorch_transformers/tokenization_distilbert.py
pytorch_transformers/tokenization_distilbert.py
+0
-11
pytorch_transformers/tokenization_roberta.py
pytorch_transformers/tokenization_roberta.py
+1
-1
pytorch_transformers/tokenization_utils.py
pytorch_transformers/tokenization_utils.py
+2
-2
pytorch_transformers/tokenization_xlm.py
pytorch_transformers/tokenization_xlm.py
+1
-1
pytorch_transformers/tokenization_xlnet.py
pytorch_transformers/tokenization_xlnet.py
+1
-1
No files found.
pytorch_transformers/tokenization_bert.py
View file @
d340e232
...
@@ -204,7 +204,7 @@ class BertTokenizer(PreTrainedTokenizer):
...
@@ -204,7 +204,7 @@ class BertTokenizer(PreTrainedTokenizer):
return
cls
+
token_ids_0
+
sep
+
token_ids_1
+
sep
return
cls
+
token_ids_0
+
sep
+
token_ids_1
+
sep
def
create_
mask
_from_sequences
(
self
,
sequence_0
,
sequence_1
):
def
create_
token_type_ids
_from_sequences
(
self
,
sequence_0
,
sequence_1
):
"""
"""
Creates a mask from the two sequences passed to be used in a sequence-pair classification task.
Creates a mask from the two sequences passed to be used in a sequence-pair classification task.
A BERT sequence pair mask has the following format:
A BERT sequence pair mask has the following format:
...
...
pytorch_transformers/tokenization_distilbert.py
View file @
d340e232
...
@@ -67,14 +67,3 @@ class DistilBertTokenizer(BertTokenizer):
...
@@ -67,14 +67,3 @@ class DistilBertTokenizer(BertTokenizer):
def
add_special_tokens_sequence_pair
(
self
,
token_ids_0
,
token_ids_1
):
def
add_special_tokens_sequence_pair
(
self
,
token_ids_0
,
token_ids_1
):
sep
=
[
self
.
sep_token_id
]
sep
=
[
self
.
sep_token_id
]
return
token_ids_0
+
sep
+
token_ids_1
return
token_ids_0
+
sep
+
token_ids_1
def
create_mask_from_sequences
(
self
,
sequence_0
,
sequence_1
):
"""
Creates a mask from the two sequences passed to be used in a sequence-pair classification task.
A BERT sequence pair mask has the following format:
0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1
| first sequence | second sequence
"""
sep
=
[
self
.
sep_token_id
]
return
len
(
self
.
encode
(
sequence_0
)
+
sep
)
*
[
0
]
+
len
(
self
.
encode
(
sequence_1
))
*
[
1
]
pytorch_transformers/tokenization_roberta.py
View file @
d340e232
...
@@ -97,7 +97,7 @@ class RobertaTokenizer(GPT2Tokenizer):
...
@@ -97,7 +97,7 @@ class RobertaTokenizer(GPT2Tokenizer):
cls
=
[
self
.
cls_token_id
]
cls
=
[
self
.
cls_token_id
]
return
cls
+
token_ids_0
+
sep
+
sep
+
token_ids_1
+
sep
return
cls
+
token_ids_0
+
sep
+
sep
+
token_ids_1
+
sep
def
create_
mask
_from_sequences
(
self
,
sequence_0
,
sequence_1
):
def
create_
token_type_ids
_from_sequences
(
self
,
sequence_0
,
sequence_1
):
"""
"""
Creates a mask from the two sequences passed to be used in a sequence-pair classification task.
Creates a mask from the two sequences passed to be used in a sequence-pair classification task.
A RoBERTa sequence pair mask has the following format:
A RoBERTa sequence pair mask has the following format:
...
...
pytorch_transformers/tokenization_utils.py
View file @
d340e232
...
@@ -780,7 +780,7 @@ class PreTrainedTokenizer(object):
...
@@ -780,7 +780,7 @@ class PreTrainedTokenizer(object):
)
)
if
output_token_type
:
if
output_token_type
:
information
[
"token_type_ids"
]
=
self
.
create_
mask
_from_sequences
(
text
,
text_pair
)
information
[
"token_type_ids"
]
=
self
.
create_
token_type_ids
_from_sequences
(
text
,
text_pair
)
else
:
else
:
logger
.
warning
(
"No special tokens were added. The two sequences have been concatenated."
)
logger
.
warning
(
"No special tokens were added. The two sequences have been concatenated."
)
sequence
=
first_sentence_tokens
+
second_sentence_tokens
sequence
=
first_sentence_tokens
+
second_sentence_tokens
...
@@ -863,7 +863,7 @@ class PreTrainedTokenizer(object):
...
@@ -863,7 +863,7 @@ class PreTrainedTokenizer(object):
return
information
return
information
def
create_
mask
_from_sequences
(
self
,
sequence_0
,
sequence_1
):
def
create_
token_type_ids
_from_sequences
(
self
,
sequence_0
,
sequence_1
):
logger
.
warning
(
"This tokenizer does not make use of special tokens."
)
logger
.
warning
(
"This tokenizer does not make use of special tokens."
)
return
[
0
]
*
len
(
self
.
encode
(
sequence_0
))
+
[
1
]
*
len
(
self
.
encode
(
sequence_1
))
return
[
0
]
*
len
(
self
.
encode
(
sequence_0
))
+
[
1
]
*
len
(
self
.
encode
(
sequence_1
))
...
...
pytorch_transformers/tokenization_xlm.py
View file @
d340e232
...
@@ -770,7 +770,7 @@ class XLMTokenizer(PreTrainedTokenizer):
...
@@ -770,7 +770,7 @@ class XLMTokenizer(PreTrainedTokenizer):
cls
=
[
self
.
cls_token_id
]
cls
=
[
self
.
cls_token_id
]
return
cls
+
token_ids_0
+
sep
+
token_ids_1
+
sep
return
cls
+
token_ids_0
+
sep
+
token_ids_1
+
sep
def
create_
mask
_from_sequences
(
self
,
sequence_0
,
sequence_1
):
def
create_
token_type_ids
_from_sequences
(
self
,
sequence_0
,
sequence_1
):
"""
"""
Creates a mask from the two sequences passed to be used in a sequence-pair classification task.
Creates a mask from the two sequences passed to be used in a sequence-pair classification task.
An XLM sequence pair mask has the following format:
An XLM sequence pair mask has the following format:
...
...
pytorch_transformers/tokenization_xlnet.py
View file @
d340e232
...
@@ -200,7 +200,7 @@ class XLNetTokenizer(PreTrainedTokenizer):
...
@@ -200,7 +200,7 @@ class XLNetTokenizer(PreTrainedTokenizer):
cls
=
[
self
.
cls_token_id
]
cls
=
[
self
.
cls_token_id
]
return
token_ids_0
+
sep
+
token_ids_1
+
sep
+
cls
return
token_ids_0
+
sep
+
token_ids_1
+
sep
+
cls
def
create_
mask
_from_sequences
(
self
,
sequence_0
,
sequence_1
):
def
create_
token_type_ids
_from_sequences
(
self
,
sequence_0
,
sequence_1
):
"""
"""
Creates a mask from the two sequences passed to be used in a sequence-pair classification task.
Creates a mask from the two sequences passed to be used in a sequence-pair classification task.
A BERT sequence pair mask has the following format:
A BERT sequence pair mask has the following format:
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment