chenpangpang / transformers

Commit 2f259b22 authored Sep 30, 2019 by LysandreJik

Sequence IDS

Parent: 7c789c33

Showing 6 changed files with 121 additions and 1 deletion (+121 -1)
transformers/tests/tokenization_tests_commons.py   +30 -0
transformers/tokenization_bert.py                  +18 -0
transformers/tokenization_roberta.py               +18 -0
transformers/tokenization_utils.py                 +19 -1
transformers/tokenization_xlm.py                   +18 -0
transformers/tokenization_xlnet.py                 +18 -0
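
Taken together, the diff adds a get_sequence_ids method to each tokenizer and surfaces its result as a new "sequence_ids" entry in the dictionary returned by encode_plus when add_special_tokens=True: 0 marks an added special token, 1 marks a token of the original sequence(s). A minimal consumption sketch mirroring the new test below; the bert-base-uncased checkpoint name is an assumption (the diff names no checkpoint), and the final assert relies on encode defaulting to add_special_tokens=False, as the test itself does at this commit:

    from transformers import BertTokenizer

    # Hypothetical checkpoint; any BERT vocabulary would do.
    tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
    enc = tokenizer.encode_plus("Encode this.", add_special_tokens=True)

    # Keep only positions whose mask value is 1 (sequence tokens);
    # dropping the 0s strips [CLS]/[SEP] and recovers the bare encoding.
    bare = [tok for tok, m in zip(enc["input_ids"], enc["sequence_ids"]) if m]
    assert bare == tokenizer.encode("Encode this.")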
transformers/tests/tokenization_tests_commons.py

@@ -292,3 +292,33 @@ class CommonTestCases:

        assert tokenizer.encode(tokens, add_special_tokens=True) == formatted_input
        assert tokenizer.encode(input_ids, add_special_tokens=True) == formatted_input

    def test_sequence_ids(self):
        tokenizer = self.get_tokenizer()

        sequence_0 = "Encode this."
        sequence_1 = "This one too please."

        # Testing single inputs
        encoded_sequence = tokenizer.encode(sequence_0)
        encoded_sequence_dict = tokenizer.encode_plus(sequence_0, add_special_tokens=True)
        encoded_sequence_w_special = encoded_sequence_dict["input_ids"]
        sequence_ids = encoded_sequence_dict["sequence_ids"]
        assert len(sequence_ids) == len(encoded_sequence_w_special)

        filtered_sequence = [(x if sequence_ids[i] else None) for i, x in enumerate(encoded_sequence_w_special)]
        filtered_sequence = [x for x in filtered_sequence if x is not None]
        assert encoded_sequence == filtered_sequence

        # Testing inputs pairs
        encoded_sequence = tokenizer.encode(sequence_0) + tokenizer.encode(sequence_1)
        encoded_sequence_dict = tokenizer.encode_plus(sequence_0, sequence_1, add_special_tokens=True)
        encoded_sequence_w_special = encoded_sequence_dict["input_ids"]
        sequence_ids = encoded_sequence_dict["sequence_ids"]
        assert len(sequence_ids) == len(encoded_sequence_w_special)

        filtered_sequence = [(x if sequence_ids[i] else None) for i, x in enumerate(encoded_sequence_w_special)]
        filtered_sequence = [x for x in filtered_sequence if x is not None]
        assert encoded_sequence == filtered_sequence
transformers/tokenization_bert.py

@@ -204,6 +204,24 @@ class BertTokenizer(PreTrainedTokenizer):

        return cls + token_ids_0 + sep + token_ids_1 + sep

    def get_sequence_ids(self, token_ids_0, token_ids_1=None):
        """
        Retrieves sequence ids from a token list that has no special tokens added. This method is called when
        adding special tokens using the tokenizer ``prepare_for_model`` or ``encode_plus`` methods.

        Args:
            token_ids_0: list of ids (must not contain special tokens)
            token_ids_1: Optional list of ids (must not contain special tokens), necessary when fetching
                sequence ids for sequence pairs

        Returns:
            A list of integers in the range [0, 1]: 0 for a special token, 1 for a sequence token.
        """
        if token_ids_1:
            return [0] + ([1] * len(token_ids_0)) + [0] + ([1] * len(token_ids_1)) + [0]
        else:
            return [0] + ([1] * len(token_ids_0)) + [0]

    def create_token_type_ids_from_sequences(self, token_ids_0, token_ids_1):
        """
        Creates a mask from the two sequences passed to be used in a sequence-pair classification task.
        ...
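
For BERT the mask simply traces the [CLS] A [SEP] B [SEP] pair layout visible in the return statement above. A schematic check with toy ids (hypothetical values, not real vocabulary entries):

    token_ids_0 = [10, 11, 12]   # sequence A, 3 toy ids
    token_ids_1 = [20, 21]       # sequence B, 2 toy ids

    # BERT pair layout: [CLS] A [SEP] B [SEP]
    mask = [0] + [1] * len(token_ids_0) + [0] + [1] * len(token_ids_1) + [0]
    assert mask == [0, 1, 1, 1, 0, 1, 1, 0]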
transformers/tokenization_roberta.py

@@ -100,6 +100,24 @@ class RobertaTokenizer(GPT2Tokenizer):

        cls = [self.cls_token_id]
        return cls + token_ids_0 + sep + sep + token_ids_1 + sep

    def get_sequence_ids(self, token_ids_0, token_ids_1=None):
        """
        Retrieves sequence ids from a token list that has no special tokens added. This method is called when
        adding special tokens using the tokenizer ``prepare_for_model`` or ``encode_plus`` methods.

        Args:
            token_ids_0: list of ids (must not contain special tokens)
            token_ids_1: Optional list of ids (must not contain special tokens), necessary when fetching
                sequence ids for sequence pairs

        Returns:
            A list of integers in the range [0, 1]: 0 for a special token, 1 for a sequence token.
        """
        if token_ids_1:
            return [0] + ([1] * len(token_ids_0)) + [0, 0] + ([1] * len(token_ids_1)) + [0]
        else:
            return [0] + ([1] * len(token_ids_0)) + [0]

    def create_token_type_ids_from_sequences(self, token_ids_0, token_ids_1):
        """
        Creates a mask from the two sequences passed to be used in a sequence-pair classification task.
        ...
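
The only change from the BERT variant is the [0, 0] in the middle, matching the doubled separator in RoBERTa's pair layout (cls + token_ids_0 + sep + sep + token_ids_1 + sep above). The same toy check:

    token_ids_0 = [10, 11, 12]   # toy ids
    token_ids_1 = [20, 21]       # toy ids

    # RoBERTa pair layout: <s> A </s></s> B </s> (two adjacent separators).
    mask = [0] + [1] * len(token_ids_0) + [0, 0] + [1] * len(token_ids_1) + [0]
    assert mask == [0, 1, 1, 1, 0, 0, 1, 1, 0]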
transformers/tokenization_utils.py

@@ -826,7 +826,21 @@ class PreTrainedTokenizer(object):

                or PyTorch torch.Tensor instead of a list of python integers.

        Return:
            A Dictionary of shape::

                {
                    input_ids: list[int],
                    overflowing_tokens: list[int] if a ``max_length`` is specified, else None,
                    sequence_ids: list[int] if ``add_special_tokens`` is set to ``True``
                }

            With the fields:
                ``input_ids``: list of tokens to be fed to a model
                ``overflowing_tokens``: list of overflowing tokens if a max length is specified.
                ``sequence_ids``: if adding special tokens, this is a list of [0, 1], with 0 specifying
                    special added tokens and 1 specifying sequence tokens.
        """
        pair = bool(pair_ids is not None)
        len_ids = len(ids)

@@ -859,6 +873,7 @@ class PreTrainedTokenizer(object):

        if add_special_tokens:
            sequence = self.add_special_tokens_sequence_pair(ids, pair_ids) if pair else self.add_special_tokens_single_sequence(ids)
            token_type_ids = self.create_token_type_ids_from_sequences(ids, pair_ids) if pair else [0] * len(sequence)
            encoded_inputs["sequence_ids"] = self.get_sequence_ids(ids, pair_ids)
        else:
            sequence = ids + pair_ids if pair else ids
            token_type_ids = [0] * len(ids) + ([1] * len(pair_ids) if pair else [])

@@ -893,6 +908,9 @@ class PreTrainedTokenizer(object):

            logger.warning("This tokenizer does not make use of special tokens. The two sequences have been concatenated.")
        return token_ids_0 + token_ids_1

    def get_sequence_ids(self, token_ids_0, token_ids_1=None):
        return [1] * ((len(token_ids_1) if token_ids_1 else 0) + len(token_ids_0))

    def convert_ids_to_tokens(self, ids, skip_special_tokens=False):
        """ Converts a single index or a sequence of indices (integers) into a token
            (resp. a sequence of tokens) (str/unicode), using the vocabulary and added tokens.
            ...
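
The base-class get_sequence_ids added here is the fallback for tokenizers without special tokens (the same path that logs the concatenation warning above): every position is a sequence token, so the mask is all ones. A standalone copy of the method to show the identity:

    def get_sequence_ids(token_ids_0, token_ids_1=None):
        # Base-class fallback: no special tokens anywhere, so every position is 1.
        return [1] * ((len(token_ids_1) if token_ids_1 else 0) + len(token_ids_0))

    assert get_sequence_ids([10, 11, 12]) == [1, 1, 1]
    assert get_sequence_ids([10, 11, 12], [20, 21]) == [1, 1, 1, 1, 1]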
transformers/tokenization_xlm.py

@@ -770,6 +770,24 @@ class XLMTokenizer(PreTrainedTokenizer):

        cls = [self.cls_token_id]
        return cls + token_ids_0 + sep + token_ids_1 + sep

    def get_sequence_ids(self, token_ids_0, token_ids_1=None):
        """
        Retrieves sequence ids from a token list that has no special tokens added. This method is called when
        adding special tokens using the tokenizer ``prepare_for_model`` or ``encode_plus`` methods.

        Args:
            token_ids_0: list of ids (must not contain special tokens)
            token_ids_1: Optional list of ids (must not contain special tokens), necessary when fetching
                sequence ids for sequence pairs

        Returns:
            A list of integers in the range [0, 1]: 0 for a special token, 1 for a sequence token.
        """
        if token_ids_1:
            return [0] + ([1] * len(token_ids_0)) + [0] + ([1] * len(token_ids_1)) + [0]
        else:
            return [0] + ([1] * len(token_ids_0)) + [0]

    def create_token_type_ids_from_sequences(self, token_ids_0, token_ids_1):
        """
        Creates a mask from the two sequences passed to be used in a sequence-pair classification task.
        ...
transformers/tokenization_xlnet.py

@@ -200,6 +200,24 @@ class XLNetTokenizer(PreTrainedTokenizer):

        cls = [self.cls_token_id]
        return token_ids_0 + sep + token_ids_1 + sep + cls

    def get_sequence_ids(self, token_ids_0, token_ids_1=None):
        """
        Retrieves sequence ids from a token list that has no special tokens added. This method is called when
        adding special tokens using the tokenizer ``prepare_for_model`` or ``encode_plus`` methods.

        Args:
            token_ids_0: list of ids (must not contain special tokens)
            token_ids_1: Optional list of ids (must not contain special tokens), necessary when fetching
                sequence ids for sequence pairs

        Returns:
            A list of integers in the range [0, 1]: 0 for a special token, 1 for a sequence token.
        """
        if token_ids_1:
            return ([1] * len(token_ids_0)) + [0] + ([1] * len(token_ids_1)) + [0, 0]
        else:
            return ([1] * len(token_ids_0)) + [0, 0]

    def create_token_type_ids_from_sequences(self, token_ids_0, token_ids_1):
        """
        Creates a mask from the two sequences passed to be used in a sequence-pair classification task.
        ...
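
XLNet is the odd one out: its special tokens come last (token_ids_0 + sep + token_ids_1 + sep + cls above), so the zeros trail the sequence tokens instead of framing them. Toy check with the same hypothetical lengths:

    token_ids_0 = [10, 11, 12]   # toy ids
    token_ids_1 = [20, 21]       # toy ids

    # XLNet pair layout: A <sep> B <sep> <cls>, specials at the end.
    mask = [1] * len(token_ids_0) + [0] + [1] * len(token_ids_1) + [0, 0]
    assert mask == [1, 1, 1, 0, 1, 1, 0, 0]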