chenpangpang / transformers · Commits

Commit a6981076, authored Sep 24, 2019 by thomwolf

    various updates

Parent: 72402d1a

Showing 7 changed files with 61 additions and 107 deletions (+61 / -107).
examples/utils_glue.py                                      +0  -1
pytorch_transformers/tests/tokenization_tests_commons.py    +1  -1
pytorch_transformers/tokenization_bert.py                   +2  -2
pytorch_transformers/tokenization_roberta.py                +2  -2
pytorch_transformers/tokenization_utils.py                  +52 -97
pytorch_transformers/tokenization_xlm.py                    +2  -2
pytorch_transformers/tokenization_xlnet.py                  +2  -2
examples/utils_glue.py

@@ -409,7 +409,6 @@ def convert_examples_to_features(examples, label_list, max_seq_length,
             example.text_a,
             example.text_b,
             add_special_tokens=True,
-            output_token_type=True,
             max_length=max_seq_length,
             truncate_first_sequence=True  # We're truncating the first sequence as a priority
         )
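For reference, a minimal sketch of an encode_plus call in the style this file now uses: the token type ids come back in the returned dictionary whether or not you ask for them, so the output_token_type flag disappears from the call site. The tokenizer, texts and max length below are placeholders rather than values from this commit.

    from pytorch_transformers import BertTokenizer

    tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
    inputs = tokenizer.encode_plus(
        "The cat sat on the mat.",        # stands in for example.text_a
        "A cat is resting on a mat.",     # stands in for example.text_b
        add_special_tokens=True,
        max_length=128,                   # stands in for max_seq_length
        truncate_first_sequence=True)     # truncate the first sequence as a priority

    input_ids = inputs["input_ids"]
    token_type_ids = inputs["token_type_ids"]  # always returned now, no output_token_type needed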
pytorch_transformers/tests/tokenization_tests_commons.py

@@ -196,7 +196,7 @@ class CommonTestCases:
         if tokenizer.add_special_tokens_sequence_pair.__qualname__.split('.')[0] != "PreTrainedTokenizer":
             seq_0 = "Test this method."
             seq_1 = "With these inputs."
-            information = tokenizer.encode_plus(seq_0, seq_1, add_special_tokens=True, output_token_type=True)
+            information = tokenizer.encode_plus(seq_0, seq_1, add_special_tokens=True)
             sequences, mask = information["input_ids"], information["token_type_ids"]

             assert len(sequences) == len(mask)
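Spelled out on one concrete tokenizer (an arbitrary choice for illustration, not something the test pins down), the property being exercised is:

    from pytorch_transformers import BertTokenizer

    tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
    information = tokenizer.encode_plus("Test this method.", "With these inputs.", add_special_tokens=True)

    # token_type_ids is returned alongside input_ids and must line up with it one-to-one
    assert len(information["input_ids"]) == len(information["token_type_ids"])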
pytorch_transformers/tokenization_bert.py

@@ -204,7 +204,7 @@ class BertTokenizer(PreTrainedTokenizer):
         return cls + token_ids_0 + sep + token_ids_1 + sep

-    def create_token_type_ids_from_sequences(self, sequence_0, sequence_1):
+    def create_token_type_ids_from_sequences(self, token_ids_0, token_ids_1):
         """
         Creates a mask from the two sequences passed to be used in a sequence-pair classification task.
         A BERT sequence pair mask has the following format:
@@ -214,7 +214,7 @@ class BertTokenizer(PreTrainedTokenizer):
         """
         sep = [self.sep_token_id]
         cls = [self.cls_token_id]
-        return len(cls + self.encode(sequence_0) + sep) * [0] + len(self.encode(sequence_1) + sep) * [1]
+        return len(cls + token_ids_0 + sep) * [0] + len(token_ids_1 + sep) * [1]

     def save_vocabulary(self, vocab_path):
         """Save the tokenizer vocabulary to a directory or file."""
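After this rename the method works on token ids that have already been mapped through the vocabulary, so a caller tokenizes and converts first. A small sketch under that assumption (the sentences are placeholders, not part of the commit):

    from pytorch_transformers import BertTokenizer

    tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
    ids_0 = tokenizer.convert_tokens_to_ids(tokenizer.tokenize("How old are you?"))
    ids_1 = tokenizer.convert_tokens_to_ids(tokenizer.tokenize("I am six."))

    segment_ids = tokenizer.create_token_type_ids_from_sequences(ids_0, ids_1)
    # [CLS] A ... [SEP] is segment 0, B ... [SEP] is segment 1
    assert segment_ids == [0] * (len(ids_0) + 2) + [1] * (len(ids_1) + 1)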
pytorch_transformers/tokenization_roberta.py

@@ -97,7 +97,7 @@ class RobertaTokenizer(GPT2Tokenizer):
         cls = [self.cls_token_id]
         return cls + token_ids_0 + sep + sep + token_ids_1 + sep

-    def create_token_type_ids_from_sequences(self, sequence_0, sequence_1):
+    def create_token_type_ids_from_sequences(self, token_ids_0, token_ids_1):
         """
         Creates a mask from the two sequences passed to be used in a sequence-pair classification task.
         A RoBERTa sequence pair mask has the following format:
@@ -107,4 +107,4 @@ class RobertaTokenizer(GPT2Tokenizer):
         sep = [self.sep_token_id]
         cls = [self.cls_token_id]
-        return len(cls + self.encode(sequence_0) + sep + sep) * [0] + len(self.encode(sequence_1) + sep) * [1]
+        return len(cls + token_ids_0 + sep + sep) * [0] + len(token_ids_1 + sep) * [1]
\ No newline at end of file
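RoBERTa places two separator tokens between the sequences, so the zero segment covers one more position than BERT's. A sketch under the same illustrative assumptions as above:

    from pytorch_transformers import RobertaTokenizer

    tokenizer = RobertaTokenizer.from_pretrained("roberta-base")
    ids_0 = tokenizer.convert_tokens_to_ids(tokenizer.tokenize("How old are you?"))
    ids_1 = tokenizer.convert_tokens_to_ids(tokenizer.tokenize("I am six."))

    mask = tokenizer.create_token_type_ids_from_sequences(ids_0, ids_1)
    # <s> A </s> </s> is segment 0, B </s> is segment 1
    assert mask == [0] * (len(ids_0) + 3) + [1] * (len(ids_1) + 1)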
pytorch_transformers/tokenization_utils.py

@@ -704,13 +704,14 @@ class PreTrainedTokenizer(object):
                 to their model.
             **kwargs: passed to the `self.tokenize()` method
         """
-        return self.encode_plus(text, text_pair, add_special_tokens, **kwargs)["input_ids"]
+        encoded_inputs = self.encode_plus(text, text_pair=text_pair,
+                                          add_special_tokens=add_special_tokens, **kwargs)
+        return encoded_inputs["input_ids"]

     def encode_plus(self,
                     text,
                     text_pair=None,
                     add_special_tokens=False,
-                    output_token_type=False,
                     max_length=None,
                     stride=0,
                     truncate_first_sequence=True,
@@ -728,8 +729,6 @@ class PreTrainedTokenizer(object):
                 `convert_tokens_to_ids` method)
             add_special_tokens: if set to ``True``, the sequences will be encoded with the special tokens relative
                 to their model.
-            output_token_type: if set to ``True``, returns the text pair corresponding mask with 0 for the first sequence,
-                and 1 for the second.
             max_length: if set to a number, will limit the total sequence returned so that it has a maximum length.
                 If there are overflowing tokens, those will be added to the returned dictionary
             stride: if set to a number along with max_length, the overflowing tokens returned will contain some tokens
@@ -739,133 +738,89 @@ class PreTrainedTokenizer(object):
             **kwargs: passed to the `self.tokenize()` method
         """
-        information = {}

         def get_input_ids(text):
             if isinstance(text, six.string_types):
-                input_ids = self.convert_tokens_to_ids(self.tokenize(text, **kwargs))
+                return self.convert_tokens_to_ids(self.tokenize(text, **kwargs))
             elif isinstance(text, (list, tuple)) and len(text) > 0 and isinstance(text[0], six.string_types):
-                input_ids = self.convert_tokens_to_ids(text)
+                return self.convert_tokens_to_ids(text)
             elif isinstance(text, (list, tuple)) and len(text) > 0 and isinstance(text[0], int):
-                input_ids = text
+                return text
             else:
                 raise ValueError("Input is not valid. Should be a string, a list/tuple of strings or a list/tuple of integers.")
-            return input_ids

-        if text_pair is None:
-            sequence_tokens = get_input_ids(text)
-            if add_special_tokens:
-                information = self.prepare_for_model(sequence_tokens, max_length=max_length, stride=stride)
-            else:
-                if max_length:
-                    information["overflowing_tokens"] = sequence_tokens[max_length - stride:]
-                    sequence_tokens = sequence_tokens[:max_length]
-                information["input_ids"] = sequence_tokens
-            if output_token_type:
-                information["token_type_ids"] = [0] * len(information["input_ids"])
-        else:
-            first_sentence_tokens = get_input_ids(text)
-            second_sentence_tokens = get_input_ids(text_pair)
-            if add_special_tokens:
-                information = self.prepare_pair_for_model(first_sentence_tokens,
-                                                          second_sentence_tokens,
-                                                          max_length=max_length,
-                                                          truncate_first_sequence=truncate_first_sequence,
-                                                          stride=stride)
-                if output_token_type:
-                    information["token_type_ids"] = self.create_token_type_ids_from_sequences(text, text_pair)
-            else:
-                logger.warning("No special tokens were added. The two sequences have been concatenated.")
-                sequence = first_sentence_tokens + second_sentence_tokens
-                if max_length:
-                    information["overflowing_tokens"] = sequence[max_length - stride:]
-                    sequence = sequence[:max_length]
-                if output_token_type:
-                    information["token_type_ids"] = [0] * len(sequence)
-                information["input_ids"] = sequence
+        first_ids = get_input_ids(text)
+        second_ids = get_input_ids(text_pair) if text_pair is not None else None

-        return information
+        return self.prepare_for_model(first_ids,
+                                      pair_ids=second_ids,
+                                      max_length=max_length,
+                                      add_special_tokens=add_special_tokens,
+                                      stride=stride,
+                                      truncate_first_sequence=truncate_first_sequence)

-    def prepare_for_model(self, ids, max_length=None, stride=0):
+    def prepare_for_model(self, ids, pair_ids=None, max_length=None, add_special_tokens=False, stride=0,
+                          truncate_first_sequence=True):
         """
-        Prepares a list of tokenized input ids so that it can be used by the model. It adds special tokens, truncates
+        Prepares a sequence of input id, or a pair of sequences of inputs ids so that it can be used by the model.
+        It adds special tokens, truncates
         sequences if overflowing while taking into account the special tokens and manages a window stride for
         overflowing tokens

         Args:
             ids: list of tokenized input ids. Can be obtained from a string by chaining the
                 `tokenize` and `convert_tokens_to_ids` methods.
+            pair_ids: Optional second list of input ids. Can be obtained from a string by chaining the
+                `tokenize` and `convert_tokens_to_ids` methods.
             max_length: maximum length of the returned list. Will truncate by taking into account the special tokens.
+            add_special_tokens: if set to ``True``, the sequences will be encoded with the special tokens relative
+                to their model.
             stride: window stride for overflowing tokens. Can be useful for edge effect removal when using sequential
                 list of inputs.
+            truncate_first_sequence: if set to `True` and an optional second list of input ids is provided,
+                alongside a specified `max_length`, will truncate the first sequence if the total size is superior
+                than the specified `max_length`. If set to `False`, will truncate the second sequence instead.

         Return:
             a dictionary containing the `input_ids` as well as the `overflowing_tokens` if a `max_length` was given.
         """
-        information = {}
-        if max_length:
-            n_added_tokens = self.num_added_tokens()
-            information["overflowing_tokens"] = ids[max_length - n_added_tokens - stride:]
-            ids = ids[:max_length - n_added_tokens]
-
-        information["input_ids"] = self.add_special_tokens_single_sequence(ids)
-
-        return information
-
-    def prepare_pair_for_model(self, ids_0, ids_1, max_length=None, truncate_first_sequence=True, stride=0):
-        """
-        Prepares a list of tokenized input ids pair so that it can be used by the model. It adds special tokens,
-        truncates sequences if overflowing while taking into account the special tokens and manages a window stride for
-        overflowing tokens
-
-        Args:
-            ids_0: list of tokenized input ids. Can be obtained from a string by chaining the
-                `tokenize` and `convert_tokens_to_ids` methods.
-            ids_1: second list of tokenized input ids. Can be obtained from a string by chaining the
-                `tokenize` and `convert_tokens_to_ids` methods.
-            max_length: maximum length of the returned list. Will truncate by taking into account the special tokens.
-            truncate_first_sequence: if set to `True`, alongside a specified `max_length`, will truncate the first
-                sequence if the total size is superior than the specified `max_length`. If set to `False`, will
-                truncate the second sequence instead.
-            stride: window stride for overflowing tokens. Can be useful for edge effect removal when using sequential
-                list of inputs.
-
-        Return:
-            a dictionary containing the `input_ids` as well as the `overflowing_tokens` if a `max_length` was given.
-        """
-        f_len, s_len = len(ids_0), len(ids_1)
-        information = {}
+        pair = bool(pair_ids is not None)
+        len_ids = len(ids)
+        len_pair_ids = len(pair_ids) if pair else 0
+
+        encoded_inputs = {}
         if max_length:
-            n_added_tokens = self.num_added_tokens(pair=True)
-            if len(ids_0) + n_added_tokens >= max_length:
+            n_added_tokens = self.num_added_tokens(pair=pair) if add_special_tokens else 0
+            if pair and n_added_tokens + (len_pair_ids if truncate_first_sequence else len_ids) >= max_length:
                 logger.warning(
-                    "The first sequence is longer than the maximum specified length. This sequence will not be truncated.")
+                    "You supplied a pair of sequence in which the sequence that will not be truncated is longer than the maximum specified length."
+                    "This pair of sequences will not be truncated.")
             else:
-                if f_len + s_len + self.num_added_tokens(pair=True) > max_length:
-                    if truncate_first_sequence:
-                        information["overflowing_tokens"] = ids_0[max_length - s_len - n_added_tokens - stride:]
-                        ids_0 = ids_0[:max_length - s_len - n_added_tokens]
-                    else:
-                        information["overflowing_tokens"] = ids_1[max_length - f_len - n_added_tokens - stride:]
-                        ids_1 = ids_1[:max_length - f_len - n_added_tokens]
+                if n_added_tokens + len_ids + len_pair_ids > max_length:
+                    if truncate_first_sequence or not pair:
+                        encoded_inputs["overflowing_tokens"] = ids[max_length - len_pair_ids - n_added_tokens - stride:]
+                        ids = ids[:max_length - len_pair_ids - n_added_tokens]
+                    elif not truncate_first_sequence and pair:
+                        encoded_inputs["overflowing_tokens"] = pair_ids[max_length - len_ids - n_added_tokens - stride:]
+                        pair_ids = pair_ids[:max_length - len_ids - n_added_tokens]
+                    else:
+                        logger.warning("Cannot truncate second sequence as it is not provided. No truncation.")

-        sequence = self.add_special_tokens_sequence_pair(ids_0, ids_1)
+        if add_special_tokens:
+            sequence = self.add_special_tokens_sequence_pair(ids, pair_ids) if pair else self.add_special_tokens_single_sequence(ids)
+            token_type_ids = self.create_token_type_ids_from_sequences(ids, pair_ids) if pair else [0] * len(sequence)
+        else:
+            sequence = ids + pair_ids if pair else ids
+            token_type_ids = [0] * len(ids) + ([1] * len(pair_ids) if pair else [])

-        information["input_ids"] = sequence
-        return information
+        encoded_inputs["input_ids"] = sequence
+        encoded_inputs["token_type_ids"] = token_type_ids
+
+        return encoded_inputs

-    def create_token_type_ids_from_sequences(self, sequence_0, sequence_1):
+    def create_token_type_ids_from_sequences(self, token_ids_0, token_ids_1):
         logger.warning("This tokenizer does not make use of special tokens.")
-        return [0] * len(self.encode(sequence_0)) + [1] * len(self.encode(sequence_1))
+        return [0] * len(token_ids_0) + [1] * len(token_ids_1)

     def add_special_tokens_single_sequence(self, token_ids):
         logger.warning("This tokenizer does not make use of special tokens. The sequence has been returned with no modification.")
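The practical effect of the merged prepare_for_model is easiest to see on concrete numbers. A hedged sketch of how truncation, stride and the always-present token_type_ids interact, using made-up integer id lists rather than real vocabulary ids (BertTokenizer is only an arbitrary concrete subclass here):

    from pytorch_transformers import BertTokenizer

    tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

    ids = list(range(1000, 1020))        # 20 made-up ids standing in for a long first sequence
    pair_ids = list(range(2000, 2005))   # 5 made-up ids standing in for a short second sequence

    out = tokenizer.prepare_for_model(ids, pair_ids=pair_ids,
                                      max_length=16, add_special_tokens=True,
                                      stride=2, truncate_first_sequence=True)

    assert len(out["input_ids"]) == 16                   # [CLS] + 8 first ids + [SEP] + 5 pair ids + [SEP]
    assert out["token_type_ids"] == [0] * 10 + [1] * 6   # zeros cover [CLS] A [SEP], ones cover B [SEP]
    assert out["overflowing_tokens"] == ids[6:]          # truncated tail of A plus a 2-id stride overlap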
pytorch_transformers/tokenization_xlm.py

@@ -770,7 +770,7 @@ class XLMTokenizer(PreTrainedTokenizer):
         cls = [self.cls_token_id]
         return cls + token_ids_0 + sep + token_ids_1 + sep

-    def create_token_type_ids_from_sequences(self, sequence_0, sequence_1):
+    def create_token_type_ids_from_sequences(self, token_ids_0, token_ids_1):
         """
         Creates a mask from the two sequences passed to be used in a sequence-pair classification task.
         An XLM sequence pair mask has the following format:
@@ -780,7 +780,7 @@ class XLMTokenizer(PreTrainedTokenizer):
         sep = [self.sep_token_id]
         cls = [self.cls_token_id]
-        return len(cls + self.encode(sequence_0) + sep) * [0] + len(self.encode(sequence_1) + sep) * [1]
+        return len(cls + token_ids_0 + sep) * [0] + len(token_ids_1 + sep) * [1]

     def save_vocabulary(self, save_directory):
         """Save the tokenizer vocabulary and merge files to a directory."""
pytorch_transformers/tokenization_xlnet.py

@@ -200,7 +200,7 @@ class XLNetTokenizer(PreTrainedTokenizer):
         cls = [self.cls_token_id]
         return token_ids_0 + sep + token_ids_1 + sep + cls

-    def create_token_type_ids_from_sequences(self, sequence_0, sequence_1):
+    def create_token_type_ids_from_sequences(self, token_ids_0, token_ids_1):
         """
         Creates a mask from the two sequences passed to be used in a sequence-pair classification task.
         A BERT sequence pair mask has the following format:
@@ -211,7 +211,7 @@ class XLNetTokenizer(PreTrainedTokenizer):
         cls = [self.cls_token_id]
         cls_segment_id = [2]

-        return len(self.encode(sequence_0) + sep) * [0] + len(self.encode(sequence_1) + sep) * [1] + cls_segment_id
+        return len(token_ids_0 + sep) * [0] + len(token_ids_1 + sep) * [1] + cls_segment_id

     def save_vocabulary(self, save_directory):
         """ Save the sentencepiece vocabulary (copy original file) and special tokens file
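XLNet places the classification token at the end of the pair and gives it its own segment id of 2, which the renamed method now computes directly from token ids. A sketch under the same illustrative assumptions as the earlier examples:

    from pytorch_transformers import XLNetTokenizer

    tokenizer = XLNetTokenizer.from_pretrained("xlnet-base-cased")
    ids_0 = tokenizer.convert_tokens_to_ids(tokenizer.tokenize("How old are you?"))
    ids_1 = tokenizer.convert_tokens_to_ids(tokenizer.tokenize("I am six."))

    mask = tokenizer.create_token_type_ids_from_sequences(ids_0, ids_1)
    # A ... <sep> is segment 0, B ... <sep> is segment 1, the trailing <cls> is segment 2
    assert mask == [0] * (len(ids_0) + 1) + [1] * (len(ids_1) + 1) + [2]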