chenpangpang / transformers · Commits

Commit 9d44236f, authored Sep 24, 2019 by LysandreJik
parent ab984a8b

    Updated DistilBERT

Showing 3 changed files with 98 additions and 62 deletions:

    examples/utils_glue.py                                      +1   -1
    pytorch_transformers/tests/tokenization_tests_commons.py    +6   -6
    pytorch_transformers/tokenization_utils.py                  +91  -55
examples/utils_glue.py

@@ -412,7 +412,7 @@ def convert_examples_to_features(examples, label_list, max_seq_length,
                                        output_mask=True,
                                        max_length=max_seq_length)
-        input_ids, segment_ids = inputs["sequence"], inputs["mask"]
+        input_ids, segment_ids = inputs["input_ids"], inputs["output_token_type"]

         # The mask has 1 for real tokens and 0 for padding tokens. Only real
         # tokens are attended to.
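After this change, the GLUE example reads the encoded pair from the renamed dictionary keys. As a quick illustration (not part of the commit: it assumes the encode_plus API as it stands on this branch, and the tokenizer and sentences are arbitrary choices):

    from pytorch_transformers import BertTokenizer

    tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

    # encode_plus on this branch returns a dict; the keys used by utils_glue.py
    # are now "input_ids" (was "sequence") and "output_token_type" (was "mask").
    inputs = tokenizer.encode_plus("This is a sequence",
                                   "This is another",
                                   add_special_tokens=True,
                                   output_token_type=True)

    input_ids = inputs["input_ids"]
    segment_ids = inputs["output_token_type"]   # 0 for the first sequence, 1 for the second
    assert len(input_ids) == len(segment_ids)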
pytorch_transformers/tests/tokenization_tests_commons.py

@@ -196,8 +196,8 @@ class CommonTestCases:
             if tokenizer.add_special_tokens_sequence_pair.__qualname__.split('.')[0] != "PreTrainedTokenizer":
                 seq_0 = "Test this method."
                 seq_1 = "With these inputs."
-                information = tokenizer.encode_plus(seq_0, seq_1, add_special_tokens=True, output_mask=True)
-                sequences, mask = information["sequence"], information["mask"]
+                information = tokenizer.encode_plus(seq_0, seq_1, add_special_tokens=True, output_token_type=True)
+                sequences, mask = information["input_ids"], information["output_token_type"]
                 assert len(sequences) == len(mask)

         def test_number_of_added_tokens(self):

@@ -224,7 +224,7 @@ class CommonTestCases:
             total_length = len(sequence) + num_added_tokens
             information = tokenizer.encode_plus(seq_0, max_length=total_length - 2, add_special_tokens=True, stride=stride)

-            truncated_sequence = information["sequence"]
+            truncated_sequence = information["input_ids"]
             overflowing_tokens = information["overflowing_tokens"]

             assert len(overflowing_tokens) == 2 + stride

@@ -249,12 +249,12 @@ class CommonTestCases:
             )

             information = tokenizer.encode_plus(seq_0, seq_1, max_length=len(sequence) - 2, add_special_tokens=True,
-                                                stride=stride)
+                                                stride=stride, truncate_first_sequence=False)
             information_first_truncated = tokenizer.encode_plus(seq_0, seq_1, max_length=len(sequence) - 2,
                                                                 add_special_tokens=True, stride=stride,
-                                                                truncate_second_sequence_first=False)
+                                                                truncate_first_sequence=True)

-            truncated_sequence = information["sequence"]
+            truncated_sequence = information["input_ids"]
             overflowing_tokens = information["overflowing_tokens"]
             overflowing_tokens_first_truncated = information_first_truncated["overflowing_tokens"]
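The single-sequence truncation behaviour these tests pin down can be sketched outside the test harness (again not from the commit; `tokenizer` stands for any concrete tokenizer whose class overrides the special-token methods, and the branch's encode_plus semantics are assumed):

    seq_0 = "Test this method."
    sequence = tokenizer.encode(seq_0)               # ids without special tokens
    num_added_tokens = tokenizer.num_added_tokens()  # e.g. 2 for a BERT-style tokenizer
    total_length = len(sequence) + num_added_tokens

    stride = 2
    information = tokenizer.encode_plus(seq_0,
                                        max_length=total_length - 2,
                                        add_special_tokens=True,
                                        stride=stride)

    truncated_sequence = information["input_ids"]    # key was "sequence" before this commit
    overflowing_tokens = information["overflowing_tokens"]

    # Two ids were cut to respect max_length, and `stride` ids from the kept part
    # are repeated at the front of the overflow window, hence the test's assertion:
    assert len(overflowing_tokens) == 2 + stride

For pairs, note that the renamed flag inverts the old one: the new default truncate_first_sequence=True truncates the first sequence, so the first encode_plus call in the last hunk now passes truncate_first_sequence=False explicitly to keep truncating the second sequence, while the old truncate_second_sequence_first=False call becomes truncate_first_sequence=True.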
pytorch_transformers/tokenization_utils.py

@@ -536,13 +536,7 @@ class PreTrainedTokenizer(object):
         if pair:
             initial_tokens_len = len(self.encode("This is a sequence") + self.encode("This is another"))
-            final_tokens = self.encode("This is a sequence", "This is another", add_special_tokens=True)
-            # In some models (e.g. GPT-2), there is no sequence pair encoding.
-            if len(final_tokens) == 2:
-                return 0
-            else:
-                final_tokens_len = len(final_tokens)
+            final_tokens_len = len(self.encode("This is a sequence", "This is another", add_special_tokens=True))
         else:
             initial_tokens_len = len(self.encode("This is a sequence"))
             final_tokens_len = len(self.encode("This is a sequence", add_special_tokens=True))

@@ -700,86 +694,93 @@ class PreTrainedTokenizer(object):
         Same as doing ``self.convert_tokens_to_ids(self.tokenize(text))``.

         Args:
-            text: The first sequence to be encoded.
-            text_pair: Optional second sequence to be encoded.
+            text: The first sequence to be encoded. This can be a string, a list of strings (tokenized string using
+                the `tokenize` method) or a list of integers (tokenized string ids using the `convert_tokens_to_ids`
+                method)
+            text_pair: Optional second sequence to be encoded. This can be a string, a list of strings (tokenized
+                string using the `tokenize` method) or a list of integers (tokenized string ids using the
+                `convert_tokens_to_ids` method)
             add_special_tokens: if set to ``True``, the sequences will be encoded with the special tokens relative
                 to their model.
             **kwargs: passed to the `self.tokenize()` method
         """
-        if text_pair is None:
-            if add_special_tokens:
-                sequence_tokens = self.convert_tokens_to_ids(self.tokenize(text, **kwargs)) if isinstance(text, six.string_types) else text
-                return self.add_special_tokens_single_sequence(sequence_tokens)
-            else:
-                ids = self.convert_tokens_to_ids(self.tokenize(text, **kwargs)) if isinstance(text, six.string_types) else text
-                return ids
-
-        first_sentence_tokens = [self._convert_token_to_id(token) for token in self.tokenize(text, **kwargs)] if isinstance(text, six.string_types) else text
-        second_sentence_tokens = [self._convert_token_to_id(token) for token in self.tokenize(text_pair, **kwargs)] if isinstance(text_pair, six.string_types) else text_pair
-
-        if add_special_tokens:
-            return self.add_special_tokens_sequence_pair(first_sentence_tokens, second_sentence_tokens)
-        else:
-            logger.warning("No special tokens were added. The two sequences have been concatenated.")
-            return first_sentence_tokens + second_sentence_tokens
+        return self.encode_plus(text, text_pair, add_special_tokens, **kwargs)["input_ids"]

     def encode_plus(self,
                     text,
                     text_pair=None,
                     add_special_tokens=False,
-                    output_mask=False,
+                    output_token_type=False,
                     max_length=None,
                     stride=0,
-                    truncate_second_sequence_first=True,
+                    truncate_first_sequence=True,
                     **kwargs):
         """
         Returns a dictionary containing the encoded sequence or sequence pair. Other values can be returned by this
         method: the mask for sequence classification and the overflowing elements if a ``max_length`` is specified.

         Args:
-            text: The first sequence to be encoded.
-            text_pair: Optional second sequence to be encoded.
+            text: The first sequence to be encoded. This can be a string, a list of strings (tokenized string using
+                the `tokenize` method) or a list of integers (tokenized string ids using the `convert_tokens_to_ids`
+                method)
+            text_pair: Optional second sequence to be encoded. This can be a string, a list of strings (tokenized
+                string using the `tokenize` method) or a list of integers (tokenized string ids using the
+                `convert_tokens_to_ids` method)
             add_special_tokens: if set to ``True``, the sequences will be encoded with the special tokens relative
                 to their model.
-            output_mask: if set to ``True``, returns the text pair corresponding mask with 0 for the first sequence,
+            output_token_type: if set to ``True``, returns the text pair corresponding mask with 0 for the first sequence,
                 and 1 for the second.
             max_length: if set to a number, will limit the total sequence returned so that it has a maximum length.
                 If there are overflowing tokens, those will be added to the returned dictionary
             stride: if set to a number along with max_length, the overflowing tokens returned will contain some tokens
                 from the main sequence returned. The value of this argument defined the number of additional tokens.
-            truncate_second_sequence_first: if there is a specified max_length, this flag will choose which sequence
+            truncate_first_sequence: if there is a specified max_length, this flag will choose which sequence
                 will be truncated.
             **kwargs: passed to the `self.tokenize()` method
         """
         information = {}

+        def get_input_ids(text):
+            if isinstance(text, six.string_types):
+                input_ids = self.convert_tokens_to_ids(self.tokenize(text, **kwargs))
+            elif isinstance(text, (list, tuple)) and len(text) > 0 and isinstance(text[0], str):
+                input_ids = self.convert_tokens_to_ids(text)
+            elif isinstance(text, (list, tuple)) and len(text) > 0 and isinstance(text[0], int):
+                input_ids = text
+            else:
+                raise ValueError("Input is not valid. Should be a string, a list/tuple of strings or a list/tuple of integers.")
+            return input_ids
+
         if text_pair is None:
-            sequence_tokens = self.convert_tokens_to_ids(self.tokenize(text, **kwargs)) if isinstance(text, six.string_types) else text
+            sequence_tokens = get_input_ids(text)
             if add_special_tokens:
-                information = self.prepare_for_model(sequence_tokens, max_length, stride)
+                information = self.prepare_for_model(sequence_tokens, max_length=max_length, stride=stride)
             else:
                 if max_length:
                     information["overflowing_tokens"] = sequence_tokens[max_length - stride:]
                     sequence_tokens = sequence_tokens[:max_length]
-                information["sequence"] = sequence_tokens
+                information["input_ids"] = sequence_tokens

-            if output_mask:
-                information["mask"] = [0] * len(information["sequence"])
+            if output_token_type:
+                information["output_token_type"] = [0] * len(information["input_ids"])
         else:
-            first_sentence_tokens = [self._convert_token_to_id(token) for token in self.tokenize(text, **kwargs)] if isinstance(text, six.string_types) else text
-            second_sentence_tokens = [self._convert_token_to_id(token) for token in self.tokenize(text_pair, **kwargs)] if isinstance(text_pair, six.string_types) else text_pair
+            first_sentence_tokens = get_input_ids(text)
+            second_sentence_tokens = get_input_ids(text_pair)

             if add_special_tokens:
                 information = self.prepare_pair_for_model(
                     first_sentence_tokens,
                     second_sentence_tokens,
-                    max_length,
-                    truncate_second_sequence_first,
-                    stride
+                    max_length=max_length,
+                    truncate_first_sequence=truncate_first_sequence,
+                    stride=stride
                 )
-                if output_mask:
-                    information["mask"] = self.create_mask_from_sequences(text, text_pair)
+                if output_token_type:
+                    information["output_token_type"] = self.create_mask_from_sequences(text, text_pair)
             else:
                 logger.warning("No special tokens were added. The two sequences have been concatenated.")
                 sequence = first_sentence_tokens + second_sentence_tokens

@@ -787,43 +788,78 @@ class PreTrainedTokenizer(object):
                 if max_length:
                     information["overflowing_tokens"] = sequence[max_length - stride:]
                     sequence = sequence[:max_length]
-                if output_mask:
-                    information["mask"] = [0] * len(sequence)
-                information["sequence"] = sequence
+                if output_token_type:
+                    information["output_token_type"] = [0] * len(sequence)
+                information["input_ids"] = sequence

         return information

     def prepare_for_model(self, ids, max_length=None, stride=0):
+        """
+        Prepares a list of tokenized input ids so that it can be used by the model. It adds special tokens, truncates
+        sequences if overflowing while taking into account the special tokens and manages a window stride for
+        overflowing tokens
+
+        Args:
+            ids: list of tokenized input ids. Can be obtained from a string by chaining the
+                `tokenize` and `convert_tokens_to_ids` methods.
+            max_length: maximum length of the returned list. Will truncate by taking into account the special tokens.
+            stride: window stride for overflowing tokens. Can be useful for edge effect removal when using sequential
+                list of inputs.
+
+        Return:
+            a dictionary containing the `input_ids` as well as the `overflowing_tokens` if a `max_length` was given.
+        """
         information = {}
+        n_added_tokens = self.num_added_tokens()

         if max_length:
-            n_added_tokens = self.num_added_tokens()
             information["overflowing_tokens"] = ids[max_length - n_added_tokens - stride:]
             ids = ids[:max_length - n_added_tokens]

-        information["sequence"] = self.add_special_tokens_single_sequence(ids)
+        information["input_ids"] = self.add_special_tokens_single_sequence(ids)

         return information

-    def prepare_pair_for_model(self, ids_0, ids_1, max_length=None, truncate_second_sequence_first=True, stride=0):
+    def prepare_pair_for_model(self, ids_0, ids_1, max_length=None, truncate_first_sequence=True, stride=0):
+        """
+        Prepares a list of tokenized input ids pair so that it can be used by the model. It adds special tokens,
+        truncates sequences if overflowing while taking into account the special tokens and manages a window stride for
+        overflowing tokens
+
+        Args:
+            ids_0: list of tokenized input ids. Can be obtained from a string by chaining the
+                `tokenize` and `convert_tokens_to_ids` methods.
+            ids_1: second list of tokenized input ids. Can be obtained from a string by chaining the
+                `tokenize` and `convert_tokens_to_ids` methods.
+            max_length: maximum length of the returned list. Will truncate by taking into account the special tokens.
+            truncate_first_sequence: if set to `True`, alongside a specified `max_length`, will truncate the first
+                sequence if the total size is superior than the specified `max_length`. If set to `False`, will
+                truncate the second sequence instead.
+            stride: window stride for overflowing tokens. Can be useful for edge effect removal when using sequential
+                list of inputs.
+
+        Return:
+            a dictionary containing the `input_ids` as well as the `overflowing_tokens` if a `max_length` was given.
+        """
         f_len, s_len = len(ids_0), len(ids_1)
+        n_added_tokens = self.num_added_tokens(pair=True)
         information = {}

         if max_length:
-            n_added_tokens = self.num_added_tokens(pair=True)
             if len(ids_0) + n_added_tokens >= max_length:
                 logger.warning(
                     "The first sequence is longer than the maximum specified length. This sequence will not be truncated.")
             else:
                 if f_len + s_len + self.num_added_tokens(pair=True) > max_length:
-                    if truncate_second_sequence_first:
-                        information["overflowing_tokens"] = ids_1[max_length - f_len - n_added_tokens - stride:]
-                        ids_1 = ids_1[:max_length - f_len - n_added_tokens]
-                    else:
+                    if truncate_first_sequence:
                         information["overflowing_tokens"] = ids_0[max_length - s_len - n_added_tokens - stride:]
                         ids_0 = ids_0[:max_length - s_len - n_added_tokens]
+                    else:
+                        information["overflowing_tokens"] = ids_1[max_length - f_len - n_added_tokens - stride:]
+                        ids_1 = ids_1[:max_length - f_len - n_added_tokens]

         sequence = self.add_special_tokens_sequence_pair(ids_0, ids_1)
-        information["sequence"] = sequence
+        information["input_ids"] = sequence

         return information
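The main functional addition here is the nested get_input_ids helper, which lets encode and encode_plus accept a raw string, an already-tokenized list of strings, or a ready-made list of ids. A standalone paraphrase of that logic (the module-level function and explicit tokenizer argument are ours for illustration; inside the library it is a closure over self and **kwargs):

    import six

    def get_input_ids_sketch(tokenizer, text, **kwargs):
        if isinstance(text, six.string_types):
            # Raw string: tokenize, then map tokens to vocabulary ids.
            return tokenizer.convert_tokens_to_ids(tokenizer.tokenize(text, **kwargs))
        elif isinstance(text, (list, tuple)) and len(text) > 0 and isinstance(text[0], str):
            # Already tokenized: only map tokens to ids.
            return tokenizer.convert_tokens_to_ids(text)
        elif isinstance(text, (list, tuple)) and len(text) > 0 and isinstance(text[0], int):
            # Already converted to ids: pass through unchanged.
            return text
        raise ValueError("Input is not valid. Should be a string, a list/tuple of strings "
                         "or a list/tuple of integers.")

Because every call site in encode_plus routes through this helper, pre-tokenized input and raw ids now follow the same truncation and special-token path as plain strings, and the method consistently exposes its result under the "input_ids" key.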