Commit 9d44236f authored by LysandreJik's avatar LysandreJik
Browse files

Updated DistilBERT

parent ab984a8b
...@@ -412,7 +412,7 @@ def convert_examples_to_features(examples, label_list, max_seq_length, ...@@ -412,7 +412,7 @@ def convert_examples_to_features(examples, label_list, max_seq_length,
output_mask=True, output_mask=True,
max_length=max_seq_length max_length=max_seq_length
) )
input_ids, segment_ids = inputs["sequence"], inputs["mask"] input_ids, segment_ids = inputs["input_ids"], inputs["output_token_type"]
# The mask has 1 for real tokens and 0 for padding tokens. Only real # The mask has 1 for real tokens and 0 for padding tokens. Only real
# tokens are attended to. # tokens are attended to.
......
...@@ -196,8 +196,8 @@ class CommonTestCases: ...@@ -196,8 +196,8 @@ class CommonTestCases:
if tokenizer.add_special_tokens_sequence_pair.__qualname__.split('.')[0] != "PreTrainedTokenizer": if tokenizer.add_special_tokens_sequence_pair.__qualname__.split('.')[0] != "PreTrainedTokenizer":
seq_0 = "Test this method." seq_0 = "Test this method."
seq_1 = "With these inputs." seq_1 = "With these inputs."
information = tokenizer.encode_plus(seq_0, seq_1, add_special_tokens=True, output_mask=True) information = tokenizer.encode_plus(seq_0, seq_1, add_special_tokens=True, output_token_type=True)
sequences, mask = information["sequence"], information["mask"] sequences, mask = information["input_ids"], information["output_token_type"]
assert len(sequences) == len(mask) assert len(sequences) == len(mask)
def test_number_of_added_tokens(self): def test_number_of_added_tokens(self):
...@@ -224,7 +224,7 @@ class CommonTestCases: ...@@ -224,7 +224,7 @@ class CommonTestCases:
total_length = len(sequence) + num_added_tokens total_length = len(sequence) + num_added_tokens
information = tokenizer.encode_plus(seq_0, max_length=total_length - 2, add_special_tokens=True, stride=stride) information = tokenizer.encode_plus(seq_0, max_length=total_length - 2, add_special_tokens=True, stride=stride)
truncated_sequence = information["sequence"] truncated_sequence = information["input_ids"]
overflowing_tokens = information["overflowing_tokens"] overflowing_tokens = information["overflowing_tokens"]
assert len(overflowing_tokens) == 2 + stride assert len(overflowing_tokens) == 2 + stride
...@@ -249,12 +249,12 @@ class CommonTestCases: ...@@ -249,12 +249,12 @@ class CommonTestCases:
) )
information = tokenizer.encode_plus(seq_0, seq_1, max_length=len(sequence) - 2, add_special_tokens=True, information = tokenizer.encode_plus(seq_0, seq_1, max_length=len(sequence) - 2, add_special_tokens=True,
stride=stride) stride=stride, truncate_first_sequence=False)
information_first_truncated = tokenizer.encode_plus(seq_0, seq_1, max_length=len(sequence) - 2, information_first_truncated = tokenizer.encode_plus(seq_0, seq_1, max_length=len(sequence) - 2,
add_special_tokens=True, stride=stride, add_special_tokens=True, stride=stride,
truncate_second_sequence_first=False) truncate_first_sequence=True)
truncated_sequence = information["sequence"] truncated_sequence = information["input_ids"]
overflowing_tokens = information["overflowing_tokens"] overflowing_tokens = information["overflowing_tokens"]
overflowing_tokens_first_truncated = information_first_truncated["overflowing_tokens"] overflowing_tokens_first_truncated = information_first_truncated["overflowing_tokens"]
......
...@@ -536,13 +536,7 @@ class PreTrainedTokenizer(object): ...@@ -536,13 +536,7 @@ class PreTrainedTokenizer(object):
if pair: if pair:
initial_tokens_len = len(self.encode("This is a sequence") + self.encode("This is another")) initial_tokens_len = len(self.encode("This is a sequence") + self.encode("This is another"))
final_tokens = self.encode("This is a sequence", "This is another", add_special_tokens=True) final_tokens_len = len(self.encode("This is a sequence", "This is another", add_special_tokens=True))
# In some models (e.g. GPT-2), there is no sequence pair encoding.
if len(final_tokens) == 2:
return 0
else:
final_tokens_len = len(final_tokens)
else: else:
initial_tokens_len = len(self.encode("This is a sequence")) initial_tokens_len = len(self.encode("This is a sequence"))
final_tokens_len = len(self.encode("This is a sequence", add_special_tokens=True)) final_tokens_len = len(self.encode("This is a sequence", add_special_tokens=True))
...@@ -700,86 +694,93 @@ class PreTrainedTokenizer(object): ...@@ -700,86 +694,93 @@ class PreTrainedTokenizer(object):
Same as doing ``self.convert_tokens_to_ids(self.tokenize(text))``. Same as doing ``self.convert_tokens_to_ids(self.tokenize(text))``.
Args: Args:
text: The first sequence to be encoded. text: The first sequence to be encoded. This can be a string, a list of strings (tokenized string using
text_pair: Optional second sequence to be encoded. the `tokenize` method) or a list of integers (tokenized string ids using the `convert_tokens_to_ids`
method)
text_pair: Optional second sequence to be encoded. This can be a string, a list of strings (tokenized
string using the `tokenize` method) or a list of integers (tokenized string ids using the
`convert_tokens_to_ids` method)
add_special_tokens: if set to ``True``, the sequences will be encoded with the special tokens relative add_special_tokens: if set to ``True``, the sequences will be encoded with the special tokens relative
to their model. to their model.
**kwargs: passed to the `self.tokenize()` method
""" """
if text_pair is None: return self.encode_plus(text, text_pair, add_special_tokens, **kwargs)["input_ids"]
if add_special_tokens:
sequence_tokens = self.convert_tokens_to_ids(self.tokenize(text, **kwargs)) if isinstance(text, six.string_types) else text
return self.add_special_tokens_single_sequence(sequence_tokens)
else:
ids = self.convert_tokens_to_ids(self.tokenize(text, **kwargs)) if isinstance(text, six.string_types) else text
return ids
first_sentence_tokens = [self._convert_token_to_id(token) for token in self.tokenize(text, **kwargs)] if isinstance(text, six.string_types) else text
second_sentence_tokens = [self._convert_token_to_id(token) for token in self.tokenize(text_pair, **kwargs)] if isinstance(text_pair, six.string_types) else text_pair
if add_special_tokens:
return self.add_special_tokens_sequence_pair(first_sentence_tokens, second_sentence_tokens)
else:
logger.warning("No special tokens were added. The two sequences have been concatenated.")
return first_sentence_tokens + second_sentence_tokens
def encode_plus(self, def encode_plus(self,
text, text,
text_pair=None, text_pair=None,
add_special_tokens=False, add_special_tokens=False,
output_mask=False, output_token_type=False,
max_length=None, max_length=None,
stride=0, stride=0,
truncate_second_sequence_first=True, truncate_first_sequence=True,
**kwargs): **kwargs):
""" """
Returns a dictionary containing the encoded sequence or sequence pair. Other values can be returned by this Returns a dictionary containing the encoded sequence or sequence pair. Other values can be returned by this
method: the mask for sequence classification and the overflowing elements if a ``max_length`` is specified. method: the mask for sequence classification and the overflowing elements if a ``max_length`` is specified.
Args: Args:
text: The first sequence to be encoded. text: The first sequence to be encoded. This can be a string, a list of strings (tokenized string using
text_pair: Optional second sequence to be encoded. the `tokenize` method) or a list of integers (tokenized string ids using the `convert_tokens_to_ids`
method)
text_pair: Optional second sequence to be encoded. This can be a string, a list of strings (tokenized
string using the `tokenize` method) or a list of integers (tokenized string ids using the
`convert_tokens_to_ids` method)
add_special_tokens: if set to ``True``, the sequences will be encoded with the special tokens relative add_special_tokens: if set to ``True``, the sequences will be encoded with the special tokens relative
to their model. to their model.
output_mask: if set to ``True``, returns the text pair corresponding mask with 0 for the first sequence, output_token_type: if set to ``True``, returns the text pair corresponding mask with 0 for the first sequence,
and 1 for the second. and 1 for the second.
max_length: if set to a number, will limit the total sequence returned so that it has a maximum length. max_length: if set to a number, will limit the total sequence returned so that it has a maximum length.
If there are overflowing tokens, those will be added to the returned dictionary If there are overflowing tokens, those will be added to the returned dictionary
stride: if set to a number along with max_length, the overflowing tokens returned will contain some tokens stride: if set to a number along with max_length, the overflowing tokens returned will contain some tokens
from the main sequence returned. The value of this argument defined the number of additional tokens. from the main sequence returned. The value of this argument defined the number of additional tokens.
truncate_second_sequence_first: if there is a specified max_length, this flag will choose which sequence truncate_first_sequence: if there is a specified max_length, this flag will choose which sequence
will be truncated. will be truncated.
**kwargs: passed to the `self.tokenize()` method **kwargs: passed to the `self.tokenize()` method
""" """
information = {} information = {}
def get_input_ids(text):
if isinstance(text, six.string_types):
input_ids = self.convert_tokens_to_ids(self.tokenize(text, **kwargs))
elif isinstance(text, (list, tuple)) and len(text) > 0 and isinstance(text[0], str):
input_ids = self.convert_tokens_to_ids(text)
elif isinstance(text, (list, tuple)) and len(text) > 0 and isinstance(text[0], int):
input_ids = text
else:
raise ValueError("Input is not valid. Should be a string, a list/tuple of strings or a list/tuple of integers.")
return input_ids
if text_pair is None: if text_pair is None:
sequence_tokens = self.convert_tokens_to_ids(self.tokenize(text, **kwargs)) if isinstance(text, six.string_types) else text sequence_tokens = get_input_ids(text)
if add_special_tokens: if add_special_tokens:
information = self.prepare_for_model(sequence_tokens, max_length, stride) information = self.prepare_for_model(sequence_tokens, max_length=max_length, stride=stride)
else: else:
if max_length: if max_length:
information["overflowing_tokens"] = sequence_tokens[max_length - stride:] information["overflowing_tokens"] = sequence_tokens[max_length - stride:]
sequence_tokens = sequence_tokens[:max_length] sequence_tokens = sequence_tokens[:max_length]
information["sequence"] = sequence_tokens information["input_ids"] = sequence_tokens
if output_mask: if output_token_type:
information["mask"] = [0] * len(information["sequence"]) information["output_token_type"] = [0] * len(information["input_ids"])
else: else:
first_sentence_tokens = [self._convert_token_to_id(token) for token in self.tokenize(text, **kwargs)] if isinstance(text, six.string_types) else text first_sentence_tokens = get_input_ids(text)
second_sentence_tokens = [self._convert_token_to_id(token) for token in self.tokenize(text_pair, **kwargs)] if isinstance(text_pair, six.string_types) else text_pair second_sentence_tokens = get_input_ids(text_pair)
if add_special_tokens: if add_special_tokens:
information = self.prepare_pair_for_model( information = self.prepare_pair_for_model(
first_sentence_tokens, first_sentence_tokens,
second_sentence_tokens, second_sentence_tokens,
max_length, max_length=max_length,
truncate_second_sequence_first, truncate_first_sequence=truncate_first_sequence,
stride stride=stride
) )
if output_mask: if output_token_type:
information["mask"] = self.create_mask_from_sequences(text, text_pair) information["output_token_type"] = self.create_mask_from_sequences(text, text_pair)
else: else:
logger.warning("No special tokens were added. The two sequences have been concatenated.") logger.warning("No special tokens were added. The two sequences have been concatenated.")
sequence = first_sentence_tokens + second_sentence_tokens sequence = first_sentence_tokens + second_sentence_tokens
...@@ -787,43 +788,78 @@ class PreTrainedTokenizer(object): ...@@ -787,43 +788,78 @@ class PreTrainedTokenizer(object):
if max_length: if max_length:
information["overflowing_tokens"] = sequence[max_length - stride:] information["overflowing_tokens"] = sequence[max_length - stride:]
sequence = sequence[:max_length] sequence = sequence[:max_length]
if output_mask: if output_token_type:
information["mask"] = [0] * len(sequence) information["output_token_type"] = [0] * len(sequence)
information["sequence"] = sequence information["input_ids"] = sequence
return information return information
def prepare_for_model(self, ids, max_length=None, stride=0): def prepare_for_model(self, ids, max_length=None, stride=0):
"""
Prepares a list of tokenized input ids so that it can be used by the model. It adds special tokens, truncates
sequences if overflowing while taking into account the special tokens and manages a window stride for
overflowing tokens
Args:
ids: list of tokenized input ids. Can be obtained from a string by chaining the
`tokenize` and `convert_tokens_to_ids` methods.
max_length: maximum length of the returned list. Will truncate by taking into account the special tokens.
stride: window stride for overflowing tokens. Can be useful for edge effect removal when using sequential
list of inputs.
Return:
a dictionary containing the `input_ids` as well as the `overflowing_tokens` if a `max_length` was given.
"""
information = {} information = {}
n_added_tokens = self.num_added_tokens()
if max_length: if max_length:
n_added_tokens = self.num_added_tokens()
information["overflowing_tokens"] = ids[max_length - n_added_tokens - stride:] information["overflowing_tokens"] = ids[max_length - n_added_tokens - stride:]
ids = ids[:max_length - n_added_tokens] ids = ids[:max_length - n_added_tokens]
information["sequence"] = self.add_special_tokens_single_sequence(ids) information["input_ids"] = self.add_special_tokens_single_sequence(ids)
return information return information
def prepare_pair_for_model(self, ids_0, ids_1, max_length=None, truncate_second_sequence_first=True, stride=0): def prepare_pair_for_model(self, ids_0, ids_1, max_length=None, truncate_first_sequence=True, stride=0):
"""
Prepares a list of tokenized input ids pair so that it can be used by the model. It adds special tokens,
truncates sequences if overflowing while taking into account the special tokens and manages a window stride for
overflowing tokens
Args:
ids_0: list of tokenized input ids. Can be obtained from a string by chaining the
`tokenize` and `convert_tokens_to_ids` methods.
ids_1: second list of tokenized input ids. Can be obtained from a string by chaining the
`tokenize` and `convert_tokens_to_ids` methods.
max_length: maximum length of the returned list. Will truncate by taking into account the special tokens.
truncate_first_sequence: if set to `True`, alongside a specified `max_length`, will truncate the first
sequence if the total size is superior than the specified `max_length`. If set to `False`, will
truncate the second sequence instead.
stride: window stride for overflowing tokens. Can be useful for edge effect removal when using sequential
list of inputs.
Return:
a dictionary containing the `input_ids` as well as the `overflowing_tokens` if a `max_length` was given.
"""
f_len, s_len = len(ids_0), len(ids_1) f_len, s_len = len(ids_0), len(ids_1)
n_added_tokens = self.num_added_tokens(pair=True)
information = {} information = {}
if max_length: if max_length:
n_added_tokens = self.num_added_tokens(pair=True)
if len(ids_0) + n_added_tokens >= max_length: if len(ids_0) + n_added_tokens >= max_length:
logger.warning( logger.warning(
"The first sequence is longer than the maximum specified length. This sequence will not be truncated.") "The first sequence is longer than the maximum specified length. This sequence will not be truncated.")
else: else:
if f_len + s_len + self.num_added_tokens(pair=True) > max_length: if f_len + s_len + self.num_added_tokens(pair=True) > max_length:
if truncate_second_sequence_first: if truncate_first_sequence:
information["overflowing_tokens"] = ids_1[max_length - f_len - n_added_tokens - stride:]
ids_1 = ids_1[:max_length - f_len - n_added_tokens]
else:
information["overflowing_tokens"] = ids_0[max_length - s_len - n_added_tokens - stride:] information["overflowing_tokens"] = ids_0[max_length - s_len - n_added_tokens - stride:]
ids_0 = ids_0[:max_length - s_len - n_added_tokens] ids_0 = ids_0[:max_length - s_len - n_added_tokens]
else:
information["overflowing_tokens"] = ids_1[max_length - f_len - n_added_tokens - stride:]
ids_1 = ids_1[:max_length - f_len - n_added_tokens]
sequence = self.add_special_tokens_sequence_pair(ids_0, ids_1) sequence = self.add_special_tokens_sequence_pair(ids_0, ids_1)
information["sequence"] = sequence information["input_ids"] = sequence
return information return information
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment