Commit a6981076 authored by thomwolf

various updates

parent 72402d1a
@@ -409,7 +409,6 @@ def convert_examples_to_features(examples, label_list, max_seq_length,
             example.text_a,
             example.text_b,
             add_special_tokens=True,
-            output_token_type=True,
             max_length=max_seq_length,
             truncate_first_sequence=True  # We're truncating the first sequence as a priority
         )
@@ -196,7 +196,7 @@ class CommonTestCases:
         if tokenizer.add_special_tokens_sequence_pair.__qualname__.split('.')[0] != "PreTrainedTokenizer":
             seq_0 = "Test this method."
             seq_1 = "With these inputs."
-            information = tokenizer.encode_plus(seq_0, seq_1, add_special_tokens=True, output_token_type=True)
+            information = tokenizer.encode_plus(seq_0, seq_1, add_special_tokens=True)
             sequences, mask = information["input_ids"], information["token_type_ids"]
             assert len(sequences) == len(mask)
@@ -204,7 +204,7 @@ class BertTokenizer(PreTrainedTokenizer):
         return cls + token_ids_0 + sep + token_ids_1 + sep

-    def create_token_type_ids_from_sequences(self, sequence_0, sequence_1):
+    def create_token_type_ids_from_sequences(self, token_ids_0, token_ids_1):
         """
         Creates a mask from the two sequences passed to be used in a sequence-pair classification task.
         A BERT sequence pair mask has the following format:
@@ -214,7 +214,7 @@ class BertTokenizer(PreTrainedTokenizer):
         sep = [self.sep_token_id]
         cls = [self.cls_token_id]
-        return len(cls + self.encode(sequence_0) + sep) * [0] + len(self.encode(sequence_1) + sep) * [1]
+        return len(cls + token_ids_0 + sep) * [0] + len(token_ids_1 + sep) * [1]

     def save_vocabulary(self, vocab_path):
         """Save the tokenizer vocabulary to a directory or file."""
@@ -97,7 +97,7 @@ class RobertaTokenizer(GPT2Tokenizer):
         cls = [self.cls_token_id]
         return cls + token_ids_0 + sep + sep + token_ids_1 + sep

-    def create_token_type_ids_from_sequences(self, sequence_0, sequence_1):
+    def create_token_type_ids_from_sequences(self, token_ids_0, token_ids_1):
         """
         Creates a mask from the two sequences passed to be used in a sequence-pair classification task.
         A RoBERTa sequence pair mask has the following format:
@@ -107,4 +107,4 @@ class RobertaTokenizer(GPT2Tokenizer):
         sep = [self.sep_token_id]
         cls = [self.cls_token_id]
-        return len(cls + self.encode(sequence_0) + sep + sep) * [0] + len(self.encode(sequence_1) + sep) * [1]
\ No newline at end of file
+        return len(cls + token_ids_0 + sep + sep) * [0] + len(token_ids_1 + sep) * [1]
\ No newline at end of file
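RoBERTa follows the same id-based pattern but inserts two separator tokens between the pair. A self-contained sketch with made-up ids (0 and 2 stand in for the usual <s>/</s> ids):

    cls, sep = [0], [2]                   # assumed <s> / </s> ids, for illustration only
    token_ids_0 = [31414, 232]            # made-up ids for the first sequence
    token_ids_1 = [9226, 16]              # made-up ids for the second sequence

    input_ids = cls + token_ids_0 + sep + sep + token_ids_1 + sep
    token_type_ids = len(cls + token_ids_0 + sep + sep) * [0] + len(token_ids_1 + sep) * [1]

    assert len(input_ids) == len(token_type_ids)
    print(token_type_ids)                 # [0, 0, 0, 0, 0, 1, 1, 1]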
@@ -704,13 +704,14 @@ class PreTrainedTokenizer(object):
                 to their model.
             **kwargs: passed to the `self.tokenize()` method
         """
-        return self.encode_plus(text, text_pair, add_special_tokens, **kwargs)["input_ids"]
+        encoded_inputs = self.encode_plus(text, text_pair=text_pair, add_special_tokens=add_special_tokens, **kwargs)
+        return encoded_inputs["input_ids"]

     def encode_plus(self,
                     text,
                     text_pair=None,
                     add_special_tokens=False,
-                    output_token_type=False,
                     max_length=None,
                     stride=0,
                     truncate_first_sequence=True,
@@ -728,8 +729,6 @@ class PreTrainedTokenizer(object):
                 `convert_tokens_to_ids` method)
             add_special_tokens: if set to ``True``, the sequences will be encoded with the special tokens relative
                 to their model.
-            output_token_type: if set to ``True``, returns the text pair corresponding mask with 0 for the first sequence,
-                and 1 for the second.
             max_length: if set to a number, will limit the total sequence returned so that it has a maximum length.
                 If there are overflowing tokens, those will be added to the returned dictionary
             stride: if set to a number along with max_length, the overflowing tokens returned will contain some tokens
@@ -739,133 +738,89 @@ class PreTrainedTokenizer(object):
             **kwargs: passed to the `self.tokenize()` method
         """
-        information = {}

         def get_input_ids(text):
             if isinstance(text, six.string_types):
-                input_ids = self.convert_tokens_to_ids(self.tokenize(text, **kwargs))
+                return self.convert_tokens_to_ids(self.tokenize(text, **kwargs))
             elif isinstance(text, (list, tuple)) and len(text) > 0 and isinstance(text[0], six.string_types):
-                input_ids = self.convert_tokens_to_ids(text)
+                return self.convert_tokens_to_ids(text)
             elif isinstance(text, (list, tuple)) and len(text) > 0 and isinstance(text[0], int):
-                input_ids = text
+                return text
             else:
                 raise ValueError("Input is not valid. Should be a string, a list/tuple of strings or a list/tuple of integers.")
-            return input_ids

-        if text_pair is None:
-            sequence_tokens = get_input_ids(text)
-            if add_special_tokens:
-                information = self.prepare_for_model(sequence_tokens, max_length=max_length, stride=stride)
-            else:
-                if max_length:
-                    information["overflowing_tokens"] = sequence_tokens[max_length - stride:]
-                    sequence_tokens = sequence_tokens[:max_length]
-                information["input_ids"] = sequence_tokens
-                if output_token_type:
-                    information["token_type_ids"] = [0] * len(information["input_ids"])
-        else:
-            first_sentence_tokens = get_input_ids(text)
-            second_sentence_tokens = get_input_ids(text_pair)
-            if add_special_tokens:
-                information = self.prepare_pair_for_model(
-                    first_sentence_tokens,
-                    second_sentence_tokens,
-                    max_length=max_length,
-                    truncate_first_sequence=truncate_first_sequence,
-                    stride=stride
-                )
-                if output_token_type:
-                    information["token_type_ids"] = self.create_token_type_ids_from_sequences(text, text_pair)
-            else:
-                logger.warning("No special tokens were added. The two sequences have been concatenated.")
-                sequence = first_sentence_tokens + second_sentence_tokens
-                if max_length:
-                    information["overflowing_tokens"] = sequence[max_length - stride:]
-                    sequence = sequence[:max_length]
-                if output_token_type:
-                    information["token_type_ids"] = [0] * len(sequence)
-                information["input_ids"] = sequence
-        return information
+        first_ids = get_input_ids(text)
+        second_ids = get_input_ids(text_pair) if text_pair is not None else None
+
+        return self.prepare_for_model(first_ids,
+                                      pair_ids=second_ids,
+                                      max_length=max_length,
+                                      add_special_tokens=add_special_tokens,
+                                      stride=stride,
+                                      truncate_first_sequence=truncate_first_sequence)

-    def prepare_for_model(self, ids, max_length=None, stride=0):
+    def prepare_for_model(self, ids, pair_ids=None, max_length=None, add_special_tokens=False, stride=0, truncate_first_sequence=True):
         """
-        Prepares a list of tokenized input ids so that it can be used by the model. It adds special tokens, truncates
+        Prepares a sequence of input id, or a pair of sequences of inputs ids so that it can be used by the model.
+        It adds special tokens, truncates
         sequences if overflowing while taking into account the special tokens and manages a window stride for
         overflowing tokens

         Args:
             ids: list of tokenized input ids. Can be obtained from a string by chaining the
                 `tokenize` and `convert_tokens_to_ids` methods.
-            max_length: maximum length of the returned list. Will truncate by taking into account the special tokens.
-            stride: window stride for overflowing tokens. Can be useful for edge effect removal when using sequential
-                list of inputs.
-        Return:
-            a dictionary containing the `input_ids` as well as the `overflowing_tokens` if a `max_length` was given.
-        """
-        information = {}
-        if max_length:
-            n_added_tokens = self.num_added_tokens()
-            information["overflowing_tokens"] = ids[max_length - n_added_tokens - stride:]
-            ids = ids[:max_length - n_added_tokens]
-
-        information["input_ids"] = self.add_special_tokens_single_sequence(ids)
-
-        return information
-
-    def prepare_pair_for_model(self, ids_0, ids_1, max_length=None, truncate_first_sequence=True, stride=0):
-        """
-        Prepares a list of tokenized input ids pair so that it can be used by the model. It adds special tokens,
-        truncates sequences if overflowing while taking into account the special tokens and manages a window stride for
-        overflowing tokens
-
-        Args:
-            ids_0: list of tokenized input ids. Can be obtained from a string by chaining the
-                `tokenize` and `convert_tokens_to_ids` methods.
-            ids_1: second list of tokenized input ids. Can be obtained from a string by chaining the
+            pair_ids: Optional second list of input ids. Can be obtained from a string by chaining the
                 `tokenize` and `convert_tokens_to_ids` methods.
             max_length: maximum length of the returned list. Will truncate by taking into account the special tokens.
-            truncate_first_sequence: if set to `True`, alongside a specified `max_length`, will truncate the first
-                sequence if the total size is superior than the specified `max_length`. If set to `False`, will
-                truncate the second sequence instead.
+            add_special_tokens: if set to ``True``, the sequences will be encoded with the special tokens relative
+                to their model.
             stride: window stride for overflowing tokens. Can be useful for edge effect removal when using sequential
                 list of inputs.
+            truncate_first_sequence: if set to `True` and an optional second list of input ids is provided,
+                alongside a specified `max_length`, will truncate the first sequence if the total size is superior
+                than the specified `max_length`. If set to `False`, will truncate the second sequence instead.
         Return:
             a dictionary containing the `input_ids` as well as the `overflowing_tokens` if a `max_length` was given.
         """
-        f_len, s_len = len(ids_0), len(ids_1)
-        information = {}
+        pair = bool(pair_ids is not None)
+        len_ids = len(ids)
+        len_pair_ids = len(pair_ids) if pair else 0
+
+        encoded_inputs = {}
         if max_length:
-            n_added_tokens = self.num_added_tokens(pair=True)
-            if len(ids_0) + n_added_tokens >= max_length:
+            n_added_tokens = self.num_added_tokens(pair=pair) if add_special_tokens else 0
+            if pair and n_added_tokens + (len_pair_ids if truncate_first_sequence else len_ids) >= max_length:
                 logger.warning(
-                    "The first sequence is longer than the maximum specified length. This sequence will not be truncated.")
+                    "You supplied a pair of sequence in which the sequence that will not be truncated is longer than the maximum specified length."
+                    "This pair of sequences will not be truncated.")
             else:
-                if f_len + s_len + self.num_added_tokens(pair=True) > max_length:
-                    if truncate_first_sequence:
-                        information["overflowing_tokens"] = ids_0[max_length - s_len - n_added_tokens - stride:]
-                        ids_0 = ids_0[:max_length - s_len - n_added_tokens]
+                if n_added_tokens + len_ids + len_pair_ids > max_length:
+                    if truncate_first_sequence or not pair:
+                        encoded_inputs["overflowing_tokens"] = ids[max_length - len_pair_ids - n_added_tokens - stride:]
+                        ids = ids[:max_length - len_pair_ids - n_added_tokens]
+                    elif not truncate_first_sequence and pair:
+                        encoded_inputs["overflowing_tokens"] = pair_ids[max_length - len_ids - n_added_tokens - stride:]
+                        pair_ids = pair_ids[:max_length - len_ids - n_added_tokens]
                     else:
-                        information["overflowing_tokens"] = ids_1[max_length - f_len - n_added_tokens - stride:]
-                        ids_1 = ids_1[:max_length - f_len - n_added_tokens]
+                        logger.warning(
+                            "Cannot truncate second sequence as it is not provided. No truncation.")

-        sequence = self.add_special_tokens_sequence_pair(ids_0, ids_1)
-        information["input_ids"] = sequence
+        if add_special_tokens:
+            sequence = self.add_special_tokens_sequence_pair(ids, pair_ids) if pair else self.add_special_tokens_single_sequence(ids)
+            token_type_ids = self.create_token_type_ids_from_sequences(ids, pair_ids) if pair else [0] * len(sequence)
+        else:
+            sequence = ids + pair_ids if pair else ids
+            token_type_ids = [0] * len(ids) + ([1] * len(pair_ids) if pair else [])

-        return information
+        encoded_inputs["input_ids"] = sequence
+        encoded_inputs["token_type_ids"] = token_type_ids
+
+        return encoded_inputs

-    def create_token_type_ids_from_sequences(self, sequence_0, sequence_1):
+    def create_token_type_ids_from_sequences(self, token_ids_0, token_ids_1):
         logger.warning("This tokenizer does not make use of special tokens.")
-        return [0] * len(self.encode(sequence_0)) + [1] * len(self.encode(sequence_1))
+        return [0] * len(token_ids_0) + [1] * len(token_ids_1)

     def add_special_tokens_single_sequence(self, token_ids):
         logger.warning("This tokenizer does not make use of special tokens. The sequence has been returned with no modification.")
@@ -770,7 +770,7 @@ class XLMTokenizer(PreTrainedTokenizer):
         cls = [self.cls_token_id]
         return cls + token_ids_0 + sep + token_ids_1 + sep

-    def create_token_type_ids_from_sequences(self, sequence_0, sequence_1):
+    def create_token_type_ids_from_sequences(self, token_ids_0, token_ids_1):
         """
         Creates a mask from the two sequences passed to be used in a sequence-pair classification task.
         An XLM sequence pair mask has the following format:
@@ -780,7 +780,7 @@ class XLMTokenizer(PreTrainedTokenizer):
         sep = [self.sep_token_id]
         cls = [self.cls_token_id]
-        return len(cls + self.encode(sequence_0) + sep) * [0] + len(self.encode(sequence_1) + sep) * [1]
+        return len(cls + token_ids_0 + sep) * [0] + len(token_ids_1 + sep) * [1]

     def save_vocabulary(self, save_directory):
         """Save the tokenizer vocabulary and merge files to a directory."""
@@ -200,7 +200,7 @@ class XLNetTokenizer(PreTrainedTokenizer):
         cls = [self.cls_token_id]
         return token_ids_0 + sep + token_ids_1 + sep + cls

-    def create_token_type_ids_from_sequences(self, sequence_0, sequence_1):
+    def create_token_type_ids_from_sequences(self, token_ids_0, token_ids_1):
         """
         Creates a mask from the two sequences passed to be used in a sequence-pair classification task.
         A BERT sequence pair mask has the following format:
@@ -211,7 +211,7 @@ class XLNetTokenizer(PreTrainedTokenizer):
         cls = [self.cls_token_id]
         cls_segment_id = [2]
-        return len(self.encode(sequence_0) + sep) * [0] + len(self.encode(sequence_1) + sep) * [1] + cls_segment_id
+        return len(token_ids_0 + sep) * [0] + len(token_ids_1 + sep) * [1] + cls_segment_id

     def save_vocabulary(self, save_directory):
         """ Save the sentencepiece vocabulary (copy original file) and special tokens file