Commit 2f259b22 authored by LysandreJik

Sequence IDS

parent 7c789c33
@@ -292,3 +292,33 @@ class CommonTestCases:
        assert tokenizer.encode(tokens, add_special_tokens=True) == formatted_input
        assert tokenizer.encode(input_ids, add_special_tokens=True) == formatted_input
    def test_sequence_ids(self):
        tokenizer = self.get_tokenizer()

        sequence_0 = "Encode this."
        sequence_1 = "This one too please."

        # Testing single inputs
        encoded_sequence = tokenizer.encode(sequence_0)
        encoded_sequence_dict = tokenizer.encode_plus(sequence_0, add_special_tokens=True)
        encoded_sequence_w_special = encoded_sequence_dict["input_ids"]
        sequence_ids = encoded_sequence_dict["sequence_ids"]
        assert len(sequence_ids) == len(encoded_sequence_w_special)

        filtered_sequence = [(x if sequence_ids[i] else None) for i, x in enumerate(encoded_sequence_w_special)]
        filtered_sequence = [x for x in filtered_sequence if x is not None]
        assert encoded_sequence == filtered_sequence

        # Testing input pairs
        encoded_sequence = tokenizer.encode(sequence_0) + tokenizer.encode(sequence_1)
        encoded_sequence_dict = tokenizer.encode_plus(sequence_0, sequence_1, add_special_tokens=True)
        encoded_sequence_w_special = encoded_sequence_dict["input_ids"]
        sequence_ids = encoded_sequence_dict["sequence_ids"]
        assert len(sequence_ids) == len(encoded_sequence_w_special)

        filtered_sequence = [(x if sequence_ids[i] else None) for i, x in enumerate(encoded_sequence_w_special)]
        filtered_sequence = [x for x in filtered_sequence if x is not None]
        assert encoded_sequence == filtered_sequence
@@ -204,6 +204,24 @@ class BertTokenizer(PreTrainedTokenizer):
        return cls + token_ids_0 + sep + token_ids_1 + sep
    def get_sequence_ids(self, token_ids_0, token_ids_1=None):
        """
        Retrieves sequence ids from a token list that has no special tokens added. This method is called when
        adding special tokens using the tokenizer ``prepare_for_model`` or ``encode_plus`` methods.

        Args:
            token_ids_0: list of ids (must not contain special tokens)
            token_ids_1: Optional list of ids (must not contain special tokens), necessary when fetching
                sequence ids for sequence pairs

        Returns:
            A list of integers in the range [0, 1]: 0 for a special token, 1 for a sequence token.
        """
        if token_ids_1:
            return [0] + ([1] * len(token_ids_0)) + [0] + ([1] * len(token_ids_1)) + [0]
        else:
            return [0] + ([1] * len(token_ids_0)) + [0]
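For BERT the pair layout is [CLS] A [SEP] B [SEP], so the mask marks the three added tokens with 0. A minimal self-contained sketch of the alignment (the ids below are arbitrary placeholders, not real vocabulary ids):

# Alignment between the formatted pair and the returned mask (placeholder ids):
ids_0, ids_1 = [7, 8, 9], [4, 5]
formatted = ["[CLS]", *ids_0, "[SEP]", *ids_1, "[SEP]"]
mask = [0] + [1] * len(ids_0) + [0] + [1] * len(ids_1) + [0]
assert len(mask) == len(formatted)
assert [t for t, m in zip(formatted, mask) if m == 1] == ids_0 + ids_1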
    def create_token_type_ids_from_sequences(self, token_ids_0, token_ids_1):
        """
        Creates a mask from the two sequences passed to be used in a sequence-pair classification task.
...
@@ -100,6 +100,24 @@ class RobertaTokenizer(GPT2Tokenizer):
        cls = [self.cls_token_id]
        return cls + token_ids_0 + sep + sep + token_ids_1 + sep
    def get_sequence_ids(self, token_ids_0, token_ids_1=None):
        """
        Retrieves sequence ids from a token list that has no special tokens added. This method is called when
        adding special tokens using the tokenizer ``prepare_for_model`` or ``encode_plus`` methods.

        Args:
            token_ids_0: list of ids (must not contain special tokens)
            token_ids_1: Optional list of ids (must not contain special tokens), necessary when fetching
                sequence ids for sequence pairs

        Returns:
            A list of integers in the range [0, 1]: 0 for a special token, 1 for a sequence token.
        """
        if token_ids_1:
            return [0] + ([1] * len(token_ids_0)) + [0, 0] + ([1] * len(token_ids_1)) + [0]
        else:
            return [0] + ([1] * len(token_ids_0)) + [0]
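RoBERTa joins a pair with a double separator (<s> A </s></s> B </s>), which is why the mask carries [0, 0] in the middle. The same alignment check, again with placeholder ids:

# Placeholder ids; note the two consecutive zeros for the double </s>.
ids_0, ids_1 = [7, 8], [4, 5, 6]
formatted = ["<s>", *ids_0, "</s>", "</s>", *ids_1, "</s>"]
mask = [0] + [1] * len(ids_0) + [0, 0] + [1] * len(ids_1) + [0]
assert len(mask) == len(formatted)
assert [t for t, m in zip(formatted, mask) if m == 1] == ids_0 + ids_1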
    def create_token_type_ids_from_sequences(self, token_ids_0, token_ids_1):
        """
        Creates a mask from the two sequences passed to be used in a sequence-pair classification task.
...
@@ -826,7 +826,21 @@ class PreTrainedTokenizer(object):
                or PyTorch torch.Tensor instead of a list of python integers.

        Return:
            A Dictionary of shape::

                {
                    input_ids: list[int],
                    overflowing_tokens: list[int] if a ``max_length`` is specified, else None,
                    sequence_ids: list[int] if ``add_special_tokens`` is set to ``True``
                }

            With the fields:
                ``input_ids``: list of token ids to be fed to a model
                ``overflowing_tokens``: list of overflowing tokens if a max length is specified
                ``sequence_ids``: if special tokens are added, a list of [0, 1] values, with 0 marking an
                added special token and 1 marking a sequence token
        """
        pair = bool(pair_ids is not None)
        len_ids = len(ids)
@@ -859,6 +873,7 @@ class PreTrainedTokenizer(object):
        if add_special_tokens:
            sequence = self.add_special_tokens_sequence_pair(ids, pair_ids) if pair else self.add_special_tokens_single_sequence(ids)
            token_type_ids = self.create_token_type_ids_from_sequences(ids, pair_ids) if pair else [0] * len(sequence)
            encoded_inputs["sequence_ids"] = self.get_sequence_ids(ids, pair_ids)
        else:
            sequence = ids + pair_ids if pair else ids
            token_type_ids = [0] * len(ids) + ([1] * len(pair_ids) if pair else [])
@@ -893,6 +908,9 @@ class PreTrainedTokenizer(object):
            logger.warning("This tokenizer does not make use of special tokens. The two sequences have been concatenated.")
            return token_ids_0 + token_ids_1
    def get_sequence_ids(self, token_ids_0, token_ids_1=None):
        # Base-class fallback for tokenizers without special tokens:
        # every position belongs to a sequence, so the mask is all 1s.
        return [1] * ((len(token_ids_1) if token_ids_1 else 0) + len(token_ids_0))
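Downstream code can use the returned mask to recover the bare sequence again, mirroring the new test above. A minimal sketch, assuming ``tokenizer`` is an already-loaded instance (e.g. from ``BertTokenizer.from_pretrained``); the snippet is illustrative, not part of this diff:

# Drop the added special tokens from an encode_plus output.
encoded = tokenizer.encode_plus("Encode this.", add_special_tokens=True)
bare = [tok for tok, seq_id in zip(encoded["input_ids"], encoded["sequence_ids"]) if seq_id == 1]
assert bare == tokenizer.encode("Encode this.")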
    def convert_ids_to_tokens(self, ids, skip_special_tokens=False):
        """ Converts a single index (resp. a sequence of indices) into a token
            (resp. a sequence of tokens, str/unicode), using the vocabulary and added tokens.
...
@@ -770,6 +770,24 @@ class XLMTokenizer(PreTrainedTokenizer):
        cls = [self.cls_token_id]
        return cls + token_ids_0 + sep + token_ids_1 + sep
    def get_sequence_ids(self, token_ids_0, token_ids_1=None):
        """
        Retrieves sequence ids from a token list that has no special tokens added. This method is called when
        adding special tokens using the tokenizer ``prepare_for_model`` or ``encode_plus`` methods.

        Args:
            token_ids_0: list of ids (must not contain special tokens)
            token_ids_1: Optional list of ids (must not contain special tokens), necessary when fetching
                sequence ids for sequence pairs

        Returns:
            A list of integers in the range [0, 1]: 0 for a special token, 1 for a sequence token.
        """
        if token_ids_1:
            return [0] + ([1] * len(token_ids_0)) + [0] + ([1] * len(token_ids_1)) + [0]
        else:
            return [0] + ([1] * len(token_ids_0)) + [0]
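XLM formats single inputs and pairs the same way as BERT (cls + A + sep, and cls + A + sep + B + sep), so the mask layout matches the BERT sketch above; only the special tokens themselves differ.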
    def create_token_type_ids_from_sequences(self, token_ids_0, token_ids_1):
        """
        Creates a mask from the two sequences passed to be used in a sequence-pair classification task.
...
@@ -200,6 +200,24 @@ class XLNetTokenizer(PreTrainedTokenizer):
        cls = [self.cls_token_id]
        return token_ids_0 + sep + token_ids_1 + sep + cls
    def get_sequence_ids(self, token_ids_0, token_ids_1=None):
        """
        Retrieves sequence ids from a token list that has no special tokens added. This method is called when
        adding special tokens using the tokenizer ``prepare_for_model`` or ``encode_plus`` methods.

        Args:
            token_ids_0: list of ids (must not contain special tokens)
            token_ids_1: Optional list of ids (must not contain special tokens), necessary when fetching
                sequence ids for sequence pairs

        Returns:
            A list of integers in the range [0, 1]: 0 for a special token, 1 for a sequence token.
        """
        if token_ids_1:
            return ([1] * len(token_ids_0)) + [0] + ([1] * len(token_ids_1)) + [0, 0]
        else:
            return ([1] * len(token_ids_0)) + [0, 0]
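Unlike the tokenizers above, XLNet appends its special tokens at the end (A <sep> B <sep> <cls>), so the zeros trail the sequence tokens. The same kind of alignment check with placeholder ids:

# Placeholder ids; XLNet's specials come last, hence the trailing [0, 0].
ids_0, ids_1 = [7, 8], [4, 5]
formatted = [*ids_0, "<sep>", *ids_1, "<sep>", "<cls>"]
mask = [1] * len(ids_0) + [0] + [1] * len(ids_1) + [0, 0]
assert len(mask) == len(formatted)
assert [t for t, m in zip(formatted, mask) if m == 1] == ids_0 + ids_1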
    def create_token_type_ids_from_sequences(self, token_ids_0, token_ids_1):
        """
        Creates a mask from the two sequences passed to be used in a sequence-pair classification task.
...