Commit c10c7d59 authored by LysandreJik

Mask computing in standalone method. Tests.

parent bf503158
@@ -187,18 +187,18 @@ class CommonTestCases:
         for weights_list_2 in weights_lists_2:
             self.assertListEqual(weights_list, weights_list_2)

-    # def test_mask_output(self):
-    #     if sys.version_info <= (3, 0):
-    #         return
-    #
-    #     tokenizer = self.get_tokenizer()
-    #
-    #     if tokenizer.add_special_tokens_sequence_pair.__qualname__.split('.')[0] != "PreTrainedTokenizer":
-    #         seq_0 = "Test this method."
-    #         seq_1 = "With these inputs."
-    #         information = tokenizer.encode_plus(seq_0, seq_1, add_special_tokens=True, output_mask=True)
-    #         sequences, mask = information["sequence"], information["mask"]
-    #         assert len(sequences) == len(mask)
+    def test_mask_output(self):
+        if sys.version_info <= (3, 0):
+            return
+
+        tokenizer = self.get_tokenizer()
+
+        if tokenizer.add_special_tokens_sequence_pair.__qualname__.split('.')[0] != "PreTrainedTokenizer":
+            seq_0 = "Test this method."
+            seq_1 = "With these inputs."
+            information = tokenizer.encode_plus(seq_0, seq_1, add_special_tokens=True, output_mask=True)
+            sequences, mask = information["sequence"], information["mask"]
+            assert len(sequences) == len(mask)

     def test_number_of_added_tokens(self):
         tokenizer = self.get_tokenizer()
@@ -204,6 +204,18 @@ class BertTokenizer(PreTrainedTokenizer):
         return cls + token_ids_0 + sep + token_ids_1 + sep

+    def create_mask_from_sequences(self, sequence_0, sequence_1):
+        """
+        Creates a mask from the two sequences passed to be used in a sequence-pair classification task.
+        A BERT sequence pair mask has the following format:
+        0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1
+        | first sequence    | second sequence
+        """
+        sep = [self.sep_token_id]
+        cls = [self.cls_token_id]
+        return len(cls + self.encode(sequence_0) + sep) * [0] + len(self.encode(sequence_1) + sep) * [1]
+
     def save_vocabulary(self, vocab_path):
         """Save the tokenizer vocabulary to a directory or file."""
         index = 0
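For reference, a minimal sketch of how the new BERT method lines up with the existing sequence-pair method. The import path and the "bert-base-uncased" shortcut are assumptions for illustration, not part of this commit:

    from pytorch_transformers import BertTokenizer

    # Assumption: the "bert-base-uncased" vocabulary is downloadable or already cached.
    tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

    seq_0, seq_1 = "Test this method.", "With these inputs."

    # 0s cover [CLS] + first sequence + [SEP]; 1s cover second sequence + [SEP].
    mask = tokenizer.create_mask_from_sequences(seq_0, seq_1)

    # The mask should have one entry per id produced by the sequence-pair method.
    ids = tokenizer.add_special_tokens_sequence_pair(tokenizer.encode(seq_0), tokenizer.encode(seq_1))
    assert len(mask) == len(ids)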
@@ -64,12 +64,18 @@ class DistilBertTokenizer(BertTokenizer):
     def add_special_tokens_single_sequence(self, token_ids):
         return token_ids

-    def add_special_tokens_sequence_pair(self, token_ids_0, token_ids_1, output_mask=False):
+    def add_special_tokens_sequence_pair(self, token_ids_0, token_ids_1):
         sep = [self.sep_token_id]
-        if output_mask:
-            return (
-                token_ids_0 + sep + token_ids_1,
-                [0] * len(token_ids_0 + sep) + [1] * len(token_ids_1)
-            )
-        else:
-            return token_ids_0 + sep + token_ids_1
+        return token_ids_0 + sep + token_ids_1
+
+    def create_mask_from_sequences(self, sequence_0, sequence_1):
+        """
+        Creates a mask from the two sequences passed to be used in a sequence-pair classification task.
+        A DistilBERT sequence pair mask has the following format:
+        0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1
+        | first sequence    | second sequence
+        """
+        sep = [self.sep_token_id]
+        cls = [self.cls_token_id]
+        return len(self.encode(sequence_0) + sep) * [0] + len(self.encode(sequence_1)) * [1]
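Unlike the BERT version above, the DistilBERT mask accounts for no leading [CLS] and no trailing separator on the second sequence, matching what add_special_tokens_sequence_pair now returns. A hedged check with made-up token counts:

    # Hypothetical token counts, purely illustrative.
    n_0, n_1 = 3, 4
    mask = (n_0 + 1) * [0] + n_1 * [1]     # first sequence + sep as 0s, second sequence as 1s
    assert len(mask) == n_0 + 1 + n_1      # same length as token_ids_0 + sep + token_ids_1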
@@ -96,3 +96,15 @@ class RobertaTokenizer(GPT2Tokenizer):
         sep = [self.sep_token_id]
         cls = [self.cls_token_id]
         return cls + token_ids_0 + sep + sep + token_ids_1 + sep
+
+    def create_mask_from_sequences(self, sequence_0, sequence_1):
+        """
+        Creates a mask from the two sequences passed to be used in a sequence-pair classification task.
+        A RoBERTa sequence pair mask has the following format:
+        0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1
+        | first sequence    | second sequence
+        """
+        sep = [self.sep_token_id]
+        cls = [self.cls_token_id]
+        return len(cls + self.encode(sequence_0) + sep + sep) * [0] + len(self.encode(sequence_1) + sep) * [1]
\ No newline at end of file
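The RoBERTa mask mirrors the double separator placed between the two sequences. A hedged sanity check with made-up token counts:

    # Hypothetical token counts, purely illustrative.
    n_0, n_1 = 3, 4
    mask = (1 + n_0 + 2) * [0] + (n_1 + 1) * [1]   # <s> + seq_0 + </s></s> as 0s, seq_1 + </s> as 1s
    assert len(mask) == 1 + n_0 + 2 + n_1 + 1      # same length as the ids from add_special_tokens_sequence_pair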
@@ -779,8 +779,8 @@ class PreTrainedTokenizer(object):
                 second_sentence_tokens
             )

-            # if output_mask:
-            #     sequence, information["mask"] = encoded_sequence
+            if output_mask:
+                information["mask"] = self.create_mask_from_sequences(text, text_pair)

            information["sequence"] = sequence
        else:
@@ -797,6 +797,10 @@ class PreTrainedTokenizer(object):
         return information

+    def create_mask_from_sequences(self, sequence_0, sequence_1):
+        logger.warning("This tokenizer does not make use of special tokens.")
+        return [0] * len(self.encode(sequence_0)) + [1] * len(self.encode(sequence_1))
+
     def add_special_tokens_single_sequence(self, token_ids):
         logger.warning("This tokenizer does not make use of special tokens. The sequence has been returned with no modification.")
         return token_ids
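A rough sketch of the call path this enables, i.e. the behavior exercised by the new test above; the tokenizer class and shortcut name are assumptions for illustration:

    from pytorch_transformers import BertTokenizer

    # Assumption: "bert-base-uncased" is downloadable or already cached locally.
    tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

    information = tokenizer.encode_plus(
        "Test this method.", "With these inputs.",
        add_special_tokens=True, output_mask=True
    )
    sequence, mask = information["sequence"], information["mask"]
    assert len(sequence) == len(mask)   # same invariant asserted by test_mask_output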
@@ -770,6 +770,18 @@ class XLMTokenizer(PreTrainedTokenizer):
         cls = [self.cls_token_id]
         return cls + token_ids_0 + sep + token_ids_1 + sep

+    def create_mask_from_sequences(self, sequence_0, sequence_1):
+        """
+        Creates a mask from the two sequences passed to be used in a sequence-pair classification task.
+        An XLM sequence pair mask has the following format:
+        0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1
+        | first sequence    | second sequence
+        """
+        sep = [self.sep_token_id]
+        cls = [self.cls_token_id]
+        return len(cls + self.encode(sequence_0) + sep) * [0] + len(self.encode(sequence_1) + sep) * [1]
+
     def save_vocabulary(self, save_directory):
         """Save the tokenizer vocabulary and merge files to a directory."""
         if not os.path.isdir(save_directory):
@@ -198,9 +198,21 @@ class XLNetTokenizer(PreTrainedTokenizer):
         sep = [self.sep_token_id]
         cls = [self.cls_token_id]
-        cls_segment_ids = [2]
         return token_ids_0 + sep + token_ids_1 + sep + cls

+    def create_mask_from_sequences(self, sequence_0, sequence_1):
+        """
+        Creates a mask from the two sequences passed to be used in a sequence-pair classification task.
+        An XLNet sequence pair mask has the following format:
+        0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 2
+        | first sequence    | second sequence     | CLS segment ID
+        """
+        sep = [self.sep_token_id]
+        cls = [self.cls_token_id]
+        cls_segment_id = [2]
+        return len(self.encode(sequence_0) + sep) * [0] + len(self.encode(sequence_1) + sep) * [1] + cls_segment_id
+
     def save_vocabulary(self, save_directory):
         """ Save the sentencepiece vocabulary (copy original file) and special tokens file
         to a directory.
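Because XLNet places the classification token at the end of the pair, the mask ends with a single segment id of 2. A hedged check with made-up token counts:

    # Hypothetical token counts, purely illustrative.
    n_0, n_1 = 3, 4
    mask = (n_0 + 1) * [0] + (n_1 + 1) * [1] + [2]   # seq_0 + sep, seq_1 + sep, then the CLS segment id
    assert len(mask) == n_0 + 1 + n_1 + 1 + 1        # same length as token_ids_0 + sep + token_ids_1 + sep + cls
    assert mask[-1] == 2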