Commit 72402d1a authored by LysandreJik's avatar LysandreJik
Browse files

Fixed DistilBERT tokenizer

parent d340e232
......@@ -39,8 +39,10 @@ class DistilBertTokenizationTest(BertTokenizationTest):
encoded_sentence = tokenizer.add_special_tokens_single_sequence(text)
encoded_pair = tokenizer.add_special_tokens_sequence_pair(text, text_2)
assert encoded_sentence == text
assert encoded_pair == text + [102] + text_2
assert encoded_sentence == [tokenizer.cls_token_id] + text + [tokenizer.sep_token_id]
assert encoded_pair == [tokenizer.cls_token_id] + text + [tokenizer.sep_token_id] + \
text_2 + [tokenizer.sep_token_id]
# Run the test suite when this module is executed directly.
if __name__ == "__main__":
    unittest.main()
......@@ -60,10 +60,3 @@ class DistilBertTokenizer(BertTokenizer):
# Filenames of the vocabulary files expected on disk (presumably the BERT
# wordpiece vocab, since this class derives from BertTokenizer — confirm).
vocab_files_names = VOCAB_FILES_NAMES
# Map from pretrained model shortcut names to hosted vocabulary file locations.
pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
# NOTE(review): despite the name, this is assigned from the positional-embedding
# sizes table — it caps the model input length per pretrained checkpoint.
max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
def add_special_tokens_single_sequence(self, token_ids):
    """Add the special tokens required by a BERT-style model to a single sequence.

    The returned sequence has the format ``[CLS] X [SEP]``, matching the
    expectation asserted by ``DistilBertTokenizationTest`` in this commit.

    Args:
        token_ids: list of token ids for the sequence (without special tokens).

    Returns:
        A new list: ``[cls_token_id] + token_ids + [sep_token_id]``.
    """
    # The previous implementation returned token_ids unchanged, which
    # contradicted both the method name and the accompanying test.
    return [self.cls_token_id] + token_ids + [self.sep_token_id]
def add_special_tokens_sequence_pair(self, token_ids_0, token_ids_1):
    """Add the special tokens required by a BERT-style model to a sequence pair.

    The returned sequence has the format ``[CLS] A [SEP] B [SEP]``, matching the
    expectation asserted by ``DistilBertTokenizationTest`` in this commit.

    Args:
        token_ids_0: list of token ids for the first sequence.
        token_ids_1: list of token ids for the second sequence.

    Returns:
        A new list: ``[cls] + token_ids_0 + [sep] + token_ids_1 + [sep]``.
    """
    cls = [self.cls_token_id]
    sep = [self.sep_token_id]
    # The previous implementation omitted the leading [CLS] and the final
    # [SEP], producing sequences the model was not pretrained on.
    return cls + token_ids_0 + sep + token_ids_1 + sep
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment.