Unverified Commit 36434220 authored by Anthony MOI, committed by GitHub

[HUGE] Refactoring tokenizers backend - padding - truncation - pre-tokenized pipeline - fast tokenizers - tests (#4510)

* Use tokenizers pre-tokenized pipeline

* failing pretokenized test

* Fix is_pretokenized in python

* add pretokenized tests

* style and quality

* better tests for batched pretokenized inputs

* tokenizers clean up - new padding_strategy - split the files

* [HUGE] refactoring tokenizers - padding - truncation - tests

* style and quality

* bump up required tokenizers version to 0.8.0-rc1

* switched padding/truncation API - simpler, better backward compat (sketched below)

* updating tests for custom tokenizers

* style and quality - tests on pad

* fix QA pipeline

* fix backward compatibility for max_length only

* style and quality

* Various clean-ups - add verbose

* fix tests

* update docstrings

* Fix tests

* Docs reformatted

* __call__ method documented
Co-authored-by: Thomas Wolf <thomwolf@users.noreply.github.com>
Co-authored-by: Lysandre <lysandre.debut@reseau.eseo.fr>
parent ebba39e4
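For orientation, a minimal sketch of the unified `__call__` API this commit converges on, covering the new padding/truncation arguments and the pre-tokenized pipeline. It assumes a transformers checkout at this commit with tokenizers >= 0.8.0-rc1; the checkpoint name is illustrative.

```python
# Sketch of the refactored tokenizer API; checkpoint name is illustrative.
from transformers import BertTokenizerFast

tokenizer = BertTokenizerFast.from_pretrained("bert-base-uncased")

# Padding and truncation are now explicit, self-describing arguments:
batch = tokenizer(
    ["a short sentence", "a much longer sentence that gets truncated"],
    padding=True,       # False, True, "longest", or "max_length"
    truncation=True,    # False, True, "longest_first", "only_first", "only_second"
    max_length=8,
)
print(batch["input_ids"])

# Pre-tokenized input goes through the dedicated pre-tokenized pipeline:
encoded = tokenizer(
    ["this", "text", "is", "already", "split", "into", "words"],
    is_pretokenized=True,
)
print(encoded["input_ids"])
```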
@@ -51,10 +51,10 @@ class MarianTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
         tokenizer = MarianTokenizer.from_pretrained(self.tmpdirname)
         tokenizer.save_pretrained(self.tmpdirname)

-    def get_tokenizer(self, max_len=None, **kwargs) -> MarianTokenizer:
-        return MarianTokenizer.from_pretrained(self.tmpdirname, model_max_length=max_len, **kwargs)
+    def get_tokenizer(self, **kwargs) -> MarianTokenizer:
+        return MarianTokenizer.from_pretrained(self.tmpdirname, **kwargs)

-    def get_input_output_texts(self):
+    def get_input_output_texts(self, tokenizer):
         return (
             "This is a test",
             "This is a test",
...
@@ -64,7 +64,7 @@ class OpenAIGPTTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
         with open(self.merges_file, "w") as fp:
             fp.write("\n".join(merges))

-    def get_input_output_texts(self):
+    def get_input_output_texts(self, tokenizer):
         return "lower newer", "lower newer"

     def test_full_tokenizer(self):
...
@@ -18,7 +18,7 @@ import json
 import os
 import unittest

-from transformers.tokenization_roberta import VOCAB_FILES_NAMES, RobertaTokenizer
+from transformers.tokenization_roberta import VOCAB_FILES_NAMES, RobertaTokenizer, RobertaTokenizerFast

 from .test_tokenization_common import TokenizerTesterMixin
 from .utils import slow
@@ -68,7 +68,11 @@ class RobertaTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
         kwargs.update(self.special_tokens_map)
         return RobertaTokenizer.from_pretrained(self.tmpdirname, **kwargs)

-    def get_input_output_texts(self):
+    def get_rust_tokenizer(self, **kwargs):
+        kwargs.update(self.special_tokens_map)
+        return RobertaTokenizerFast.from_pretrained(self.tmpdirname, **kwargs)
+
+    def get_input_output_texts(self, tokenizer):
         input_text = "lower newer"
         output_text = "lower newer"
         return input_text, output_text
...
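The new `get_rust_tokenizer` hook above lets the shared tests exercise the Rust-backed fast tokenizer alongside the Python one. A rough sketch of the kind of parity check this enables; the real assertions live in `TokenizerTesterMixin`, and the checkpoint name here is illustrative.

```python
from transformers import RobertaTokenizer, RobertaTokenizerFast

slow = RobertaTokenizer.from_pretrained("roberta-base")
fast = RobertaTokenizerFast.from_pretrained("roberta-base")

text = "lower newer"
# Both backends should produce identical token ids for the same input.
assert slow.encode(text) == fast.encode(text)
```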
@@ -56,7 +56,7 @@ class TransfoXLTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
         kwargs["lower_case"] = True
         return TransfoXLTokenizer.from_pretrained(self.tmpdirname, **kwargs)

-    def get_input_output_texts(self):
+    def get_input_output_texts(self, tokenizer):
         input_text = "<unk> UNwanted , running"
         output_text = "<unk> unwanted, running"
         return input_text, output_text
...
@@ -65,7 +65,7 @@ class XLMTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
         with open(self.merges_file, "w") as fp:
             fp.write("\n".join(merges))

-    def get_input_output_texts(self):
+    def get_input_output_texts(self, tokenizer):
         input_text = "lower newer"
         output_text = "lower newer"
         return input_text, output_text
...
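The signature change repeated across these hunks, `get_input_output_texts(self, tokenizer)`, passes the tokenizer under test into the fixture hook so subclasses can tailor their sample strings per backend. A hypothetical sketch of how the shared mixin presumably consumes it; the real logic lives in `test_tokenization_common.py`.

```python
# Hypothetical consumer of the updated hook inside TokenizerTesterMixin;
# illustrative only, not the actual mixin code.
def test_encode_decode_roundtrip(self):
    tokenizer = self.get_tokenizer()
    # The hook now receives the tokenizer instance, so fixtures can
    # depend on the backend (slow vs. fast) being tested.
    input_text, output_text = self.get_input_output_texts(tokenizer)
    ids = tokenizer.encode(input_text, add_special_tokens=False)
    self.assertEqual(tokenizer.decode(ids), output_text)
```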