Unverified Commit 11bbb505 authored by Lysandre Debut, committed by GitHub

Adds pretrained IDs directly in the tests (#29534)

* Adds pretrained IDs directly in the tests

* Fix tests

* Fix tests

* Review!
parent 38bff8c8
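
Every hunk below follows the same pattern: each tokenizer test class gains a `from_pretrained_id` class attribute naming a Hub checkpoint, presumably so the shared `TokenizerTesterMixin` can resolve the checkpoint from the class itself rather than from an external mapping. As a minimal sketch of that idea (not the actual `TokenizerTesterMixin` implementation; the helper class and test names here are illustrative assumptions):

import unittest


class TokenizerTesterMixinSketch:
    # Subclasses override these, just as the diffs in this commit do.
    from_pretrained_id = None
    tokenizer_class = None

    def test_pretrained_checkpoint_loads(self):
        # Hypothetical test: skip unless the subclass configured both attributes.
        if self.from_pretrained_id is None or self.tokenizer_class is None:
            self.skipTest("no pretrained ID or tokenizer class configured")
        # Load the tokenizer straight from the Hub checkpoint named on the class.
        tokenizer = self.tokenizer_class.from_pretrained(self.from_pretrained_id)
        self.assertIsNotNone(tokenizer)


class PhobertTokenizationTestSketch(TokenizerTesterMixinSketch, unittest.TestCase):
    from_pretrained_id = "vinai/phobert-base"  # checkpoint ID from the first hunk below
    # tokenizer_class = PhobertTokenizer      # set from transformers in the real test file

Keeping the checkpoint ID on the test class means each test file is self-describing about which pretrained artifact it exercises.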
@@ -22,6 +22,7 @@ from ...test_tokenization_common import TokenizerTesterMixin
 class PhobertTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
+    from_pretrained_id = "vinai/phobert-base"
     tokenizer_class = PhobertTokenizer
     test_rust_tokenizer = False
...
@@ -40,6 +40,7 @@ PYTHON_CODE = 50002
 @require_sentencepiece
 @require_tokenizers
 class PLBartTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
+    from_pretrained_id = "uclanlp/plbart-base"
     tokenizer_class = PLBartTokenizer
     rust_tokenizer_class = None
     test_rust_tokenizer = False
...
@@ -32,6 +32,7 @@ from ...test_tokenization_common import TokenizerTesterMixin
 class ProphetNetTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
+    from_pretrained_id = "microsoft/prophetnet-large-uncased"
     tokenizer_class = ProphetNetTokenizer
     test_rust_tokenizer = False
...
@@ -27,6 +27,7 @@ from ...test_tokenization_common import TokenizerTesterMixin
 @require_tokenizers
 class Qwen2TokenizationTest(TokenizerTesterMixin, unittest.TestCase):
+    from_pretrained_id = "qwen/qwen-tokenizer"
     tokenizer_class = Qwen2Tokenizer
     rust_tokenizer_class = Qwen2TokenizerFast
     test_slow_tokenizer = True
...
@@ -33,6 +33,7 @@ from ...test_tokenization_common import TokenizerTesterMixin, filter_non_english
 @require_tokenizers
 class RealmTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
+    from_pretrained_id = "google/realm-cc-news-pretrained-embedder"
     tokenizer_class = RealmTokenizer
     rust_tokenizer_class = RealmTokenizerFast
     test_rust_tokenizer = True
...
@@ -27,6 +27,7 @@ SAMPLE_VOCAB = get_tests_dir("fixtures/test_sentencepiece.model")
 @require_sentencepiece
 @require_tokenizers
 class ReformerTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
+    from_pretrained_id = "google/reformer-crime-and-punishment"
     tokenizer_class = ReformerTokenizer
     rust_tokenizer_class = ReformerTokenizerFast
     test_rust_tokenizer = True
...
@@ -32,6 +32,7 @@ SAMPLE_VOCAB = get_tests_dir("fixtures/test_sentencepiece.model")
 @require_sentencepiece
 @require_tokenizers
 class RemBertTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
+    from_pretrained_id = "google/rembert"
     tokenizer_class = RemBertTokenizer
     rust_tokenizer_class = RemBertTokenizerFast
     space_between_special_tokens = True
...
@@ -28,6 +28,7 @@ from ...test_tokenization_common import TokenizerTesterMixin
 @require_tokenizers
 class RobertaTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
+    from_pretrained_id = "FacebookAI/roberta-base"
     tokenizer_class = RobertaTokenizer
     rust_tokenizer_class = RobertaTokenizerFast
     test_rust_tokenizer = True
...
@@ -34,6 +34,7 @@ from ...test_tokenization_common import TokenizerTesterMixin, filter_non_english
 @require_tokenizers
 class BertTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
+    from_pretrained_id = "weiweishi/roc-bert-base-zh"
     tokenizer_class = RoCBertTokenizer
     rust_tokenizer_class = None
     test_rust_tokenizer = False
...
@@ -25,6 +25,7 @@ from ...test_tokenization_common import TokenizerTesterMixin
 @require_rjieba
 @require_tokenizers
 class RoFormerTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
+    from_pretrained_id = "junnyu/roformer_chinese_small"
     tokenizer_class = RoFormerTokenizer
     rust_tokenizer_class = RoFormerTokenizerFast
     space_between_special_tokens = True
...
@@ -53,6 +53,7 @@ SMALL_TRAINING_CORPUS = [
 @require_sentencepiece
 @require_tokenizers
 class SeamlessM4TTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
+    from_pretrained_id = "facebook/hf-seamless-m4t-medium"
     tokenizer_class = SeamlessM4TTokenizer
     rust_tokenizer_class = SeamlessM4TTokenizerFast
     test_rust_tokenizer = True
...
@@ -38,6 +38,7 @@ else:
 @require_sentencepiece
 @require_tokenizers
 class SiglipTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
+    from_pretrained_id = "google/siglip-base-patch16-224"
     tokenizer_class = SiglipTokenizer
     test_rust_tokenizer = False
     test_sentencepiece = True
...
@@ -37,6 +37,7 @@ ES_CODE = 10
 @require_sentencepiece
 @require_tokenizers
 class SpeechToTextTokenizerTest(TokenizerTesterMixin, unittest.TestCase):
+    from_pretrained_id = "facebook/s2t-small-librispeech-asr"
     tokenizer_class = Speech2TextTokenizer
     test_rust_tokenizer = False
     test_sentencepiece = True
...
@@ -25,6 +25,7 @@ from ...test_tokenization_common import TokenizerTesterMixin
 class SpeechToTextTokenizerTest(TokenizerTesterMixin, unittest.TestCase):
+    from_pretrained_id = "facebook/s2t-wav2vec2-large-en-de"
     tokenizer_class = Speech2Text2Tokenizer
     test_rust_tokenizer = False
...
@@ -30,6 +30,7 @@ SAMPLE_VOCAB = get_tests_dir("fixtures/test_sentencepiece_bpe_char.model")
 @require_sentencepiece
 @require_tokenizers
 class SpeechT5TokenizerTest(TokenizerTesterMixin, unittest.TestCase):
+    from_pretrained_id = "microsoft/speecht5_asr"
     tokenizer_class = SpeechT5Tokenizer
     test_rust_tokenizer = False
     test_sentencepiece = True
...
@@ -25,6 +25,7 @@ class SqueezeBertTokenizationTest(BertTokenizationTest):
     tokenizer_class = SqueezeBertTokenizer
     rust_tokenizer_class = SqueezeBertTokenizerFast
     test_rust_tokenizer = True
+    from_pretrained_id = "squeezebert/squeezebert-uncased"

     def get_rust_tokenizer(self, **kwargs):
         return SqueezeBertTokenizerFast.from_pretrained(self.tmpdirname, **kwargs)
...
@@ -38,6 +38,7 @@ else:
 @require_sentencepiece
 @require_tokenizers
 class T5TokenizationTest(TokenizerTesterMixin, unittest.TestCase):
+    from_pretrained_id = "google-t5/t5-small"
     tokenizer_class = T5Tokenizer
     rust_tokenizer_class = T5TokenizerFast
     test_rust_tokenizer = True
...
@@ -53,6 +53,7 @@ else:
 @require_tokenizers
 @require_pandas
 class TapasTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
+    from_pretrained_id = "google/tapas-large-finetuned-sqa"
     tokenizer_class = TapasTokenizer
     test_rust_tokenizer = False
     space_between_special_tokens = True
...
@@ -54,6 +54,7 @@ SAMPLE_VOCAB = get_tests_dir("fixtures/test_sentencepiece.model")
 @require_tokenizers
 @require_pandas
 class UdopTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
+    from_pretrained_id = "microsoft/udop-large"
     tokenizer_class = UdopTokenizer
     rust_tokenizer_class = UdopTokenizerFast
     test_rust_tokenizer = True
...
@@ -27,6 +27,7 @@ from ...test_tokenization_common import TokenizerTesterMixin
 class VitsTokenizerTest(TokenizerTesterMixin, unittest.TestCase):
+    from_pretrained_id = "facebook/mms-tts-eng"
     tokenizer_class = VitsTokenizer
     test_rust_tokenizer = False
...