Unverified commit 11bbb505 authored by Lysandre Debut, committed by GitHub

Adds pretrained IDs directly in the tests (#29534)

* Adds pretrained IDs directly in the tests

* Fix tests

* Fix tests

* Review!
parent 38bff8c8
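
Every hunk below makes the same one-line change: the Hub checkpoint ID a tokenizer test should load is declared directly on the test class as a `from_pretrained_id` attribute. A minimal sketch of the resulting pattern (illustrative only, not part of this diff; it assumes `TokenizerTesterMixin` from `tests/test_tokenization_common.py` consumes the attribute roughly as described in the comment):

```python
import unittest

from transformers import PhobertTokenizer

from ...test_tokenization_common import TokenizerTesterMixin


class PhobertTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
    # Hub checkpoint the tester mixin is assumed to pull, e.g. via
    # self.tokenizer_class.from_pretrained(self.from_pretrained_id),
    # instead of reading it from a separate hard-coded pretrained-ID mapping.
    from_pretrained_id = "vinai/phobert-base"
    tokenizer_class = PhobertTokenizer
    test_rust_tokenizer = False
```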
@@ -22,6 +22,7 @@ from ...test_tokenization_common import TokenizerTesterMixin
 class PhobertTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
+    from_pretrained_id = "vinai/phobert-base"
     tokenizer_class = PhobertTokenizer
     test_rust_tokenizer = False

@@ -40,6 +40,7 @@ PYTHON_CODE = 50002
 @require_sentencepiece
 @require_tokenizers
 class PLBartTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
+    from_pretrained_id = "uclanlp/plbart-base"
     tokenizer_class = PLBartTokenizer
     rust_tokenizer_class = None
     test_rust_tokenizer = False

@@ -32,6 +32,7 @@ from ...test_tokenization_common import TokenizerTesterMixin
 class ProphetNetTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
+    from_pretrained_id = "microsoft/prophetnet-large-uncased"
     tokenizer_class = ProphetNetTokenizer
     test_rust_tokenizer = False

@@ -27,6 +27,7 @@ from ...test_tokenization_common import TokenizerTesterMixin
 @require_tokenizers
 class Qwen2TokenizationTest(TokenizerTesterMixin, unittest.TestCase):
+    from_pretrained_id = "qwen/qwen-tokenizer"
     tokenizer_class = Qwen2Tokenizer
     rust_tokenizer_class = Qwen2TokenizerFast
     test_slow_tokenizer = True

@@ -33,6 +33,7 @@ from ...test_tokenization_common import TokenizerTesterMixin, filter_non_english
 @require_tokenizers
 class RealmTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
+    from_pretrained_id = "google/realm-cc-news-pretrained-embedder"
     tokenizer_class = RealmTokenizer
     rust_tokenizer_class = RealmTokenizerFast
     test_rust_tokenizer = True

@@ -27,6 +27,7 @@ SAMPLE_VOCAB = get_tests_dir("fixtures/test_sentencepiece.model")
 @require_sentencepiece
 @require_tokenizers
 class ReformerTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
+    from_pretrained_id = "google/reformer-crime-and-punishment"
     tokenizer_class = ReformerTokenizer
     rust_tokenizer_class = ReformerTokenizerFast
     test_rust_tokenizer = True

@@ -32,6 +32,7 @@ SAMPLE_VOCAB = get_tests_dir("fixtures/test_sentencepiece.model")
 @require_sentencepiece
 @require_tokenizers
 class RemBertTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
+    from_pretrained_id = "google/rembert"
     tokenizer_class = RemBertTokenizer
     rust_tokenizer_class = RemBertTokenizerFast
     space_between_special_tokens = True

@@ -28,6 +28,7 @@ from ...test_tokenization_common import TokenizerTesterMixin
 @require_tokenizers
 class RobertaTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
+    from_pretrained_id = "FacebookAI/roberta-base"
     tokenizer_class = RobertaTokenizer
     rust_tokenizer_class = RobertaTokenizerFast
     test_rust_tokenizer = True

@@ -34,6 +34,7 @@ from ...test_tokenization_common import TokenizerTesterMixin, filter_non_english
 @require_tokenizers
 class BertTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
+    from_pretrained_id = "weiweishi/roc-bert-base-zh"
     tokenizer_class = RoCBertTokenizer
     rust_tokenizer_class = None
     test_rust_tokenizer = False

@@ -25,6 +25,7 @@ from ...test_tokenization_common import TokenizerTesterMixin
 @require_rjieba
 @require_tokenizers
 class RoFormerTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
+    from_pretrained_id = "junnyu/roformer_chinese_small"
     tokenizer_class = RoFormerTokenizer
     rust_tokenizer_class = RoFormerTokenizerFast
     space_between_special_tokens = True

@@ -53,6 +53,7 @@ SMALL_TRAINING_CORPUS = [
 @require_sentencepiece
 @require_tokenizers
 class SeamlessM4TTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
+    from_pretrained_id = "facebook/hf-seamless-m4t-medium"
     tokenizer_class = SeamlessM4TTokenizer
     rust_tokenizer_class = SeamlessM4TTokenizerFast
     test_rust_tokenizer = True

@@ -38,6 +38,7 @@ else:
 @require_sentencepiece
 @require_tokenizers
 class SiglipTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
+    from_pretrained_id = "google/siglip-base-patch16-224"
     tokenizer_class = SiglipTokenizer
     test_rust_tokenizer = False
     test_sentencepiece = True

@@ -37,6 +37,7 @@ ES_CODE = 10
 @require_sentencepiece
 @require_tokenizers
 class SpeechToTextTokenizerTest(TokenizerTesterMixin, unittest.TestCase):
+    from_pretrained_id = "facebook/s2t-small-librispeech-asr"
     tokenizer_class = Speech2TextTokenizer
     test_rust_tokenizer = False
     test_sentencepiece = True

@@ -25,6 +25,7 @@ from ...test_tokenization_common import TokenizerTesterMixin
 class SpeechToTextTokenizerTest(TokenizerTesterMixin, unittest.TestCase):
+    from_pretrained_id = "facebook/s2t-wav2vec2-large-en-de"
     tokenizer_class = Speech2Text2Tokenizer
     test_rust_tokenizer = False

@@ -30,6 +30,7 @@ SAMPLE_VOCAB = get_tests_dir("fixtures/test_sentencepiece_bpe_char.model")
 @require_sentencepiece
 @require_tokenizers
 class SpeechT5TokenizerTest(TokenizerTesterMixin, unittest.TestCase):
+    from_pretrained_id = "microsoft/speecht5_asr"
     tokenizer_class = SpeechT5Tokenizer
     test_rust_tokenizer = False
     test_sentencepiece = True

@@ -25,6 +25,7 @@ class SqueezeBertTokenizationTest(BertTokenizationTest):
     tokenizer_class = SqueezeBertTokenizer
     rust_tokenizer_class = SqueezeBertTokenizerFast
     test_rust_tokenizer = True
+    from_pretrained_id = "squeezebert/squeezebert-uncased"
     def get_rust_tokenizer(self, **kwargs):
         return SqueezeBertTokenizerFast.from_pretrained(self.tmpdirname, **kwargs)

@@ -38,6 +38,7 @@ else:
 @require_sentencepiece
 @require_tokenizers
 class T5TokenizationTest(TokenizerTesterMixin, unittest.TestCase):
+    from_pretrained_id = "google-t5/t5-small"
     tokenizer_class = T5Tokenizer
     rust_tokenizer_class = T5TokenizerFast
     test_rust_tokenizer = True

@@ -53,6 +53,7 @@ else:
 @require_tokenizers
 @require_pandas
 class TapasTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
+    from_pretrained_id = "google/tapas-large-finetuned-sqa"
     tokenizer_class = TapasTokenizer
     test_rust_tokenizer = False
     space_between_special_tokens = True

@@ -54,6 +54,7 @@ SAMPLE_VOCAB = get_tests_dir("fixtures/test_sentencepiece.model")
 @require_tokenizers
 @require_pandas
 class UdopTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
+    from_pretrained_id = "microsoft/udop-large"
     tokenizer_class = UdopTokenizer
     rust_tokenizer_class = UdopTokenizerFast
     test_rust_tokenizer = True

@@ -27,6 +27,7 @@ from ...test_tokenization_common import TokenizerTesterMixin
 class VitsTokenizerTest(TokenizerTesterMixin, unittest.TestCase):
+    from_pretrained_id = "facebook/mms-tts-eng"
     tokenizer_class = VitsTokenizer
     test_rust_tokenizer = False