Unverified Commit 11bbb505 authored by Lysandre Debut, committed by GitHub

Adds pretrained IDs directly in the tests (#29534)

* Adds pretrained IDs directly in the tests

* Fix tests

* Fix tests

* Review!
parent 38bff8c8
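
For context, the pattern these diffs introduce can be sketched as follows: each tokenizer test class names the Hub checkpoint it exercises via a from_pretrained_id class attribute, and the shared mixin reads that attribute. The mixin body below is a hypothetical illustration only, not the actual TokenizerTesterMixin from tests/test_tokenization_common.py, which does considerably more.

# Hypothetical sketch of how a per-class checkpoint ID can be consumed.
# Only the attribute-lookup pattern is illustrated here.
import unittest

from transformers import BertTokenizer


class TokenizerTesterMixinSketch:
    # Overridden by each concrete test class, as in the diffs below.
    from_pretrained_id = None
    tokenizer_class = None

    def setUp(self):
        super().setUp()
        # Load the tokenizer from the checkpoint this test class names,
        # rather than from a shared global mapping of model IDs.
        self.tokenizer = self.tokenizer_class.from_pretrained(self.from_pretrained_id)


class BertTokenizationSketchTest(TokenizerTesterMixinSketch, unittest.TestCase):
    from_pretrained_id = "google-bert/bert-base-uncased"
    tokenizer_class = BertTokenizer

    def test_has_vocab(self):
        self.assertGreater(self.tokenizer.vocab_size, 0)

Running the sketch requires network access to the Hugging Face Hub to download the named checkpoint.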
@@ -27,6 +27,7 @@ SAMPLE_VOCAB = get_tests_dir("fixtures/spiece.model")
 @require_sentencepiece
 @require_tokenizers
 class AlbertTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
+    from_pretrained_id = "albert/albert-base-v1"
     tokenizer_class = AlbertTokenizer
     rust_tokenizer_class = AlbertTokenizerFast
     test_rust_tokenizer = True

@@ -25,6 +25,7 @@ from ...test_tokenization_common import TokenizerTesterMixin, filter_roberta_det
 @require_tokenizers
 class TestTokenizationBart(TokenizerTesterMixin, unittest.TestCase):
+    from_pretrained_id = "facebook/bart-base"
     tokenizer_class = BartTokenizer
     rust_tokenizer_class = BartTokenizerFast
     test_rust_tokenizer = True

@@ -25,6 +25,7 @@ from ...test_tokenization_common import TokenizerTesterMixin
 @require_sentencepiece
 @slow  # see https://github.com/huggingface/transformers/issues/11457
 class BarthezTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
+    from_pretrained_id = "moussaKam/mbarthez"
     tokenizer_class = BarthezTokenizer
     rust_tokenizer_class = BarthezTokenizerFast
     test_rust_tokenizer = True

@@ -26,6 +26,7 @@ SAMPLE_VOCAB = get_tests_dir("fixtures/test_sentencepiece_bpe.model")
 class BartphoTokenizerTest(TokenizerTesterMixin, unittest.TestCase):
+    from_pretrained_id = "vinai/bartpho-syllable"
     tokenizer_class = BartphoTokenizer
     test_rust_tokenizer = False
     test_sentencepiece = True

@@ -34,6 +34,7 @@ from ...test_tokenization_common import TokenizerTesterMixin, filter_non_english
 @require_tokenizers
 class BertTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
+    from_pretrained_id = "google-bert/bert-base-uncased"
     tokenizer_class = BertTokenizer
     rust_tokenizer_class = BertTokenizerFast
     test_rust_tokenizer = True

@@ -29,6 +29,7 @@ SAMPLE_VOCAB = get_tests_dir("fixtures/test_sentencepiece.model")
 @require_sentencepiece
 class BertGenerationTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
+    from_pretrained_id = "google/bert_for_seq_generation_L-24_bbc_encoder"
     tokenizer_class = BertGenerationTokenizer
     test_rust_tokenizer = False
     test_sentencepiece = True

@@ -36,6 +36,7 @@ from ...test_tokenization_common import TokenizerTesterMixin
 @custom_tokenizers
 class BertJapaneseTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
+    from_pretrained_id = "cl-tohoku/bert-base-japanese"
     tokenizer_class = BertJapaneseTokenizer
     test_rust_tokenizer = False
     space_between_special_tokens = True
@@ -403,6 +404,7 @@ class BertJapaneseTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
 @custom_tokenizers
 class BertJapaneseCharacterTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
+    from_pretrained_id = "cl-tohoku/bert-base-japanese"
     tokenizer_class = BertJapaneseTokenizer
     test_rust_tokenizer = False

@@ -22,6 +22,7 @@ from ...test_tokenization_common import TokenizerTesterMixin
 class BertweetTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
+    from_pretrained_id = "vinai/bertweet-base"
     tokenizer_class = BertweetTokenizer
     test_rust_tokenizer = False

@@ -30,6 +30,7 @@ SAMPLE_VOCAB = get_tests_dir("fixtures/test_sentencepiece.model")
 @require_sentencepiece
 @require_tokenizers
 class BigBirdTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
+    from_pretrained_id = "google/bigbird-roberta-base"
     tokenizer_class = BigBirdTokenizer
     rust_tokenizer_class = BigBirdTokenizerFast
     test_rust_tokenizer = True

@@ -26,6 +26,7 @@ from ...test_tokenization_common import TokenizerTesterMixin
 @require_sacremoses
 class BioGptTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
+    from_pretrained_id = "microsoft/biogpt"
     tokenizer_class = BioGptTokenizer
     test_rust_tokenizer = False

@@ -27,6 +27,7 @@ from ...test_tokenization_common import TokenizerTesterMixin
 class BlenderbotSmallTokenizerTest(TokenizerTesterMixin, unittest.TestCase):
+    from_pretrained_id = "facebook/blenderbot_small-90M"
     tokenizer_class = BlenderbotSmallTokenizer
     test_rust_tokenizer = False

@@ -25,6 +25,7 @@ from ...test_tokenization_common import TokenizerTesterMixin
 @require_tokenizers
 class BloomTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
+    from_pretrained_id = "bigscience/tokenizer"
     slow_tokenizer_class = None
     rust_tokenizer_class = BloomTokenizerFast
     tokenizer_class = BloomTokenizerFast

@@ -32,6 +32,7 @@ FRAMEWORK = "pt" if is_torch_available() else "tf"
 @require_sentencepiece
 @require_tokenizers
 class CamembertTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
+    from_pretrained_id = "almanach/camembert-base"
     tokenizer_class = CamembertTokenizer
     rust_tokenizer_class = CamembertTokenizerFast
     test_rust_tokenizer = True

@@ -28,6 +28,7 @@ from ...test_tokenization_common import TokenizerTesterMixin
 class CanineTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
+    from_pretrained_id = "nielsr/canine-s"
     tokenizer_class = CanineTokenizer
     test_rust_tokenizer = False

@@ -27,6 +27,7 @@ from ...test_tokenization_common import TokenizerTesterMixin
 @require_tokenizers
 class CLIPTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
+    from_pretrained_id = "openai/clip-vit-base-patch32"
     tokenizer_class = CLIPTokenizer
     rust_tokenizer_class = CLIPTokenizerFast
     test_rust_tokenizer = True

@@ -25,6 +25,7 @@ from ...test_tokenization_common import TokenizerTesterMixin, slow
 class ClvpTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
+    from_pretrained_id = "susnato/clvp_dev"
     tokenizer_class = ClvpTokenizer
     test_rust_tokenizer = False
     from_pretrained_kwargs = {"add_prefix_space": True}

@@ -51,6 +51,7 @@ if is_torch_available():
 @require_sentencepiece
 @require_tokenizers
 class CodeLlamaTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
+    from_pretrained_id = "hf-internal-testing/llama-code-tokenizer"
     tokenizer_class = CodeLlamaTokenizer
     rust_tokenizer_class = CodeLlamaTokenizerFast
     test_rust_tokenizer = False

@@ -28,6 +28,7 @@ from ...test_tokenization_common import TokenizerTesterMixin
 @require_tokenizers
 class CodeGenTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
+    from_pretrained_id = "Salesforce/codegen-350M-mono"
     tokenizer_class = CodeGenTokenizer
     rust_tokenizer_class = CodeGenTokenizerFast
     test_rust_tokenizer = True

@@ -24,6 +24,7 @@ from ...test_tokenization_common import TokenizerTesterMixin
 @require_jieba
 class CPMAntTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
+    from_pretrained_id = "openbmb/cpm-ant-10b"
     tokenizer_class = CpmAntTokenizer
     test_rust_tokenizer = False

@@ -23,6 +23,7 @@ from ...test_tokenization_common import TokenizerTesterMixin
 class CTRLTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
+    from_pretrained_id = "Salesforce/ctrl"
     tokenizer_class = CTRLTokenizer
     test_rust_tokenizer = False
     test_seq2seq = False