Unverified Commit 11bbb505 authored by Lysandre Debut, committed by GitHub

Adds pretrained IDs directly in the tests (#29534)

* Adds pretrained IDs directly in the tests

* Fix tests

* Fix tests

* Review!
parent 38bff8c8
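In short: each tokenizer test class now pins the Hub checkpoint it exercises through a from_pretrained_id class attribute, and TokenizerTesterMixin.setUp builds self.tokenizers_list from that single ID instead of scanning the tokenizer's pretrained_vocab_files_map. A minimal sketch of the resulting pattern, mirroring the real test classes in the diff below (import paths as used in the transformers test suite):

import unittest

from transformers import XLMRobertaTokenizer, XLMRobertaTokenizerFast

from ...test_tokenization_common import TokenizerTesterMixin


class XLMRobertaTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
    # Checkpoint pinned explicitly; setUp() no longer derives candidate
    # names from rust_tokenizer_class.pretrained_vocab_files_map.
    from_pretrained_id = "FacebookAI/xlm-roberta-base"
    tokenizer_class = XLMRobertaTokenizer
    rust_tokenizer_class = XLMRobertaTokenizerFast
    test_rust_tokenizer = True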
@@ -367,6 +367,7 @@ class Wav2Vec2TokenizerTest(unittest.TestCase):
 class Wav2Vec2CTCTokenizerTest(TokenizerTesterMixin, unittest.TestCase):
+    from_pretrained_id = "facebook/wav2vec2-base-960h"
     tokenizer_class = Wav2Vec2CTCTokenizer
     test_rust_tokenizer = False
......
@@ -28,6 +28,7 @@ from ...test_tokenization_common import TokenizerTesterMixin
 @require_phonemizer
 class Wav2Vec2PhonemeCTCTokenizerTest(TokenizerTesterMixin, unittest.TestCase):
+    from_pretrained_id = "facebook/wav2vec2-lv-60-espeak-cv-ft"
     tokenizer_class = Wav2Vec2PhonemeCTCTokenizer
     test_rust_tokenizer = False
......
@@ -31,6 +31,7 @@ NOTIMESTAMPS = 50363
 class WhisperTokenizerTest(TokenizerTesterMixin, unittest.TestCase):
+    from_pretrained_id = "openai/whisper-tiny"
     tokenizer_class = WhisperTokenizer
     rust_tokenizer_class = WhisperTokenizerFast
     test_rust_tokenizer = True
......
@@ -31,6 +31,7 @@ SAMPLE_VOCAB = get_tests_dir("fixtures/test_sentencepiece.model")
 @require_sentencepiece
 @require_tokenizers
 class XGLMTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
+    from_pretrained_id = "facebook/xglm-564M"
     tokenizer_class = XGLMTokenizer
     rust_tokenizer_class = XGLMTokenizerFast
     test_rust_tokenizer = True
......
@@ -25,6 +25,7 @@ from ...test_tokenization_common import TokenizerTesterMixin
 class XLMTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
+    from_pretrained_id = "FacebookAI/xlm-mlm-en-2048"
     tokenizer_class = XLMTokenizer
     test_rust_tokenizer = False
......
@@ -27,6 +27,7 @@ SAMPLE_VOCAB = get_tests_dir("fixtures/test_sentencepiece.model")
 @require_sentencepiece
 class XLMProphetNetTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
+    from_pretrained_id = "microsoft/xprophetnet-large-wiki100-cased"
     tokenizer_class = XLMProphetNetTokenizer
     test_rust_tokenizer = False
     test_sentencepiece = True
......
@@ -31,6 +31,7 @@ SAMPLE_VOCAB = get_tests_dir("fixtures/test_sentencepiece.model")
 @require_sentencepiece
 @require_tokenizers
 class XLMRobertaTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
+    from_pretrained_id = "FacebookAI/xlm-roberta-base"
     tokenizer_class = XLMRobertaTokenizer
     rust_tokenizer_class = XLMRobertaTokenizerFast
     test_rust_tokenizer = True
......
@@ -27,6 +27,7 @@ SAMPLE_VOCAB = get_tests_dir("fixtures/test_sentencepiece.model")
 @require_sentencepiece
 @require_tokenizers
 class XLNetTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
+    from_pretrained_id = "xlnet/xlnet-base-cased"
     tokenizer_class = XLNetTokenizer
     rust_tokenizer_class = XLNetTokenizerFast
     test_rust_tokenizer = True
......
@@ -186,6 +186,7 @@ class TokenizerTesterMixin:
     space_between_special_tokens = False
     from_pretrained_kwargs = None
     from_pretrained_filter = None
+    from_pretrained_id = None
     from_pretrained_vocab_key = "vocab_file"
     test_seq2seq = True
@@ -200,19 +201,13 @@ class TokenizerTesterMixin:
         # Tokenizer.filter makes it possible to filter which Tokenizer to case based on all the
         # information available in Tokenizer (name, rust class, python class, vocab key name)
         if self.test_rust_tokenizer:
-            tokenizers_list = [
+            self.tokenizers_list = [
                 (
                     self.rust_tokenizer_class,
-                    pretrained_name,
+                    self.from_pretrained_id,
                     self.from_pretrained_kwargs if self.from_pretrained_kwargs is not None else {},
                 )
-                for pretrained_name in self.rust_tokenizer_class.pretrained_vocab_files_map[
-                    self.from_pretrained_vocab_key
-                ].keys()
-                if self.from_pretrained_filter is None
-                or (self.from_pretrained_filter is not None and self.from_pretrained_filter(pretrained_name))
             ]
-            self.tokenizers_list = tokenizers_list[:1]  # Let's just test the first pretrained vocab for speed
         else:
             self.tokenizers_list = []
         with open(f"{get_tests_dir()}/fixtures/sample_text.txt", encoding="utf-8") as f_data:
......
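For orientation, a hedged sketch of how a test built on the mixin can consume self.tokenizers_list after this change. The tuple layout (rust tokenizer class, checkpoint ID, kwargs) follows the setUp above; the class name, method name, and assertion are hypothetical, not copied from the suite:

import unittest

from transformers import WhisperTokenizer, WhisperTokenizerFast

from ...test_tokenization_common import TokenizerTesterMixin


class WhisperTokenizerSmokeTest(TokenizerTesterMixin, unittest.TestCase):
    # Hypothetical smoke test; attribute values taken from the Whisper hunk above.
    from_pretrained_id = "openai/whisper-tiny"
    tokenizer_class = WhisperTokenizer
    rust_tokenizer_class = WhisperTokenizerFast
    test_rust_tokenizer = True

    def test_pretrained_checkpoint_loads(self):
        # Each entry is (rust tokenizer class, from_pretrained_id,
        # from_pretrained_kwargs), as assembled in setUp() above.
        for tokenizer_class, pretrained_id, kwargs in self.tokenizers_list:
            with self.subTest(f"{tokenizer_class.__name__} ({pretrained_id})"):
                tokenizer = tokenizer_class.from_pretrained(pretrained_id, **kwargs)
                self.assertGreater(tokenizer.vocab_size, 0)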