"docs/source/ko/tasks/language_modeling.md" did not exist on "c2c99dc7ef5edab8f7674a1eb00cf6ac6996fd0f"
Unverified Commit 11bbb505 authored by Lysandre Debut, committed by GitHub

Adds pretrained IDs directly in the tests (#29534)

* Adds pretrained IDs directly in the tests

* Fix tests

* Fix tests

* Review!
parent 38bff8c8
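
The attribute added throughout these hunks, `from_pretrained_id`, is the Hub checkpoint that the common tokenizer tests load via `from_pretrained`. Once `TokenizerTesterMixin.setUp` (last hunk below) is updated to read it, each test class exercises exactly the checkpoint it declares instead of the first entry discovered in `pretrained_vocab_files_map`. A minimal sketch of the resulting pattern, using the Whisper values from the hunk below; the trailing comment about `tokenizers_list` describes setUp's output and is illustrative, not part of the commit:

```python
import unittest

from transformers import WhisperTokenizer, WhisperTokenizerFast

# TokenizerTesterMixin lives in test_tokenization_common and is imported
# relatively in the real test files, as the hunks below show.


class WhisperTokenizerTest(TokenizerTesterMixin, unittest.TestCase):
    # Pinned Hub checkpoint used by all shared tokenizer tests for this model.
    from_pretrained_id = "openai/whisper-tiny"
    tokenizer_class = WhisperTokenizer
    rust_tokenizer_class = WhisperTokenizerFast
    test_rust_tokenizer = True


# After TokenizerTesterMixin.setUp() runs (see the last hunk), the shared tests
# see a single pinned checkpoint:
#     self.tokenizers_list == [(WhisperTokenizerFast, "openai/whisper-tiny", {})]
```
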
@@ -367,6 +367,7 @@ class Wav2Vec2TokenizerTest(unittest.TestCase):
 class Wav2Vec2CTCTokenizerTest(TokenizerTesterMixin, unittest.TestCase):
+    from_pretrained_id = "facebook/wav2vec2-base-960h"
     tokenizer_class = Wav2Vec2CTCTokenizer
     test_rust_tokenizer = False
...
@@ -28,6 +28,7 @@ from ...test_tokenization_common import TokenizerTesterMixin
 @require_phonemizer
 class Wav2Vec2PhonemeCTCTokenizerTest(TokenizerTesterMixin, unittest.TestCase):
+    from_pretrained_id = "facebook/wav2vec2-lv-60-espeak-cv-ft"
     tokenizer_class = Wav2Vec2PhonemeCTCTokenizer
     test_rust_tokenizer = False
...
@@ -31,6 +31,7 @@ NOTIMESTAMPS = 50363
 class WhisperTokenizerTest(TokenizerTesterMixin, unittest.TestCase):
+    from_pretrained_id = "openai/whisper-tiny"
     tokenizer_class = WhisperTokenizer
     rust_tokenizer_class = WhisperTokenizerFast
     test_rust_tokenizer = True
...
@@ -31,6 +31,7 @@ SAMPLE_VOCAB = get_tests_dir("fixtures/test_sentencepiece.model")
 @require_sentencepiece
 @require_tokenizers
 class XGLMTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
+    from_pretrained_id = "facebook/xglm-564M"
     tokenizer_class = XGLMTokenizer
     rust_tokenizer_class = XGLMTokenizerFast
     test_rust_tokenizer = True
...
@@ -25,6 +25,7 @@ from ...test_tokenization_common import TokenizerTesterMixin
 class XLMTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
+    from_pretrained_id = "FacebookAI/xlm-mlm-en-2048"
     tokenizer_class = XLMTokenizer
     test_rust_tokenizer = False
...
@@ -27,6 +27,7 @@ SAMPLE_VOCAB = get_tests_dir("fixtures/test_sentencepiece.model")
 @require_sentencepiece
 class XLMProphetNetTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
+    from_pretrained_id = "microsoft/xprophetnet-large-wiki100-cased"
     tokenizer_class = XLMProphetNetTokenizer
     test_rust_tokenizer = False
     test_sentencepiece = True
...
@@ -31,6 +31,7 @@ SAMPLE_VOCAB = get_tests_dir("fixtures/test_sentencepiece.model")
 @require_sentencepiece
 @require_tokenizers
 class XLMRobertaTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
+    from_pretrained_id = "FacebookAI/xlm-roberta-base"
     tokenizer_class = XLMRobertaTokenizer
     rust_tokenizer_class = XLMRobertaTokenizerFast
     test_rust_tokenizer = True
...
@@ -27,6 +27,7 @@ SAMPLE_VOCAB = get_tests_dir("fixtures/test_sentencepiece.model")
 @require_sentencepiece
 @require_tokenizers
 class XLNetTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
+    from_pretrained_id = "xlnet/xlnet-base-cased"
     tokenizer_class = XLNetTokenizer
     rust_tokenizer_class = XLNetTokenizerFast
     test_rust_tokenizer = True
...
@@ -186,6 +186,7 @@ class TokenizerTesterMixin:
     space_between_special_tokens = False
     from_pretrained_kwargs = None
     from_pretrained_filter = None
+    from_pretrained_id = None
     from_pretrained_vocab_key = "vocab_file"
     test_seq2seq = True
@@ -200,19 +201,13 @@ class TokenizerTesterMixin:
         # Tokenizer.filter makes it possible to filter which Tokenizer to case based on all the
         # information available in Tokenizer (name, rust class, python class, vocab key name)
         if self.test_rust_tokenizer:
-            tokenizers_list = [
+            self.tokenizers_list = [
                 (
                     self.rust_tokenizer_class,
-                    pretrained_name,
+                    self.from_pretrained_id,
                     self.from_pretrained_kwargs if self.from_pretrained_kwargs is not None else {},
                 )
-                for pretrained_name in self.rust_tokenizer_class.pretrained_vocab_files_map[
-                    self.from_pretrained_vocab_key
-                ].keys()
-                if self.from_pretrained_filter is None
-                or (self.from_pretrained_filter is not None and self.from_pretrained_filter(pretrained_name))
             ]
-            self.tokenizers_list = tokenizers_list[:1]  # Let's just test the first pretrained vocab for speed
         else:
             self.tokenizers_list = []
         with open(f"{get_tests_dir()}/fixtures/sample_text.txt", encoding="utf-8") as f_data:
...
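
With the list-comprehension filter and the `[:1]` truncation gone, `self.tokenizers_list` now contains exactly one entry: the fast tokenizer class paired with the declared `from_pretrained_id`. A hedged sketch of how the shared tests typically iterate over it; the test name and assertion here are illustrative, not quoted from the file:

```python
# Illustrative consumption of self.tokenizers_list inside a TokenizerTesterMixin
# subclass (hypothetical test; the real shared tests use the same loop pattern).
def test_loads_pinned_checkpoint(self):
    for tokenizer_class, pretrained_id, kwargs in self.tokenizers_list:
        with self.subTest(f"{tokenizer_class.__name__} ({pretrained_id})"):
            # Load the single pinned checkpoint declared by the test class.
            tokenizer = tokenizer_class.from_pretrained(pretrained_id, **kwargs)
            self.assertGreater(tokenizer.vocab_size, 0)
```
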