"git@developer.sourcefind.cn:chenpangpang/transformers.git" did not exist on "296252c49e640f252f87de037db8a62e1b9d23e1"
Unverified Commit 1bfa3477 authored by Patrick von Platen's avatar Patrick von Platen Committed by GitHub
Browse files

[Tests] Speed up tokenizer tests (#14964)

* speed up canine and mluke

* speed up mbart and mbart50 toks

* upload files
parent f80775df
...@@ -39,11 +39,12 @@ class CanineTokenizationTest(TokenizerTesterMixin, unittest.TestCase): ...@@ -39,11 +39,12 @@ class CanineTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
@cached_property @cached_property
def canine_tokenizer(self): def canine_tokenizer(self):
# TODO replace nielsr by google return CanineTokenizer.from_pretrained("google/canine-s")
return CanineTokenizer.from_pretrained("nielsr/canine-s")
def get_tokenizer(self, **kwargs) -> CanineTokenizer: def get_tokenizer(self, **kwargs) -> CanineTokenizer:
return self.tokenizer_class.from_pretrained(self.tmpdirname, **kwargs) tokenizer = self.tokenizer_class.from_pretrained(self.tmpdirname, **kwargs)
tokenizer._unicode_vocab_size = 1024
return tokenizer
@require_torch @require_torch
def test_prepare_batch_integration(self): def test_prepare_batch_integration(self):
......
...@@ -1642,6 +1642,78 @@ class LayoutXLMTokenizationTest(TokenizerTesterMixin, unittest.TestCase): ...@@ -1642,6 +1642,78 @@ class LayoutXLMTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
self.assertEqual(len(tokens[key].shape), 3) self.assertEqual(len(tokens[key].shape), 3)
self.assertEqual(tokens[key].shape[-1], 4) self.assertEqual(tokens[key].shape[-1], 4)
# overwrite from test_tokenization_common to speed up test
def test_save_pretrained(self):
if not self.test_slow_tokenizer:
# as we don't have a slow version, we can't compare the outputs between slow and fast versions
return
self.tokenizers_list[0] = (self.rust_tokenizer_class, "hf-internal-testing/tiny-random-layoutxlm", {})
for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs)
tokenizer_p = self.tokenizer_class.from_pretrained(pretrained_name, **kwargs)
tmpdirname2 = tempfile.mkdtemp()
tokenizer_r_files = tokenizer_r.save_pretrained(tmpdirname2)
tokenizer_p_files = tokenizer_p.save_pretrained(tmpdirname2)
# Checks it save with the same files + the tokenizer.json file for the fast one
self.assertTrue(any("tokenizer.json" in f for f in tokenizer_r_files))
tokenizer_r_files = tuple(f for f in tokenizer_r_files if "tokenizer.json" not in f)
self.assertSequenceEqual(tokenizer_r_files, tokenizer_p_files)
# Checks everything loads correctly in the same way
tokenizer_rp = tokenizer_r.from_pretrained(tmpdirname2)
tokenizer_pp = tokenizer_p.from_pretrained(tmpdirname2)
# Check special tokens are set accordingly on Rust and Python
for key in tokenizer_pp.special_tokens_map:
self.assertTrue(hasattr(tokenizer_rp, key))
# self.assertEqual(getattr(tokenizer_rp, key), getattr(tokenizer_pp, key))
# self.assertEqual(getattr(tokenizer_rp, key + "_id"), getattr(tokenizer_pp, key + "_id"))
shutil.rmtree(tmpdirname2)
# Save tokenizer rust, legacy_format=True
tmpdirname2 = tempfile.mkdtemp()
tokenizer_r_files = tokenizer_r.save_pretrained(tmpdirname2, legacy_format=True)
tokenizer_p_files = tokenizer_p.save_pretrained(tmpdirname2)
# Checks it save with the same files
self.assertSequenceEqual(tokenizer_r_files, tokenizer_p_files)
# Checks everything loads correctly in the same way
tokenizer_rp = tokenizer_r.from_pretrained(tmpdirname2)
tokenizer_pp = tokenizer_p.from_pretrained(tmpdirname2)
# Check special tokens are set accordingly on Rust and Python
for key in tokenizer_pp.special_tokens_map:
self.assertTrue(hasattr(tokenizer_rp, key))
shutil.rmtree(tmpdirname2)
# Save tokenizer rust, legacy_format=False
tmpdirname2 = tempfile.mkdtemp()
tokenizer_r_files = tokenizer_r.save_pretrained(tmpdirname2, legacy_format=False)
tokenizer_p_files = tokenizer_p.save_pretrained(tmpdirname2)
# Checks it saved the tokenizer.json file
self.assertTrue(any("tokenizer.json" in f for f in tokenizer_r_files))
# Checks everything loads correctly in the same way
tokenizer_rp = tokenizer_r.from_pretrained(tmpdirname2)
tokenizer_pp = tokenizer_p.from_pretrained(tmpdirname2)
# Check special tokens are set accordingly on Rust and Python
for key in tokenizer_pp.special_tokens_map:
self.assertTrue(hasattr(tokenizer_rp, key))
shutil.rmtree(tmpdirname2)
@unittest.skip("TO DO: overwrite this very extensive test.") @unittest.skip("TO DO: overwrite this very extensive test.")
def test_alignement_methods(self): def test_alignement_methods(self):
pass pass
......
...@@ -13,6 +13,7 @@ ...@@ -13,6 +13,7 @@
# limitations under the License. # limitations under the License.
import os import os
import shutil
import tempfile import tempfile
import unittest import unittest
...@@ -122,6 +123,78 @@ class MBartTokenizationTest(TokenizerTesterMixin, unittest.TestCase): ...@@ -122,6 +123,78 @@ class MBartTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
], ],
) )
# overwrite from test_tokenization_common to speed up test
def test_save_pretrained(self):
if not self.test_slow_tokenizer:
# as we don't have a slow version, we can't compare the outputs between slow and fast versions
return
self.tokenizers_list[0] = (self.rust_tokenizer_class, "hf-internal-testing/tiny-random-mbart", {})
for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs)
tokenizer_p = self.tokenizer_class.from_pretrained(pretrained_name, **kwargs)
tmpdirname2 = tempfile.mkdtemp()
tokenizer_r_files = tokenizer_r.save_pretrained(tmpdirname2)
tokenizer_p_files = tokenizer_p.save_pretrained(tmpdirname2)
# Checks it save with the same files + the tokenizer.json file for the fast one
self.assertTrue(any("tokenizer.json" in f for f in tokenizer_r_files))
tokenizer_r_files = tuple(f for f in tokenizer_r_files if "tokenizer.json" not in f)
self.assertSequenceEqual(tokenizer_r_files, tokenizer_p_files)
# Checks everything loads correctly in the same way
tokenizer_rp = tokenizer_r.from_pretrained(tmpdirname2)
tokenizer_pp = tokenizer_p.from_pretrained(tmpdirname2)
# Check special tokens are set accordingly on Rust and Python
for key in tokenizer_pp.special_tokens_map:
self.assertTrue(hasattr(tokenizer_rp, key))
# self.assertEqual(getattr(tokenizer_rp, key), getattr(tokenizer_pp, key))
# self.assertEqual(getattr(tokenizer_rp, key + "_id"), getattr(tokenizer_pp, key + "_id"))
shutil.rmtree(tmpdirname2)
# Save tokenizer rust, legacy_format=True
tmpdirname2 = tempfile.mkdtemp()
tokenizer_r_files = tokenizer_r.save_pretrained(tmpdirname2, legacy_format=True)
tokenizer_p_files = tokenizer_p.save_pretrained(tmpdirname2)
# Checks it save with the same files
self.assertSequenceEqual(tokenizer_r_files, tokenizer_p_files)
# Checks everything loads correctly in the same way
tokenizer_rp = tokenizer_r.from_pretrained(tmpdirname2)
tokenizer_pp = tokenizer_p.from_pretrained(tmpdirname2)
# Check special tokens are set accordingly on Rust and Python
for key in tokenizer_pp.special_tokens_map:
self.assertTrue(hasattr(tokenizer_rp, key))
shutil.rmtree(tmpdirname2)
# Save tokenizer rust, legacy_format=False
tmpdirname2 = tempfile.mkdtemp()
tokenizer_r_files = tokenizer_r.save_pretrained(tmpdirname2, legacy_format=False)
tokenizer_p_files = tokenizer_p.save_pretrained(tmpdirname2)
# Checks it saved the tokenizer.json file
self.assertTrue(any("tokenizer.json" in f for f in tokenizer_r_files))
# Checks everything loads correctly in the same way
tokenizer_rp = tokenizer_r.from_pretrained(tmpdirname2)
tokenizer_pp = tokenizer_p.from_pretrained(tmpdirname2)
# Check special tokens are set accordingly on Rust and Python
for key in tokenizer_pp.special_tokens_map:
self.assertTrue(hasattr(tokenizer_rp, key))
shutil.rmtree(tmpdirname2)
@require_torch @require_torch
@require_sentencepiece @require_sentencepiece
......
...@@ -13,6 +13,7 @@ ...@@ -13,6 +13,7 @@
# limitations under the License. # limitations under the License.
import os import os
import shutil
import tempfile import tempfile
import unittest import unittest
...@@ -34,7 +35,7 @@ RO_CODE = 250020 ...@@ -34,7 +35,7 @@ RO_CODE = 250020
@require_sentencepiece @require_sentencepiece
@require_tokenizers @require_tokenizers
class MBartTokenizationTest(TokenizerTesterMixin, unittest.TestCase): class MBart50TokenizationTest(TokenizerTesterMixin, unittest.TestCase):
tokenizer_class = MBart50Tokenizer tokenizer_class = MBart50Tokenizer
rust_tokenizer_class = MBart50TokenizerFast rust_tokenizer_class = MBart50TokenizerFast
test_rust_tokenizer = True test_rust_tokenizer = True
...@@ -113,11 +114,83 @@ class MBartTokenizationTest(TokenizerTesterMixin, unittest.TestCase): ...@@ -113,11 +114,83 @@ class MBartTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
revision="d3913889c59cd5c9e456b269c376325eabad57e2", revision="d3913889c59cd5c9e456b269c376325eabad57e2",
) )
# overwrite from test_tokenization_common to speed up test
def test_save_pretrained(self):
if not self.test_slow_tokenizer:
# as we don't have a slow version, we can't compare the outputs between slow and fast versions
return
self.tokenizers_list[0] = (self.rust_tokenizer_class, "hf-internal-testing/tiny-random-mbart50", {})
for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs)
tokenizer_p = self.tokenizer_class.from_pretrained(pretrained_name, **kwargs)
tmpdirname2 = tempfile.mkdtemp()
tokenizer_r_files = tokenizer_r.save_pretrained(tmpdirname2)
tokenizer_p_files = tokenizer_p.save_pretrained(tmpdirname2)
# Checks it save with the same files + the tokenizer.json file for the fast one
self.assertTrue(any("tokenizer.json" in f for f in tokenizer_r_files))
tokenizer_r_files = tuple(f for f in tokenizer_r_files if "tokenizer.json" not in f)
self.assertSequenceEqual(tokenizer_r_files, tokenizer_p_files)
# Checks everything loads correctly in the same way
tokenizer_rp = tokenizer_r.from_pretrained(tmpdirname2)
tokenizer_pp = tokenizer_p.from_pretrained(tmpdirname2)
# Check special tokens are set accordingly on Rust and Python
for key in tokenizer_pp.special_tokens_map:
self.assertTrue(hasattr(tokenizer_rp, key))
# self.assertEqual(getattr(tokenizer_rp, key), getattr(tokenizer_pp, key))
# self.assertEqual(getattr(tokenizer_rp, key + "_id"), getattr(tokenizer_pp, key + "_id"))
shutil.rmtree(tmpdirname2)
# Save tokenizer rust, legacy_format=True
tmpdirname2 = tempfile.mkdtemp()
tokenizer_r_files = tokenizer_r.save_pretrained(tmpdirname2, legacy_format=True)
tokenizer_p_files = tokenizer_p.save_pretrained(tmpdirname2)
# Checks it save with the same files
self.assertSequenceEqual(tokenizer_r_files, tokenizer_p_files)
# Checks everything loads correctly in the same way
tokenizer_rp = tokenizer_r.from_pretrained(tmpdirname2)
tokenizer_pp = tokenizer_p.from_pretrained(tmpdirname2)
# Check special tokens are set accordingly on Rust and Python
for key in tokenizer_pp.special_tokens_map:
self.assertTrue(hasattr(tokenizer_rp, key))
shutil.rmtree(tmpdirname2)
# Save tokenizer rust, legacy_format=False
tmpdirname2 = tempfile.mkdtemp()
tokenizer_r_files = tokenizer_r.save_pretrained(tmpdirname2, legacy_format=False)
tokenizer_p_files = tokenizer_p.save_pretrained(tmpdirname2)
# Checks it saved the tokenizer.json file
self.assertTrue(any("tokenizer.json" in f for f in tokenizer_r_files))
# Checks everything loads correctly in the same way
tokenizer_rp = tokenizer_r.from_pretrained(tmpdirname2)
tokenizer_pp = tokenizer_p.from_pretrained(tmpdirname2)
# Check special tokens are set accordingly on Rust and Python
for key in tokenizer_pp.special_tokens_map:
self.assertTrue(hasattr(tokenizer_rp, key))
shutil.rmtree(tmpdirname2)
@require_torch @require_torch
@require_sentencepiece @require_sentencepiece
@require_tokenizers @require_tokenizers
class MBartOneToManyIntegrationTest(unittest.TestCase): class MBart50OneToManyIntegrationTest(unittest.TestCase):
checkpoint_name = "facebook/mbart-large-50-one-to-many-mmt" checkpoint_name = "facebook/mbart-large-50-one-to-many-mmt"
src_text = [ src_text = [
" UN Chief Says There Is No Military Solution in Syria", " UN Chief Says There Is No Military Solution in Syria",
......
...@@ -70,9 +70,8 @@ class MLukeTokenizerTest(TokenizerTesterMixin, unittest.TestCase): ...@@ -70,9 +70,8 @@ class MLukeTokenizerTest(TokenizerTesterMixin, unittest.TestCase):
[35378, 8999, 38, 33273, 11676, 604, 365, 21392, 201, 1819], [35378, 8999, 38, 33273, 11676, 604, 365, 21392, 201, 1819],
) )
@slow
def test_sequence_builders(self): def test_sequence_builders(self):
tokenizer = self.tokenizer_class.from_pretrained("studio-ousia/mluke-base") tokenizer = self.tokenizer_class.from_pretrained("hf-internal-testing/tiny-random-mluke")
text = tokenizer.encode("sequence builders", add_special_tokens=False) text = tokenizer.encode("sequence builders", add_special_tokens=False)
text_2 = tokenizer.encode("multi-sequence build", add_special_tokens=False) text_2 = tokenizer.encode("multi-sequence build", add_special_tokens=False)
......
...@@ -140,6 +140,78 @@ class XLMRobertaTokenizationTest(TokenizerTesterMixin, unittest.TestCase): ...@@ -140,6 +140,78 @@ class XLMRobertaTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
], ],
) )
# overwrite from test_tokenization_common to speed up test
def test_save_pretrained(self):
if not self.test_slow_tokenizer:
# as we don't have a slow version, we can't compare the outputs between slow and fast versions
return
self.tokenizers_list[0] = (self.rust_tokenizer_class, "hf-internal-testing/tiny-xlm-roberta", {})
for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs)
tokenizer_p = self.tokenizer_class.from_pretrained(pretrained_name, **kwargs)
tmpdirname2 = tempfile.mkdtemp()
tokenizer_r_files = tokenizer_r.save_pretrained(tmpdirname2)
tokenizer_p_files = tokenizer_p.save_pretrained(tmpdirname2)
# Checks it save with the same files + the tokenizer.json file for the fast one
self.assertTrue(any("tokenizer.json" in f for f in tokenizer_r_files))
tokenizer_r_files = tuple(f for f in tokenizer_r_files if "tokenizer.json" not in f)
self.assertSequenceEqual(tokenizer_r_files, tokenizer_p_files)
# Checks everything loads correctly in the same way
tokenizer_rp = tokenizer_r.from_pretrained(tmpdirname2)
tokenizer_pp = tokenizer_p.from_pretrained(tmpdirname2)
# Check special tokens are set accordingly on Rust and Python
for key in tokenizer_pp.special_tokens_map:
self.assertTrue(hasattr(tokenizer_rp, key))
# self.assertEqual(getattr(tokenizer_rp, key), getattr(tokenizer_pp, key))
# self.assertEqual(getattr(tokenizer_rp, key + "_id"), getattr(tokenizer_pp, key + "_id"))
shutil.rmtree(tmpdirname2)
# Save tokenizer rust, legacy_format=True
tmpdirname2 = tempfile.mkdtemp()
tokenizer_r_files = tokenizer_r.save_pretrained(tmpdirname2, legacy_format=True)
tokenizer_p_files = tokenizer_p.save_pretrained(tmpdirname2)
# Checks it save with the same files
self.assertSequenceEqual(tokenizer_r_files, tokenizer_p_files)
# Checks everything loads correctly in the same way
tokenizer_rp = tokenizer_r.from_pretrained(tmpdirname2)
tokenizer_pp = tokenizer_p.from_pretrained(tmpdirname2)
# Check special tokens are set accordingly on Rust and Python
for key in tokenizer_pp.special_tokens_map:
self.assertTrue(hasattr(tokenizer_rp, key))
shutil.rmtree(tmpdirname2)
# Save tokenizer rust, legacy_format=False
tmpdirname2 = tempfile.mkdtemp()
tokenizer_r_files = tokenizer_r.save_pretrained(tmpdirname2, legacy_format=False)
tokenizer_p_files = tokenizer_p.save_pretrained(tmpdirname2)
# Checks it saved the tokenizer.json file
self.assertTrue(any("tokenizer.json" in f for f in tokenizer_r_files))
# Checks everything loads correctly in the same way
tokenizer_rp = tokenizer_r.from_pretrained(tmpdirname2)
tokenizer_pp = tokenizer_p.from_pretrained(tmpdirname2)
# Check special tokens are set accordingly on Rust and Python
for key in tokenizer_pp.special_tokens_map:
self.assertTrue(hasattr(tokenizer_rp, key))
shutil.rmtree(tmpdirname2)
@cached_property @cached_property
def big_tokenizer(self): def big_tokenizer(self):
return XLMRobertaTokenizer.from_pretrained("xlm-roberta-base") return XLMRobertaTokenizer.from_pretrained("xlm-roberta-base")
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment