"...static/style/git@developer.sourcefind.cn:OpenDAS/nni.git" did not exist on "cbb63c5bec618354a25583c0861f45d4a01d9812"
Unverified Commit 3aa37b94 authored by SaulLu, committed by GitHub

Add test for a WordLevel tokenizer model (#12437)

* add a test for a WordLevel tokenizer

* adapt common test to new tokenizer
parent 0d1f67e6
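
The diff below exercises a WordLevel backend through `PreTrainedTokenizerFast`. For context, here is a minimal sketch of how such a tokenizer can be built with the `tokenizers` library and wrapped as a fast tokenizer; the training corpus and special tokens are illustrative assumptions, not the contents of the `robot-test/dummy-tokenizer-wordlevel` checkpoint:

```python
# Sketch: a tiny WordLevel tokenizer wrapped as a PreTrainedTokenizerFast.
# The corpus and special tokens are made up for illustration.
from tokenizers import Tokenizer
from tokenizers.models import WordLevel
from tokenizers.pre_tokenizers import Whitespace
from tokenizers.trainers import WordLevelTrainer
from transformers import PreTrainedTokenizerFast

backend = Tokenizer(WordLevel(unk_token="[UNK]"))
backend.pre_tokenizer = Whitespace()  # split on whitespace/punctuation before vocab lookup
trainer = WordLevelTrainer(special_tokens=["[UNK]", "[PAD]"])
backend.train_from_iterator(["This is the first sentence"], trainer=trainer)

# Wrap the Rust tokenizer so it exposes the usual fast-tokenizer API
# (encode/decode/save_pretrained) used by the tests in this commit.
fast_tokenizer = PreTrainedTokenizerFast(
    tokenizer_object=backend, unk_token="[UNK]", pad_token="[PAD]"
)
```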
@@ -3168,11 +3168,8 @@ class TokenizerTesterMixin:
         decoded_input = new_tokenizer.decode(inputs["input_ids"][0], skip_special_tokens=True)
 
         expected_result = "This is the first sentence"
-        # OpenAIGPT always lowercases and has no arg.
-        if new_tokenizer.init_kwargs.get("do_lower_case", False) or tokenizer.__class__.__name__.startswith(
-            "OpenAIGPT"
-        ):
-            expected_result = expected_result.lower()
+        if tokenizer.backend_tokenizer.normalizer is not None:
+            expected_result = tokenizer.backend_tokenizer.normalizer.normalize_str(expected_result)
         self.assertEqual(expected_result, decoded_input)
 
         # We check that the parameters of the tokenizer remained the same
@@ -3287,11 +3284,8 @@ class TokenizerTesterMixin:
         decoded_input = new_tokenizer.decode(inputs["input_ids"][0], skip_special_tokens=True)
 
         expected_result = "This is the first sentence"
-        # OpenAIGPT always lowercases and has no arg.
-        if new_tokenizer.init_kwargs.get("do_lower_case", False) or tokenizer.__class__.__name__.startswith(
-            "OpenAIGPT"
-        ):
-            expected_result = expected_result.lower()
+        if tokenizer.backend_tokenizer.normalizer is not None:
+            expected_result = tokenizer.backend_tokenizer.normalizer.normalize_str(expected_result)
         self.assertEqual(expected_result, decoded_input)
...
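The two hunks above replace a hard-coded `do_lower_case`/OpenAIGPT special case with the normalizer attached to the tokenizer's own backend, which also covers checkpoints (such as the WordLevel one added here) whose normalization is not lowercasing. A hedged illustration of the call the new assertion relies on, with example normalizers chosen for the sketch:

```python
# Sketch: a tokenizers normalizer can be applied to a raw string with
# normalize_str, which is what the updated test uses to build the expected
# decoding. Lowercase/StripAccents are example choices, not what any
# particular checkpoint carries.
from tokenizers.normalizers import Lowercase, Sequence, StripAccents

normalizer = Sequence([Lowercase(), StripAccents()])
print(normalizer.normalize_str("This is the first sentence"))
# -> "this is the first sentence"
```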
@@ -13,6 +13,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
+import shutil
+import tempfile
 import unittest
 
 from transformers import PreTrainedTokenizerFast
@@ -33,9 +35,12 @@ class PreTrainedTokenizationFastTest(TokenizerTesterMixin, unittest.TestCase):
         super().setUp()
         self.test_rust_tokenizer = True
-        self.tokenizers_list = [(PreTrainedTokenizerFast, "robot-test/dummy-tokenizer-fast", {})]
-        tokenizer = PreTrainedTokenizerFast.from_pretrained("robot-test/dummy-tokenizer-fast")
+        model_paths = ["robot-test/dummy-tokenizer-fast", "robot-test/dummy-tokenizer-wordlevel"]
+        # Inclusion of 2 tokenizers to test different types of models (Unigram and WordLevel for the moment)
+        self.tokenizers_list = [(PreTrainedTokenizerFast, model_path, {}) for model_path in model_paths]
+
+        tokenizer = PreTrainedTokenizerFast.from_pretrained(model_paths[0])
         tokenizer.save_pretrained(self.tmpdirname)
 
     def test_pretrained_model_lists(self):
@@ -51,3 +56,37 @@ class PreTrainedTokenizationFastTest(TokenizerTesterMixin, unittest.TestCase):
     def test_rust_tokenizer_signature(self):
         # PreTrainedTokenizerFast doesn't have tokenizer_file in its signature
         pass
+
+    def test_training_new_tokenizer(self):
+        tmpdirname_orig = self.tmpdirname
+
+        # Here we want to test the 2 available tokenizers that use 2 different types of models: Unigram and WordLevel.
+        for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
+            with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
+                try:
+                    self.tmpdirname = tempfile.mkdtemp()
+                    tokenizer = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs)
+                    tokenizer.save_pretrained(self.tmpdirname)
+                    super().test_training_new_tokenizer()
+                finally:
+                    # Even if the test fails, we must be sure that the folder is deleted and that the default tokenizer
+                    # is restored
+                    shutil.rmtree(self.tmpdirname)
+                    self.tmpdirname = tmpdirname_orig
+
+    def test_training_new_tokenizer_with_special_tokens_change(self):
+        tmpdirname_orig = self.tmpdirname
+
+        # Here we want to test the 2 available tokenizers that use 2 different types of models: Unigram and WordLevel.
+        for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
+            with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
+                try:
+                    self.tmpdirname = tempfile.mkdtemp()
+                    tokenizer = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs)
+                    tokenizer.save_pretrained(self.tmpdirname)
+                    super().test_training_new_tokenizer_with_special_tokens_change()
+                finally:
+                    # Even if the test fails, we must be sure that the folder is deleted and that the default tokenizer
+                    # is restored
+                    shutil.rmtree(self.tmpdirname)
+                    self.tmpdirname = tmpdirname_orig
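
The two new tests loop over the Unigram and WordLevel checkpoints, point `self.tmpdirname` at a fresh temporary directory for each so the tokenizer saved in `setUp` is never clobbered, and delegate to the mixin's test of the same name. Those mixin tests ultimately exercise retraining via `train_new_from_iterator`; a minimal sketch of that call, with an illustrative toy corpus and vocab size:

```python
# Sketch: retraining a fast tokenizer of the same model type from raw text,
# which is the behaviour the looped tests above exercise.
from transformers import PreTrainedTokenizerFast

tokenizer = PreTrainedTokenizerFast.from_pretrained("robot-test/dummy-tokenizer-wordlevel")
corpus = ["This is the first sentence", "This is the second one"]  # toy data
new_tokenizer = tokenizer.train_new_from_iterator(corpus, vocab_size=100)
```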