Unverified Commit 37ed3ab7 authored by Philip May, committed by GitHub

Enable option for subword regularization in more tokenizers. (#11417)

* improve slow class tok usage at xlm rob

* add subword regularization for barthez

* improve barthez tok. test

* fix tokenizer tests

* add subword regularization for camembert

* add subword regularization for deberta v2 tokenizer

* add more doc to deberta v2 tokenizer

* add subword regularization for speech to text tok.

* fix sp_model_kwargs type in speech 2 text tok.

* add subword regularization for M2M100 tok.

* add more concrete type hints

* fix tests for m2m100 and s2t tok.

* add missing Any import

* fix syntax error in m2m100 tok.

* fix unpickle of m2m100 and s2t tok.

* fix test of m2m100 and s2t tok.

* improve unpickle of deberta v2 tok.

* add test for pickle of barthez & camembert

* fix pickle of barthez & camembert

* add test for deberta v2 tok. pickle

* fix m2m100 tok. pickle

* fix s2t tok. pickle

* add subword regularization to albert tok.

* refactor subword reg. test into TokenizerTesterMixin

improve albert tok. test

remove sample argument from albert tok.

check subword reg. using TokenizerTesterMixin

improve tok. tests

improve xlm roberta tok. tests

improve xlm roberta tok. tests

* add subword regularization for big bird t.

* improve xlm roberta tok. test

* add subword regularization for mbart50 tok.

* add subword regularization for pegasus tok.

* add subword regularization for reformer tok.

* add subword regularization for T5 tok.

* fix t5 tok. test formatting

* add subword regularization for xlm_proph. tok.

* add subword regularization for xlnet tok.

* add subword regularization for bert_gen tok.

* add typing to tokenizers

* add typing to xlm rob. tok

* add subword regularization for marian tok.

* add reverse tok. test

* fix marian tok test

* fix marian tok test

* fix casing in tok. tests

* fix style of tok. common test

* fix deberta v2 tok test

* add type annotations to tok. tests

* add type annotations to tok. __init__

* add typing to tokenizer

* add type annotations to tok. __init__

* don't specify the default when it's None

* fix barthez tok. doc

* move sentencepiece tok. tests to TokenizerTesterMixin

* fix unused imports

* fix albert tok. test

* add comment to sentencepiece test options

* fix Any import at big bird tok.

* fix Any import at xlm prophetnet tok.

* empty commit to trigger CI
parent fa84540e
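
As a quick illustration of what this PR enables (not part of the diff below): passing `sp_model_kwargs` to one of the affected slow tokenizers turns on SentencePiece subword sampling, so repeated tokenizations of the same text can differ. This is a minimal sketch assuming the `xlm-roberta-base` checkpoint is available; the kwargs mirror the values used in the tests.

from transformers import XLMRobertaTokenizer

# Sketch only: enable sampling-based subword regularization on a slow tokenizer.
tokenizer = XLMRobertaTokenizer.from_pretrained(
    "xlm-roberta-base",
    sp_model_kwargs={"enable_sampling": True, "alpha": 0.1, "nbest_size": -1},
)

# With sampling enabled, the segmentation of the same sentence can change between calls.
for _ in range(3):
    print(tokenizer.tokenize("This is a test for subword regularization."))
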
@@ -13,7 +13,6 @@
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import unittest
@@ -37,6 +36,7 @@ class CamembertTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
    tokenizer_class = CamembertTokenizer
    rust_tokenizer_class = CamembertTokenizerFast
    test_rust_tokenizer = True
    test_sentencepiece = True

    def setUp(self):
        super().setUp()
...
@@ -15,6 +15,7 @@
import inspect
import itertools
import os
import pickle
import re
@@ -100,6 +101,13 @@ class TokenizerTesterMixin:
    from_pretrained_vocab_key = "vocab_file"
    test_seq2seq = True

    # set to True to test a sentencepiece tokenizer
    test_sentencepiece = False
    # set to True to ignore casing when testing a sentencepiece tokenizer
    # test_sentencepiece must also be set to True
    test_sentencepiece_ignore_case = False

    def setUp(self) -> None:
        # Tokenizer.filter makes it possible to filter which Tokenizer to case based on all the
        # information available in Tokenizer (name, rust class, python class, vocab key name)
@@ -216,6 +224,38 @@ class TokenizerTesterMixin:
            for i in range(len(batch_encode_plus_sequences["input_ids"]))
        ]

    def test_subword_regularization_tokenizer(self) -> None:
        if not self.test_sentencepiece:
            return

        # Subword regularization is only available for the slow tokenizer.
        sp_model_kwargs = {"enable_sampling": True, "alpha": 0.1, "nbest_size": -1}
        tokenizer = self.get_tokenizer(sp_model_kwargs=sp_model_kwargs)

        self.assertTrue(hasattr(tokenizer, "sp_model_kwargs"))
        self.assertIsNotNone(tokenizer.sp_model_kwargs)
        self.assertTrue(isinstance(tokenizer.sp_model_kwargs, dict))
        self.assertEqual(tokenizer.sp_model_kwargs, sp_model_kwargs)
        self.check_subword_sampling(tokenizer)

    def test_pickle_subword_regularization_tokenizer(self) -> None:
        """Google `pickle __getstate__ __setstate__` if you are struggling with this."""
        if not self.test_sentencepiece:
            return

        # Subword regularization is only available for the slow tokenizer.
        sp_model_kwargs = {"enable_sampling": True, "alpha": 0.1, "nbest_size": -1}
        tokenizer = self.get_tokenizer(sp_model_kwargs=sp_model_kwargs)
        tokenizer_bin = pickle.dumps(tokenizer)
        del tokenizer
        tokenizer_new = pickle.loads(tokenizer_bin)

        self.assertTrue(hasattr(tokenizer_new, "sp_model_kwargs"))
        self.assertIsNotNone(tokenizer_new.sp_model_kwargs)
        self.assertTrue(isinstance(tokenizer_new.sp_model_kwargs, dict))
        self.assertEqual(tokenizer_new.sp_model_kwargs, sp_model_kwargs)
        self.check_subword_sampling(tokenizer_new)

    def test_model_input_names_signature(self):
        accepted_model_main_input_names = [
            "input_ids",  # nlp models
@@ -1727,6 +1767,46 @@ class TokenizerTesterMixin:
        # add pad_token_id to pass subsequent tests
        tokenizer.add_special_tokens({"pad_token": "<PAD>"})

    def check_subword_sampling(
        self,
        tokenizer: PreTrainedTokenizer,
        text: str = None,
    ) -> None:
        """
        Check if the tokenizer generates different results when subword regularization is enabled.

        Subword regularization augments training data with subword sampling.
        This has a random component.

        Args:
            tokenizer: The tokenizer to check.
            text: The text to use for the checks.
        """
        text = "This is a test for subword regularization." if text is None else text
        if self.test_sentencepiece_ignore_case:
            text = text.lower()

        tokens_list = []
        for _ in range(5):
            tokens_list.append(tokenizer.tokenize(text))

        # all pairs of tokenizations from tokens_list
        combinations = itertools.combinations(tokens_list, 2)

        # check if sampling is done
        subword_sampling_found = False
        for combination in combinations:
            if combination[0] != combination[1]:
                subword_sampling_found = True
        self.assertTrue(subword_sampling_found)

        # check if converting back to the original text works
        for tokens in tokens_list:
            if self.test_sentencepiece_ignore_case:
                self.assertEqual(text, tokenizer.convert_tokens_to_string(tokens).lower())
            else:
                self.assertEqual(text, tokenizer.convert_tokens_to_string(tokens))

    @require_torch
    @slow
    def test_torch_encode_plus_sent_to_model(self):
...
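
Several commits in the list above fix pickling of these tokenizers, and the docstring of `test_pickle_subword_regularization_tokenizer` hints at the mechanism: the slow SentencePiece tokenizers drop the unpicklable `SentencePieceProcessor` in `__getstate__` and rebuild it from the stored `sp_model_kwargs` in `__setstate__`. The class below is an illustrative sketch of that pattern, not the literal code of any tokenizer touched in this diff.

# Illustrative sketch of the pickling pattern used by slow SentencePiece
# tokenizers so that `sp_model_kwargs` survives a pickle round trip.
import sentencepiece as spm

class _SpTokenizerSketch:
    def __init__(self, vocab_file, sp_model_kwargs=None):
        self.vocab_file = vocab_file
        self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs
        self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
        self.sp_model.Load(vocab_file)

    def __getstate__(self):
        # The SentencePieceProcessor itself is not picklable; drop it.
        state = self.__dict__.copy()
        state["sp_model"] = None
        return state

    def __setstate__(self, d):
        self.__dict__ = d
        # Older pickles may predate `sp_model_kwargs`; default to an empty dict.
        if not hasattr(self, "sp_model_kwargs"):
            self.sp_model_kwargs = {}
        # Rebuild the processor with the stored kwargs and reload the model file.
        self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
        self.sp_model.Load(self.vocab_file)
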
@@ -13,7 +13,6 @@
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import unittest
@@ -33,6 +32,8 @@ class DebertaV2TokenizationTest(TokenizerTesterMixin, unittest.TestCase):
    tokenizer_class = DebertaV2Tokenizer
    rust_tokenizer_class = None
    test_rust_tokenizer = False
    test_sentencepiece = True
    test_sentencepiece_ignore_case = True

    def setUp(self):
        super().setUp()
...
@@ -45,6 +45,7 @@ class M2M100TokenizationTest(TokenizerTesterMixin, unittest.TestCase):
    tokenizer_class = M2M100Tokenizer
    test_rust_tokenizer = False
    test_seq2seq = False
    test_sentencepiece = True

    def setUp(self):
        super().setUp()
...
@@ -13,7 +13,6 @@
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import tempfile
import unittest
@@ -50,6 +49,7 @@ class MarianTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
    tokenizer_class = MarianTokenizer
    test_rust_tokenizer = False
    test_sentencepiece = True

    def setUp(self):
        super().setUp()
...
@@ -38,6 +38,7 @@ class MBartTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
    tokenizer_class = MBart50Tokenizer
    rust_tokenizer_class = MBart50TokenizerFast
    test_rust_tokenizer = True
    test_sentencepiece = True

    def setUp(self):
        super().setUp()
...
@@ -31,6 +31,7 @@ class PegasusTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
    tokenizer_class = PegasusTokenizer
    rust_tokenizer_class = PegasusTokenizerFast
    test_rust_tokenizer = True
    test_sentencepiece = True

    def setUp(self):
        super().setUp()
@@ -104,6 +105,7 @@ class BigBirdPegasusTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
    tokenizer_class = PegasusTokenizer
    rust_tokenizer_class = PegasusTokenizerFast
    test_rust_tokenizer = True
    test_sentencepiece = True

    def setUp(self):
        super().setUp()
...
@@ -12,7 +12,6 @@
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import unittest
@@ -34,6 +33,7 @@ class ReformerTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
    rust_tokenizer_class = ReformerTokenizerFast
    test_rust_tokenizer = True
    test_seq2seq = False
    test_sentencepiece = True

    def setUp(self):
        super().setUp()
...
@@ -40,6 +40,7 @@ ES_CODE = 10
class SpeechToTextTokenizerTest(TokenizerTesterMixin, unittest.TestCase):
    tokenizer_class = Speech2TextTokenizer
    test_rust_tokenizer = False
    test_sentencepiece = True

    def setUp(self):
        super().setUp()
...
@@ -13,7 +13,6 @@
# See the License for the specific language governing permissions and
# limitations under the License.
import unittest

from transformers import SPIECE_UNDERLINE, AddedToken, BatchEncoding, T5Tokenizer, T5TokenizerFast
@@ -40,6 +39,7 @@ class T5TokenizationTest(TokenizerTesterMixin, unittest.TestCase):
    tokenizer_class = T5Tokenizer
    rust_tokenizer_class = T5TokenizerFast
    test_rust_tokenizer = True
    test_sentencepiece = True

    def setUp(self):
        super().setUp()
...
@@ -13,7 +13,6 @@
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import unittest
@@ -32,6 +31,7 @@ class XLMProphetNetTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
    tokenizer_class = XLMProphetNetTokenizer
    test_rust_tokenizer = False
    test_sentencepiece = True

    def setUp(self):
        super().setUp()
...
@@ -13,10 +13,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.
import itertools
import os
import pickle
import unittest

from transformers import SPIECE_UNDERLINE, XLMRobertaTokenizer, XLMRobertaTokenizerFast
@@ -36,6 +33,7 @@ class XLMRobertaTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
    tokenizer_class = XLMRobertaTokenizer
    rust_tokenizer_class = XLMRobertaTokenizerFast
    test_rust_tokenizer = True
    test_sentencepiece = True

    def setUp(self):
        super().setUp()
@@ -120,41 +118,6 @@ class XLMRobertaTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
            ],
        )

    def test_subword_regularization_tokenizer(self):
        # Subword regularization is only available for the slow tokenizer.
        tokenizer = XLMRobertaTokenizer(
            SAMPLE_VOCAB, keep_accents=True, sp_model_kwargs={"enable_sampling": True, "alpha": 0.1, "nbest_size": -1}
        )

        # Subword regularization augments training data with subword sampling.
        # This has a random component. We test if the tokenizer generates different
        # results when subword regularization is enabled.
        tokens_list = []
        for _ in range(5):
            tokens_list.append(tokenizer.tokenize("This is a test for subword regularization."))

        # all pairs of tokenizations from tokens_list
        combinations = itertools.combinations(tokens_list, 2)

        all_equal = True
        for combination in combinations:
            if combination[0] != combination[1]:
                all_equal = False

        self.assertFalse(all_equal)

    def test_pickle_subword_regularization_tokenizer(self):
        """Google `pickle __getstate__ __setstate__` if you are struggling with this."""
        # Subword regularization is only available for the slow tokenizer.
        sp_model_kwargs = {"enable_sampling": True, "alpha": 0.1, "nbest_size": -1}
        tokenizer = XLMRobertaTokenizer(SAMPLE_VOCAB, keep_accents=True, sp_model_kwargs=sp_model_kwargs)
        tokenizer_bin = pickle.dumps(tokenizer)
        tokenizer_new = pickle.loads(tokenizer_bin)
        self.assertIsNotNone(tokenizer_new.sp_model_kwargs)
        self.assertTrue(isinstance(tokenizer_new.sp_model_kwargs, dict))
        self.assertEqual(tokenizer_new.sp_model_kwargs, sp_model_kwargs)

    @cached_property
    def big_tokenizer(self):
        return XLMRobertaTokenizer.from_pretrained("xlm-roberta-base")
...
@@ -13,7 +13,6 @@
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import unittest
@@ -33,6 +32,7 @@ class XLNetTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
    tokenizer_class = XLNetTokenizer
    rust_tokenizer_class = XLNetTokenizerFast
    test_rust_tokenizer = True
    test_sentencepiece = True

    def setUp(self):
        super().setUp()
...