Unverified Commit 0c65fb7c authored by José Ángel Rey Liñares, committed by GitHub

chore: allow protobuf 3.20.3 requirement (#22759)



* chore: allow protobuf 3.20.3

Allow latest bugfix release for protobuf (3.20.3)

* chore: update auto-generated dependency table

update auto-generated dependency table

* run in subprocess

* Apply suggestions from code review
Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com>

* Apply suggestions

---------
Co-authored-by: ydshieh <ydshieh@users.noreply.github.com>
Co-authored-by: Yih-Dar <2521628+ydshieh@users.noreply.github.com>
Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com>
parent eb5b5ce6
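The change below only widens the protobuf pin from `<=3.20.2` to `<=3.20.3`. As a minimal sketch (not part of this commit) of how one might verify locally that an installed protobuf still satisfies the relaxed constraint, using only the standard `importlib.metadata` and the `packaging` library already listed in the dependencies:

```python
# Hypothetical local check, not part of this commit: compare the installed
# protobuf version against the relaxed "<=3.20.3" specifier from setup.py.
from importlib.metadata import version

from packaging.specifiers import SpecifierSet
from packaging.version import Version

installed = Version(version("protobuf"))
allowed = SpecifierSet("<=3.20.3")

print(f"protobuf {installed} allowed: {installed in allowed}")
```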
--- a/setup.py
+++ b/setup.py
@@ -145,7 +145,7 @@ _deps = [
     "packaging>=20.0",
     "parameterized",
     "phonemizer",
-    "protobuf<=3.20.2",
+    "protobuf<=3.20.3",
     "psutil",
     "pyyaml>=5.1",
     "pydantic",
--- a/src/transformers/utils/dependency_versions_table.py
+++ b/src/transformers/utils/dependency_versions_table.py
@@ -46,7 +46,7 @@ deps = {
     "packaging": "packaging>=20.0",
     "parameterized": "parameterized",
     "phonemizer": "phonemizer",
-    "protobuf": "protobuf<=3.20.2",
+    "protobuf": "protobuf<=3.20.3",
     "psutil": "psutil",
     "pyyaml": "pyyaml>=5.1",
     "pydantic": "pydantic",
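The second hunk touches the auto-generated table that mirrors the `_deps` list in `setup.py`, which is why both files change in lockstep. As an illustration only (this is not the repository's actual generation script), the mapping can be derived by splitting each requirement string at its first version operator:

```python
# Illustrative only: derive a {name: requirement} mapping like the one in
# dependency_versions_table.py from setup.py-style requirement strings.
import re

_deps = ["packaging>=20.0", "parameterized", "phonemizer", "protobuf<=3.20.3", "psutil", "pyyaml>=5.1"]

# The distribution name is everything before the first version operator.
deps = {re.match(r"^[A-Za-z0-9_.-]+", d).group(0): d for d in _deps}

assert deps["protobuf"] == "protobuf<=3.20.3"
print(deps)
```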
--- a/tests/test_tokenization_common.py
+++ b/tests/test_tokenization_common.py
@@ -23,6 +23,7 @@ import re
 import shutil
 import sys
 import tempfile
+import traceback
 import unittest
 import unittest.mock as mock
 from collections import OrderedDict
@@ -64,6 +65,7 @@ from transformers.testing_utils import (
     require_tf,
     require_tokenizers,
     require_torch,
+    run_test_in_subprocess,
     slow,
 )
 from transformers.tokenization_utils import AddedToken, Trie
@@ -131,6 +133,71 @@ def merge_model_tokenizer_mappings(
     return model_tokenizer_mapping


+def _test_subword_regularization_tokenizer(in_queue, out_queue, timeout):
+    error = None
+
+    try:
+        inputs = in_queue.get(timeout=timeout)
+        tokenizer = inputs["tokenizer"]
+        sp_model_kwargs = inputs["sp_model_kwargs"]
+        test_sentencepiece_ignore_case = inputs["test_sentencepiece_ignore_case"]
+
+        unittest.TestCase().assertTrue(hasattr(tokenizer, "sp_model_kwargs"))
+        unittest.TestCase().assertIsNotNone(tokenizer.sp_model_kwargs)
+        unittest.TestCase().assertTrue(isinstance(tokenizer.sp_model_kwargs, dict))
+        unittest.TestCase().assertDictEqual(tokenizer.sp_model_kwargs, sp_model_kwargs)
+
+        check_subword_sampling(tokenizer, test_sentencepiece_ignore_case=test_sentencepiece_ignore_case)
+
+    except Exception:
+        error = f"{traceback.format_exc()}"
+
+    results = {"error": error}
+    out_queue.put(results, timeout=timeout)
+    out_queue.join()
+
+
+def check_subword_sampling(
+    tokenizer: PreTrainedTokenizer,
+    text: str = None,
+    test_sentencepiece_ignore_case: bool = True,
+) -> None:
+    """
+    Check if the tokenizer generates different results when subword regularization is enabled.
+
+    Subword regularization augments training data with subword sampling. This has a random component.
+
+    Args:
+        tokenizer: The tokenizer to check.
+        text: The text to use for the checks.
+        test_sentencepiece_ignore_case: See `TokenizerTesterMixin.test_sentencepiece_ignore_case`.
+    """
+    text = "This is a test for subword regularization." if text is None else text
+    if test_sentencepiece_ignore_case:
+        text = text.lower()
+
+    tokens_list = []
+    for _ in range(5):
+        tokens_list.append(tokenizer.tokenize(text))
+
+    # the list of different pairs of tokens_list
+    combinations = itertools.combinations(tokens_list, 2)
+
+    # check if sampling is done
+    subword_sampling_found = False
+    for combination in combinations:
+        if combination[0] != combination[1]:
+            subword_sampling_found = True
+    unittest.TestCase().assertTrue(subword_sampling_found)
+
+    # check if converting back to original text works
+    for tokens in tokens_list:
+        if test_sentencepiece_ignore_case:
+            unittest.TestCase().assertEqual(text, tokenizer.convert_tokens_to_string(tokens).lower())
+        else:
+            unittest.TestCase().assertEqual(text, tokenizer.convert_tokens_to_string(tokens))
+
+
 class TokenizerTesterMixin:
     tokenizer_class = None
     rust_tokenizer_class = None
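The new module-level worker follows a simple contract: it reads its inputs from `in_queue`, runs the assertions, reports either `None` or a formatted traceback under the `"error"` key on `out_queue`, and then blocks on `out_queue.join()` until the parent acknowledges the result. A rough sketch of a driver honouring that contract follows; the real `run_test_in_subprocess` helper in `transformers.testing_utils` may differ in details such as timeout handling and process cleanup.

```python
# Rough sketch of a driver for the worker contract above; the actual
# transformers.testing_utils.run_test_in_subprocess may differ.
import multiprocessing


def run_in_subprocess_sketch(test_case, target_func, inputs, timeout=600):
    ctx = multiprocessing.get_context("spawn")
    in_queue = ctx.Queue(1)
    out_queue = ctx.JoinableQueue(1)

    # The worker reads its inputs from in_queue ...
    in_queue.put(inputs, timeout=timeout)

    process = ctx.Process(target=target_func, args=(in_queue, out_queue, timeout))
    process.start()

    # ... and reports {"error": ...} back on out_queue.
    results = out_queue.get(timeout=timeout)
    out_queue.task_done()  # unblocks the worker's out_queue.join()
    process.join(timeout=timeout)

    if results["error"] is not None:
        test_case.fail(results["error"])
```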
@@ -420,11 +487,15 @@ class TokenizerTesterMixin:
         sp_model_kwargs = {"enable_sampling": True, "alpha": 0.1, "nbest_size": -1}
         tokenizer = self.get_tokenizer(sp_model_kwargs=sp_model_kwargs)

-        self.assertTrue(hasattr(tokenizer, "sp_model_kwargs"))
-        self.assertIsNotNone(tokenizer.sp_model_kwargs)
-        self.assertTrue(isinstance(tokenizer.sp_model_kwargs, dict))
-        self.assertEqual(tokenizer.sp_model_kwargs, sp_model_kwargs)
-        self.check_subword_sampling(tokenizer)
+        run_test_in_subprocess(
+            test_case=self,
+            target_func=_test_subword_regularization_tokenizer,
+            inputs={
+                "tokenizer": tokenizer,
+                "sp_model_kwargs": sp_model_kwargs,
+                "test_sentencepiece_ignore_case": self.test_sentencepiece_ignore_case,
+            },
+        )

     def test_pickle_subword_regularization_tokenizer(self) -> None:
         if not self.test_sentencepiece:
@@ -438,11 +509,15 @@ class TokenizerTesterMixin:
         del tokenizer
         tokenizer_new = pickle.loads(tokenizer_bin)

-        self.assertTrue(hasattr(tokenizer_new, "sp_model_kwargs"))
-        self.assertIsNotNone(tokenizer_new.sp_model_kwargs)
-        self.assertTrue(isinstance(tokenizer_new.sp_model_kwargs, dict))
-        self.assertEqual(tokenizer_new.sp_model_kwargs, sp_model_kwargs)
-        self.check_subword_sampling(tokenizer_new)
+        run_test_in_subprocess(
+            test_case=self,
+            target_func=_test_subword_regularization_tokenizer,
+            inputs={
+                "tokenizer": tokenizer_new,
+                "sp_model_kwargs": sp_model_kwargs,
+                "test_sentencepiece_ignore_case": self.test_sentencepiece_ignore_case,
+            },
+        )

     def test_save_sentencepiece_tokenizer(self) -> None:
         if not self.test_sentencepiece or not self.test_slow_tokenizer:
@@ -2317,46 +2392,6 @@ class TokenizerTesterMixin:
             # add pad_token_id to pass subsequent tests
             tokenizer.add_special_tokens({"pad_token": "<PAD>"})

-    def check_subword_sampling(
-        self,
-        tokenizer: PreTrainedTokenizer,
-        text: str = None,
-    ) -> None:
-        """
-        Check if the tokenizer generates different results when subword regularization is enabled.
-
-        Subword regularization augments training data with subword sampling. This has a random component.
-
-        Args:
-            tokenizer: The tokenizer to check.
-            text: The text to use for the checks.
-        """
-        text = "This is a test for subword regularization." if text is None else text
-        if self.test_sentencepiece_ignore_case:
-            text = text.lower()
-
-        tokens_list = []
-        for _ in range(5):
-            tokens_list.append(tokenizer.tokenize(text))
-
-        # the list of different pairs of tokens_list
-        combinations = itertools.combinations(tokens_list, 2)
-
-        # check if sampling is done
-        subword_sampling_found = False
-        for combination in combinations:
-            if combination[0] != combination[1]:
-                subword_sampling_found = True
-        self.assertTrue(subword_sampling_found)
-
-        # check if converting back to original text works
-        for tokens in tokens_list:
-            if self.test_sentencepiece_ignore_case:
-                self.assertEqual(text, tokenizer.convert_tokens_to_string(tokens).lower())
-            else:
-                self.assertEqual(text, tokenizer.convert_tokens_to_string(tokens))
-
     @require_torch
     @slow
     def test_torch_encode_plus_sent_to_model(self):
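The relocated `check_subword_sampling` helper exercises SentencePiece subword regularization: with `enable_sampling=True`, repeated calls to `tokenizer.tokenize` on the same text are expected to produce different segmentations while still round-tripping through `convert_tokens_to_string`. A hedged usage sketch follows; the checkpoint name and the availability of `sp_model_kwargs` on the chosen slow tokenizer are assumptions, not part of this diff.

```python
# Assumed example: a SentencePiece-backed slow tokenizer; the checkpoint name
# is illustrative and requires network access plus the sentencepiece package.
from transformers import XLNetTokenizer

sp_model_kwargs = {"enable_sampling": True, "alpha": 0.1, "nbest_size": -1}
tokenizer = XLNetTokenizer.from_pretrained("xlnet-base-cased", sp_model_kwargs=sp_model_kwargs)

text = "this is a test for subword regularization."
segmentations = {tuple(tokenizer.tokenize(text)) for _ in range(5)}

# With sampling enabled, more than one segmentation is expected ...
print(len(segmentations) > 1)
# ... and every segmentation should still decode back to the input text.
print(all(tokenizer.convert_tokens_to_string(list(t)).lower() == text for t in segmentations))
```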