Unverified Commit cc360649 authored by Patrick von Platen's avatar Patrick von Platen Committed by GitHub
Browse files

up (#13988)

parent 5b6bd4e7
...@@ -237,7 +237,7 @@ class ByT5Tokenizer(PreTrainedTokenizer): ...@@ -237,7 +237,7 @@ class ByT5Tokenizer(PreTrainedTokenizer):
else: else:
tok_string = bytes([ord(token)]) tok_string = bytes([ord(token)])
bstring += tok_string bstring += tok_string
string = bstring.decode("utf-8") string = bstring.decode("utf-8", errors="ignore")
return string return string
# ByT5Tokenizer has no vocab file # ByT5Tokenizer has no vocab file
......
...@@ -290,6 +290,22 @@ class ByT5TokenizationTest(TokenizerTesterMixin, unittest.TestCase): ...@@ -290,6 +290,22 @@ class ByT5TokenizationTest(TokenizerTesterMixin, unittest.TestCase):
), ),
) )
def test_decode_single_bytes(self):
    """Decoding a lone invalid-UTF-8 byte (255) must yield an empty string.

    Exercises both the slow and the rust tokenizer (when enabled), each
    round-tripped through save_pretrained/from_pretrained so the reloaded
    instance is the one under test.
    """
    tokenizers_to_check = []
    if self.test_slow_tokenizer:
        tokenizers_to_check.append((self.tokenizer_class, self.get_tokenizer()))
    if self.test_rust_tokenizer:
        tokenizers_to_check.append((self.rust_tokenizer_class, self.get_rust_tokenizer()))

    for cls, tok in tokenizers_to_check:
        with tempfile.TemporaryDirectory() as save_dir:
            tok.save_pretrained(save_dir)
            reloaded = cls.from_pretrained(save_dir)
            # Byte 0xFF is never valid UTF-8; decode(..., errors="ignore")
            # should silently drop it rather than raise.
            self.assertTrue(reloaded.decode([255]) == "")
# tokenizer can be instantiated without any pretrained files, so no need for pretrained tokenizer list # tokenizer can be instantiated without any pretrained files, so no need for pretrained tokenizer list
def test_pretrained_model_lists(self): def test_pretrained_model_lists(self):
pass pass
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment