Unverified Commit cbad90d8 authored by Lysandre Debut's avatar Lysandre Debut Committed by GitHub
Browse files

Fix + Test (#8049)

parent 664c7ec4
......@@ -166,6 +166,9 @@ class BlenderbotSmallTokenizer(PreTrainedTokenizer):
tokens = token.split(" ")
words = []
for token in tokens:
if not len(token):
continue
token = token.lower()
word = tuple(token)
word = tuple(list(word[:-1]) + [word[-1] + "</w>"])
......
......@@ -75,6 +75,15 @@ class BlenderbotSmallTokenizerTest(TokenizerTesterMixin, unittest.TestCase):
assert src_text != decoded # I wish it did!
assert decoded == "i am a small frog ."
def test_empty_word_small_tok(self):
    """Check that a trailing " ." is encoded like a stand-alone ".".

    Encodes a full sentence ending in " ." and the single string "." with
    the small Blenderbot tokenizer, then compares the last id of the
    sentence against the first id of the lone dot.
    NOTE(review): judging by the test name, this presumably guards the
    tokenizer's handling of empty words -- confirm against the tokenizer.
    """
    tokenizer = BlenderbotSmallTokenizer.from_pretrained("facebook/blenderbot-90M")
    # Encode in the same order as before: the sentence first, then the dot.
    sentence_ids, dot_ids = (
        tokenizer(text)["input_ids"] for text in ("I am a small frog .", ".")
    )
    assert sentence_ids[-1] == dot_ids[0]
class Blenderbot3BTokenizerTests(unittest.TestCase):
@cached_property
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment