Unverified Commit fe23256b authored by Arthur, committed by GitHub

[`SpeechT5Tokenization`] Add copied from and fix the `convert_tokens_to_string` to match the fast decoding scheme (#28522)

* Add copied from and fix the `convert_tokens_to_string` to match the fast decoding scheme

* fixup

* add a small test

* style test file

* nits
parent 96d08831
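
The fix aligns the slow `SpeechT5Tokenizer` decoding with the behavior of the fast tokenizers: spaces around special tokens now survive a decode round trip. A minimal usage sketch of the intended behavior (it assumes the `microsoft/speecht5_tts` checkpoint; `=` is out of vocabulary and becomes `<unk>`, mirroring the test added below):

    from transformers import SpeechT5Tokenizer

    tokenizer = SpeechT5Tokenizer.from_pretrained("microsoft/speecht5_tts")
    ids = tokenizer.encode("a = b")
    # with this fix, the spaces around the unknown token are preserved
    print(tokenizer.decode(ids))  # expected: "a <unk> b</s>"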
@@ -251,6 +251,7 @@ class BarthezTokenizer(PreTrainedTokenizer):
         """Converts an index (integer) in a token (str) using the vocab."""
         return self.sp_model.IdToPiece(index)
 
+    # Copied from transformers.models.albert.tokenization_albert.AlbertTokenizer.convert_tokens_to_string
     def convert_tokens_to_string(self, tokens):
         """Converts a sequence of tokens (string) in a single string."""
         current_sub_tokens = []
...
@@ -181,6 +181,7 @@ class BigBirdTokenizer(PreTrainedTokenizer):
         token = self.sp_model.IdToPiece(index)
         return token
 
+    # Copied from transformers.models.albert.tokenization_albert.AlbertTokenizer.convert_tokens_to_string
     def convert_tokens_to_string(self, tokens):
         """Converts a sequence of tokens (string) in a single string."""
         current_sub_tokens = []
...
@@ -210,6 +210,7 @@ class FNetTokenizer(PreTrainedTokenizer):
         """Converts an index (integer) in a token (str) using the vocab."""
         return self.sp_model.IdToPiece(index)
 
+    # Copied from transformers.models.albert.tokenization_albert.AlbertTokenizer.convert_tokens_to_string
     def convert_tokens_to_string(self, tokens):
         """Converts a sequence of tokens (string) in a single string."""
         current_sub_tokens = []
...
@@ -230,6 +230,7 @@ class MBart50Tokenizer(PreTrainedTokenizer):
             return self.fairseq_ids_to_tokens[index]
         return self.sp_model.IdToPiece(index - self.fairseq_offset)
 
+    # Copied from transformers.models.albert.tokenization_albert.AlbertTokenizer.convert_tokens_to_string
     def convert_tokens_to_string(self, tokens):
         """Converts a sequence of tokens (string) in a single string."""
         current_sub_tokens = []
...
@@ -177,17 +177,23 @@ class SpeechT5Tokenizer(PreTrainedTokenizer):
         token = self.sp_model.IdToPiece(index)
         return token
 
+    # Copied from transformers.models.albert.tokenization_albert.AlbertTokenizer.convert_tokens_to_string
     def convert_tokens_to_string(self, tokens):
         """Converts a sequence of tokens (string) in a single string."""
         current_sub_tokens = []
         out_string = ""
+        prev_is_special = False
         for token in tokens:
             # make sure that special tokens are not decoded using sentencepiece model
             if token in self.all_special_tokens:
+                if not prev_is_special:
+                    out_string += " "
                 out_string += self.sp_model.decode(current_sub_tokens) + token
+                prev_is_special = True
                 current_sub_tokens = []
             else:
                 current_sub_tokens.append(token)
+                prev_is_special = False
         out_string += self.sp_model.decode(current_sub_tokens)
         return out_string.strip()
 
...
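
The SpeechT5 change above ports Albert's decoding loop. SentencePiece's `decode` drops the leading space of each chunk it detokenizes, so the loop re-inserts one space before the chunk that precedes a special token, and the `prev_is_special` flag keeps it from adding spaces between consecutive special tokens. A self-contained sketch of the same loop, with a toy stand-in for `sp_model.decode` (the `▁`-to-space mapping and the `lstrip` are assumptions that mimic SentencePiece detokenization, not the real API):

    SPECIAL_TOKENS = {"<unk>", "</s>"}

    def toy_sp_decode(pieces):
        # toy stand-in for sentencepiece: join the pieces, turn "▁" into a
        # space, and drop the leading space, as detokenization does
        return "".join(pieces).replace("▁", " ").lstrip()

    def convert_tokens_to_string(tokens):
        current_sub_tokens, out_string = [], ""
        prev_is_special = False
        for token in tokens:
            if token in SPECIAL_TOKENS:
                # re-insert the space lost to the decoder's leading strip,
                # but never between two consecutive special tokens
                if not prev_is_special:
                    out_string += " "
                out_string += toy_sp_decode(current_sub_tokens) + token
                prev_is_special = True
                current_sub_tokens = []
            else:
                current_sub_tokens.append(token)
                prev_is_special = False
        out_string += toy_sp_decode(current_sub_tokens)
        return out_string.strip()

    # "a = b" once "=" has been mapped to <unk>:
    print(convert_tokens_to_string(["▁", "a", "▁", "<unk>", "▁", "b", "</s>"]))
    # -> "a <unk> b</s>"  (spaces around <unk> preserved)

Without the flag, decoding ["<unk>", "</s>"] would yield "<unk> </s>" instead of "<unk></s>".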
@@ -202,3 +202,17 @@ class SpeechT5TokenizerTest(TokenizerTesterMixin, unittest.TestCase):
             revision="c5ef64c71905caeccde0e4462ef3f9077224c524",
             sequences=sequences,
         )
+
+    def test_encode_decode(self):
+        tokenizer = SpeechT5Tokenizer.from_pretrained("microsoft/speecht5_tts")
+
+        tokens = tokenizer.tokenize("a = b")
+        self.assertEqual(tokens, ["▁", "a", "▁", "=", "▁", "b"])
+
+        # the `'='` is unknown.
+        ids = tokenizer.convert_tokens_to_ids(tokens)
+        self.assertEqual(ids, [4, 7, 4, 3, 4, 25])
+
+        # let's make sure decoding with the special unknown tokens preserves spaces
+        ids = tokenizer.encode("a = b")
+        self.assertEqual(tokenizer.decode(ids), "a <unk> b</s>")