Unverified Commit 15d19ecf authored by Philip May, committed by GitHub
Browse files

fix convert_tokens_to_string calls (#11716)

parent c3d9ac76
...@@ -238,8 +238,7 @@ class AlbertTokenizer(PreTrainedTokenizer): ...@@ -238,8 +238,7 @@ class AlbertTokenizer(PreTrainedTokenizer):
return self.sp_model.IdToPiece(index) return self.sp_model.IdToPiece(index)
def convert_tokens_to_string(self, tokens): def convert_tokens_to_string(self, tokens):
out_string = "".join(tokens).replace(SPIECE_UNDERLINE, " ").strip() return self.sp_model.decode(tokens)
return out_string
def build_inputs_with_special_tokens( def build_inputs_with_special_tokens(
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
......
...@@ -271,8 +271,7 @@ class BarthezTokenizer(PreTrainedTokenizer): ...@@ -271,8 +271,7 @@ class BarthezTokenizer(PreTrainedTokenizer):
def convert_tokens_to_string(self, tokens): def convert_tokens_to_string(self, tokens):
"""Converts a sequence of tokens (strings for sub-words) in a single string.""" """Converts a sequence of tokens (strings for sub-words) in a single string."""
out_string = "".join(tokens).replace(SPIECE_UNDERLINE, " ").strip() return self.sp_model.decode(tokens)
return out_string
def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]: def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
if not os.path.isdir(save_directory): if not os.path.isdir(save_directory):
......
...@@ -271,8 +271,7 @@ class CamembertTokenizer(PreTrainedTokenizer): ...@@ -271,8 +271,7 @@ class CamembertTokenizer(PreTrainedTokenizer):
def convert_tokens_to_string(self, tokens): def convert_tokens_to_string(self, tokens):
"""Converts a sequence of tokens (strings for sub-words) in a single string.""" """Converts a sequence of tokens (strings for sub-words) in a single string."""
out_string = "".join(tokens).replace(SPIECE_UNDERLINE, " ").strip() return self.sp_model.decode(tokens)
return out_string
def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]: def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
if not os.path.isdir(save_directory): if not os.path.isdir(save_directory):
......
...@@ -202,8 +202,7 @@ class M2M100Tokenizer(PreTrainedTokenizer): ...@@ -202,8 +202,7 @@ class M2M100Tokenizer(PreTrainedTokenizer):
def convert_tokens_to_string(self, tokens: List[str]) -> str: def convert_tokens_to_string(self, tokens: List[str]) -> str:
"""Converts a sequence of tokens (strings for sub-words) in a single string.""" """Converts a sequence of tokens (strings for sub-words) in a single string."""
out_string = "".join(tokens).replace(SPIECE_UNDERLINE, " ").strip() return self.sp_model.decode(tokens)
return out_string
def get_special_tokens_mask( def get_special_tokens_mask(
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False
......
...@@ -228,8 +228,7 @@ class MBart50Tokenizer(PreTrainedTokenizer): ...@@ -228,8 +228,7 @@ class MBart50Tokenizer(PreTrainedTokenizer):
def convert_tokens_to_string(self, tokens: List[str]) -> str: def convert_tokens_to_string(self, tokens: List[str]) -> str:
"""Converts a sequence of tokens (strings for sub-words) in a single string.""" """Converts a sequence of tokens (strings for sub-words) in a single string."""
out_string = "".join(tokens).replace(SPIECE_UNDERLINE, " ").strip() return self.sp_model.decode(tokens)
return out_string
def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]: def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
if not os.path.isdir(save_directory): if not os.path.isdir(save_directory):
......
...@@ -185,7 +185,7 @@ class Speech2TextTokenizer(PreTrainedTokenizer): ...@@ -185,7 +185,7 @@ class Speech2TextTokenizer(PreTrainedTokenizer):
def convert_tokens_to_string(self, tokens: List[str]) -> str: def convert_tokens_to_string(self, tokens: List[str]) -> str:
"""Converts a sequence of tokens (strings for sub-words) in a single string.""" """Converts a sequence of tokens (strings for sub-words) in a single string."""
out_string = "".join(tokens).replace(SPIECE_UNDERLINE, " ").strip() out_string = self.sp_model.decode(tokens)
if self.do_upper_case: if self.do_upper_case:
out_string = out_string.upper() out_string = out_string.upper()
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment