Tokenizer.batch_decode convenience method (#4159)

9535bf19 · Sam Shleifer · GitHub · 7822cd38 · 9535bf19 · 9535bf19
Unverified commit 9535bf19, authored May 14, 2020 by Sam Shleifer; committed by GitHub May 14, 2020.
Show whitespace changes
Inline Side-by-side

Showing with 3 additions and 3 deletions

src/transformers/tokenization_marian.py +0 -3

src/transformers/tokenization_utils.py +3 -0

No files found.
--- a/src/transformers/tokenization_marian.py
+++ b/src/transformers/tokenization_marian.py
@@ -124,9 +124,6 @@ class MarianTokenizer(PreTrainedTokenizer):
        # We don't expect to process pairs, but leave the pair logic for API consistency
        return token_ids_0 + token_ids_1 + [self.eos_token_id]

-    def batch_decode(self, token_ids, **kwargs) -> List[str]:
-        return [self.decode(ids, **kwargs) for ids in token_ids]
-
    def prepare_translation_batch(
        self,
        src_texts: List[str],

--- a/src/transformers/tokenization_utils.py
+++ b/src/transformers/tokenization_utils.py
@@ -2183,6 +2183,9 @@ class PreTrainedTokenizer(SpecialTokensMixin):
        else:
            return text

+    def batch_decode(self, sequences: List[List[int]], **kwargs) -> List[str]:
+        return [self.decode(seq, **kwargs) for seq in sequences]
+
    @staticmethod
    def clean_up_tokenization(out_string: str) -> str:
        """ Clean up a list of simple English tokenization artifacts like spaces before punctuations and abreviated forms.