[`T5Tokenizer`] Fix fast and extra tokens (#27085)

* v4.35.dev.0
* nit t5fast match t5 slow

[`T5Tokenizer`] Fix fast and extra tokens (#27085)
* v4.35.dev.0
* nit t5fast match t5 slow
aa4198a2 · Arthur · GitHub · 6f316016 · aa4198a2
Unverified Commit aa4198a2 authored Oct 27, 2023 by Arthur Committed by GitHub Oct 27, 2023
Hide whitespace changes
Inline Side-by-side

Showing with 8 additions and 6 deletions

src/transformers/models/t5/tokenization_t5_fast.py src/transformers/models/t5/tokenization_t5_fast.py +8 -6

No files found.
--- a/src/transformers/models/t5/tokenization_t5_fast.py
+++ b/src/transformers/models/t5/tokenization_t5_fast.py
@@ -118,17 +118,19 @@ class T5TokenizerFast(PreTrainedTokenizerFast):
        **kwargs,
    ):
        # Add extra_ids to the special token list
-        if extra_ids > 0 and additional_special_tokens is None:
-            additional_special_tokens = [f"<extra_id_{i}>" for i in range(extra_ids)]
-        elif extra_ids > 0 and additional_special_tokens is not None:
-            # Check that we have the right number of extra special tokens
-            extra_tokens = len(set(filter(lambda x: bool("extra_id_" in str(x)), additional_special_tokens)))
-            if extra_tokens != extra_ids:
+        if additional_special_tokens is not None:
+            extra_tokens = [x for x in additional_special_tokens if "<extra_id_" in str(x)]
+            if len(extra_tokens) < 1:
+                additional_special_tokens += [f"<extra_id_{i}>" for i in range(extra_ids)]
+            elif extra_ids > 0 and extra_ids != len(extra_tokens):
                raise ValueError(
                    f"Both extra_ids ({extra_ids}) and additional_special_tokens ({additional_special_tokens}) are"
                    " provided to T5Tokenizer. In this case the additional_special_tokens must include the extra_ids"
                    " tokens"
                )
+        else:
+            extra_tokens = [f"<extra_id_{i}>" for i in range(extra_ids)]
+            additional_special_tokens = extra_tokens

        super().__init__(
            vocab_file,