Unverified Commit aa4198a2 authored by Arthur's avatar Arthur Committed by GitHub
Browse files

[`T5Tokenizer`] Fix fast and extra tokens (#27085)

* v4.35.dev.0

* nit t5fast match t5 slow
parent 6f316016
......@@ -118,17 +118,19 @@ class T5TokenizerFast(PreTrainedTokenizerFast):
**kwargs,
):
# Add extra_ids to the special token list
if extra_ids > 0 and additional_special_tokens is None:
additional_special_tokens = [f"<extra_id_{i}>" for i in range(extra_ids)]
elif extra_ids > 0 and additional_special_tokens is not None:
# Check that we have the right number of extra special tokens
extra_tokens = len(set(filter(lambda x: bool("extra_id_" in str(x)), additional_special_tokens)))
if extra_tokens != extra_ids:
if additional_special_tokens is not None:
extra_tokens = [x for x in additional_special_tokens if "<extra_id_" in str(x)]
if len(extra_tokens) < 1:
additional_special_tokens += [f"<extra_id_{i}>" for i in range(extra_ids)]
elif extra_ids > 0 and extra_ids != len(extra_tokens):
raise ValueError(
f"Both extra_ids ({extra_ids}) and additional_special_tokens ({additional_special_tokens}) are"
" provided to T5Tokenizer. In this case the additional_special_tokens must include the extra_ids"
" tokens"
)
else:
extra_tokens = [f"<extra_id_{i}>" for i in range(extra_ids)]
additional_special_tokens = extra_tokens
super().__init__(
vocab_file,
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment