"vscode:/vscode.git/clone" did not exist on "06a6cb6f360e88866afdac5c0c4e295ab7da2c9b"
Commit aa4198a2 (unverified), authored by Arthur, committed by GitHub
Browse files

[`T5Tokenizer`] Fix fast and extra tokens (#27085)

* v4.35.dev.0

* nit t5fast match t5 slow
parent 6f316016
...@@ -118,17 +118,19 @@ class T5TokenizerFast(PreTrainedTokenizerFast): ...@@ -118,17 +118,19 @@ class T5TokenizerFast(PreTrainedTokenizerFast):
**kwargs, **kwargs,
): ):
# Add extra_ids to the special token list # Add extra_ids to the special token list
if extra_ids > 0 and additional_special_tokens is None: if additional_special_tokens is not None:
additional_special_tokens = [f"<extra_id_{i}>" for i in range(extra_ids)] extra_tokens = [x for x in additional_special_tokens if "<extra_id_" in str(x)]
elif extra_ids > 0 and additional_special_tokens is not None: if len(extra_tokens) < 1:
# Check that we have the right number of extra special tokens additional_special_tokens += [f"<extra_id_{i}>" for i in range(extra_ids)]
extra_tokens = len(set(filter(lambda x: bool("extra_id_" in str(x)), additional_special_tokens))) elif extra_ids > 0 and extra_ids != len(extra_tokens):
if extra_tokens != extra_ids:
raise ValueError( raise ValueError(
f"Both extra_ids ({extra_ids}) and additional_special_tokens ({additional_special_tokens}) are" f"Both extra_ids ({extra_ids}) and additional_special_tokens ({additional_special_tokens}) are"
" provided to T5Tokenizer. In this case the additional_special_tokens must include the extra_ids" " provided to T5Tokenizer. In this case the additional_special_tokens must include the extra_ids"
" tokens" " tokens"
) )
else:
extra_tokens = [f"<extra_id_{i}>" for i in range(extra_ids)]
additional_special_tokens = extra_tokens
super().__init__( super().__init__(
vocab_file, vocab_file,
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment