Unverified commit fecae12c, authored by Harry Mellor and committed by GitHub
Browse files

Remove `all_special_tokens_extended` from tokenizer code (#29686)


Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
parent 8d9338fa
......@@ -31,7 +31,6 @@ def _check_consistency(target: AnyTokenizer, expected: AnyTokenizer):
# Cached attributes
assert target.all_special_ids == expected.all_special_ids
assert target.all_special_tokens == expected.all_special_tokens
assert target.all_special_tokens_extended == expected.all_special_tokens_extended
assert target.get_vocab() == expected.get_vocab()
assert len(target) == len(expected)
......
......@@ -258,52 +258,46 @@ def mistral_tokenizer(request) -> MistralTokenizer:
)
class TestMistralTokenizer:
def test_all_special_tokens(self, mistral_tokenizer: MistralTokenizer):
attributes = [
mistral_tokenizer.all_special_tokens,
mistral_tokenizer.all_special_tokens_extended,
]
for attribute in attributes:
if mistral_tokenizer.is_tekken:
assert attribute == [
"<unk>",
"<s>",
"</s>",
"[INST]",
"[/INST]",
"[AVAILABLE_TOOLS]",
"[/AVAILABLE_TOOLS]",
"[TOOL_RESULTS]",
"[/TOOL_RESULTS]",
"[TOOL_CALLS]",
"[IMG]",
"<pad>",
"[IMG_BREAK]",
"[IMG_END]",
"[PREFIX]",
"[MIDDLE]",
"[SUFFIX]",
"[SYSTEM_PROMPT]",
"[/SYSTEM_PROMPT]",
"[TOOL_CONTENT]",
] + [f"<SPECIAL_{i}>" for i in range(20, 32)] + [
"[ARGS]",
"[CALL_ID]",
"[THINK]",
"[/THINK]",
] + [f"<SPECIAL_{i}>" for i in range(36, 1000)]
else:
assert attribute == [
"<s>",
"</s>",
"[INST]",
"[/INST]",
"[TOOL_CALLS]",
"[AVAILABLE_TOOLS]",
"[/AVAILABLE_TOOLS]",
"[TOOL_RESULTS]",
"[/TOOL_RESULTS]",
] + [f"[control_{i}]" for i in range(8, 769)]
if mistral_tokenizer.is_tekken:
assert mistral_tokenizer.all_special_tokens == [
"<unk>",
"<s>",
"</s>",
"[INST]",
"[/INST]",
"[AVAILABLE_TOOLS]",
"[/AVAILABLE_TOOLS]",
"[TOOL_RESULTS]",
"[/TOOL_RESULTS]",
"[TOOL_CALLS]",
"[IMG]",
"<pad>",
"[IMG_BREAK]",
"[IMG_END]",
"[PREFIX]",
"[MIDDLE]",
"[SUFFIX]",
"[SYSTEM_PROMPT]",
"[/SYSTEM_PROMPT]",
"[TOOL_CONTENT]",
] + [f"<SPECIAL_{i}>" for i in range(20, 32)] + [
"[ARGS]",
"[CALL_ID]",
"[THINK]",
"[/THINK]",
] + [f"<SPECIAL_{i}>" for i in range(36, 1000)]
else:
assert mistral_tokenizer.all_special_tokens == [
"<s>",
"</s>",
"[INST]",
"[/INST]",
"[TOOL_CALLS]",
"[AVAILABLE_TOOLS]",
"[/AVAILABLE_TOOLS]",
"[TOOL_RESULTS]",
"[/TOOL_RESULTS]",
] + [f"[control_{i}]" for i in range(8, 769)]
def get_vocab(self, mistral_tokenizer: MistralTokenizer):
assert (
......
......@@ -15,10 +15,6 @@ class TestTokenizer(TokenizerBase):
def from_pretrained(cls, *args, **kwargs) -> "TestTokenizer":
return TestTokenizer()
@property
def all_special_tokens_extended(self) -> list[str]:
raise NotImplementedError()
@property
def all_special_tokens(self) -> list[str]:
raise NotImplementedError()
......
......@@ -96,7 +96,6 @@ def get_cached_tokenizer(tokenizer: AnyTokenizer) -> AnyTokenizer:
tokenizer_all_special_ids = tokenizer.all_special_ids
tokenizer_all_special_tokens = tokenizer.all_special_tokens
tokenizer_all_special_tokens_extended = tokenizer.all_special_tokens_extended
tokenizer_vocab = tokenizer.get_vocab()
tokenizer_len = len(tokenizer)
......@@ -118,10 +117,6 @@ def get_cached_tokenizer(tokenizer: AnyTokenizer) -> AnyTokenizer:
def all_special_tokens(self) -> list[str]:
return tokenizer_all_special_tokens
@property
def all_special_tokens_extended(self) -> list[str]:
return tokenizer_all_special_tokens_extended
@property
def max_token_id(self) -> int:
return max_token_id
......
......@@ -10,11 +10,6 @@ if TYPE_CHECKING:
class TokenizerBase(ABC):
@property
@abstractmethod
def all_special_tokens_extended(self) -> list[str]:
raise NotImplementedError()
@property
@abstractmethod
def all_special_tokens(self) -> list[str]:
......
......@@ -254,10 +254,6 @@ class MistralTokenizer(TokenizerBase):
# the following attributes are set to fit vLLM's design and are used
# by the structured output backends.
@property
def all_special_tokens_extended(self) -> list[str]:
return self.all_special_tokens
@property
def all_special_tokens(self) -> list[str]:
return self._special_tokens
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment