Unverified commit a96197f5, authored by Nick Hill and committed by GitHub
Browse files

[Perf] Simplify DeepseekV32 tokenizer, ensure fast detokenization used (#33855)


Signed-off-by: Nick Hill <nickhill123@gmail.com>
parent ab10d798
...@@ -11,6 +11,7 @@ from transformers import ( ...@@ -11,6 +11,7 @@ from transformers import (
from vllm.tokenizers import TokenizerLike, get_tokenizer from vllm.tokenizers import TokenizerLike, get_tokenizer
from vllm.tokenizers.grok2 import Grok2Tokenizer from vllm.tokenizers.grok2 import Grok2Tokenizer
from vllm.tokenizers.hf import HfTokenizer
from vllm.tokenizers.mistral import MistralTokenizer from vllm.tokenizers.mistral import MistralTokenizer
...@@ -42,6 +43,13 @@ def test_tokenizer_like_protocol(): ...@@ -42,6 +43,13 @@ def test_tokenizer_like_protocol():
assert isinstance(tokenizer, Grok2Tokenizer) assert isinstance(tokenizer, Grok2Tokenizer)
_assert_tokenizer_like(tokenizer) _assert_tokenizer_like(tokenizer)
tokenizer = get_tokenizer("deepseek-ai/DeepSeek-V3", tokenizer_mode="deepseek_v32")
assert isinstance(tokenizer, HfTokenizer)
# Verify it's a fast tokenizer (required for FastIncrementalDetokenizer)
assert isinstance(tokenizer, PreTrainedTokenizerFast)
assert "DSV32" in tokenizer.__class__.__name__
_assert_tokenizer_like(tokenizer)
@pytest.mark.parametrize("tokenizer_name", ["facebook/opt-125m", "gpt2"]) @pytest.mark.parametrize("tokenizer_name", ["facebook/opt-125m", "gpt2"])
def test_tokenizer_revision(tokenizer_name: str): def test_tokenizer_revision(tokenizer_name: str):
......
...@@ -13,6 +13,7 @@ from vllm.logger import init_logger ...@@ -13,6 +13,7 @@ from vllm.logger import init_logger
from vllm.tokenizers import cached_get_tokenizer from vllm.tokenizers import cached_get_tokenizer
from vllm.tokenizers.deepseek_v32 import DeepseekV32Tokenizer from vllm.tokenizers.deepseek_v32 import DeepseekV32Tokenizer
from ..tokenizers.hf import HfTokenizer
from .inputs import DictPrompt from .inputs import DictPrompt
from .inputs.preprocess import parse_dec_only_prompt from .inputs.preprocess import parse_dec_only_prompt
from .params import ChatParams from .params import ChatParams
...@@ -48,10 +49,10 @@ class DeepseekV32Renderer(BaseRenderer): ...@@ -48,10 +49,10 @@ class DeepseekV32Renderer(BaseRenderer):
self._tokenizer = tokenizer self._tokenizer = tokenizer
@property @property
def tokenizer(self) -> DeepseekV32Tokenizer | None: def tokenizer(self) -> HfTokenizer | None:
return self._tokenizer return self._tokenizer
def get_tokenizer(self) -> DeepseekV32Tokenizer: def get_tokenizer(self) -> HfTokenizer:
tokenizer = self.tokenizer tokenizer = self.tokenizer
if tokenizer is None: if tokenizer is None:
raise ValueError("Tokenizer not available when `skip_tokenizer_init=True`") raise ValueError("Tokenizer not available when `skip_tokenizer_init=True`")
......
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import copy
from typing import Any
from pathlib import Path from transformers import AutoTokenizer
from typing import Any, overload
from transformers import BatchEncoding
from vllm.entrypoints.chat_utils import ChatCompletionMessageParam from vllm.entrypoints.chat_utils import ChatCompletionMessageParam
from . import TokenizerLike
from .deepseek_v32_encoding import encode_messages from .deepseek_v32_encoding import encode_messages
from .hf import CachedHfTokenizer from .hf import HfTokenizer, get_cached_tokenizer
from .protocol import TokenizerLike
class DeepseekV32Tokenizer(CachedHfTokenizer): def get_deepseek_v32_tokenizer(tokenizer: HfTokenizer) -> HfTokenizer:
@classmethod """
def from_pretrained( Wraps a tokenizer to use the custom DeepSeek V3.2 chat template encoding.
cls, """
path_or_repo_id: str | Path, dsv32_tokenizer = copy.copy(tokenizer)
*args,
trust_remote_code: bool = False, added_vocab = tokenizer.get_added_vocab()
revision: str | None = None, added_vocab_size = len(added_vocab)
download_dir: str | None = None, tokenizer_vocab_size = tokenizer.vocab_size
**kwargs,
) -> "TokenizerLike": class _DeepseekV32Tokenizer(tokenizer.__class__): # type: ignore
tokenizer = super().from_pretrained( def apply_chat_template(
path_or_repo_id, self,
*args, messages: list["ChatCompletionMessageParam"],
trust_remote_code=trust_remote_code, tools: list[dict[str, Any]] | None = None,
revision=revision,
download_dir=download_dir,
**kwargs, **kwargs,
) ) -> str | list[int]:
return DeepseekV32Tokenizer(tokenizer) thinking = kwargs.get("thinking", False)
enable_thinking = kwargs.get("enable_thinking", False)
def __init__(self, tokenizer: TokenizerLike) -> None: thinking = thinking or enable_thinking
super().__init__() thinking_mode = "thinking"
if not thinking:
self.tokenizer = tokenizer thinking_mode = "chat"
self.name_or_path = getattr(tokenizer, "name_or_path", "") conversation = kwargs.get("conversation", messages)
messages = conversation.copy()
self._added_vocab = self.tokenizer.get_added_vocab() if tools is not None and len(tools) > 0:
self._added_vocab_size = len(self._added_vocab) messages.insert(0, {"role": "system"})
messages[0]["tools"] = tools # type: ignore[typeddict-unknown-key]
def apply_chat_template(
self, # Historical reasoning content is dropped when a new user message
messages: list["ChatCompletionMessageParam"], # is introduced
tools: list[dict[str, Any]] | None = None, drop_thinking = messages[-1]["role"] == "user"
**kwargs,
) -> str | list[int]: encode_config = dict(
thinking = kwargs.get("thinking", False) thinking_mode=thinking_mode, drop_thinking=drop_thinking
enable_thinking = kwargs.get("enable_thinking", False)
thinking = thinking or enable_thinking
thinking_mode = "thinking"
if not thinking:
thinking_mode = "chat"
conversation = kwargs.get("conversation", messages)
messages = conversation.copy()
if tools is not None and len(tools) > 0:
messages.insert(0, {"role": "system"})
messages[0]["tools"] = tools # type: ignore[typeddict-unknown-key]
# Historical reasoning content is dropped when a new user message is introduced
drop_thinking = messages[-1]["role"] == "user"
encode_config = dict(thinking_mode=thinking_mode, drop_thinking=drop_thinking)
prompt_str = encode_messages(messages, **encode_config) # type: ignore
if kwargs.get("tokenize", True):
tokenizer_kwargs = {
k: kwargs[k] for k in ("truncation", "max_length") if k in kwargs
}
return self.encode(
prompt_str,
add_special_tokens=False,
**tokenizer_kwargs,
) )
return prompt_str prompt_str = encode_messages(messages, **encode_config) # type: ignore
def num_special_tokens_to_add(self) -> int: if kwargs.get("tokenize", True):
return len(self.encode("")) tokenizer_kwargs = {
k: kwargs[k] for k in ("truncation", "max_length") if k in kwargs
@property }
def all_special_tokens(self) -> list[str]: return self.encode(
return self.tokenizer.all_special_tokens prompt_str,
add_special_tokens=False,
@property **tokenizer_kwargs,
def all_special_ids(self) -> list[int]: )
return self.tokenizer.all_special_ids
return prompt_str
@property
def bos_token_id(self) -> int: def num_special_tokens_to_add(self) -> int:
return self.tokenizer.bos_token_id return len(self.encode(""))
@property def __len__(self) -> int:
def eos_token_id(self) -> int: # </think> is an added token in DeepseekV32 tokenizer
return self.tokenizer.eos_token_id return tokenizer_vocab_size + added_vocab_size
@property def get_added_vocab(self) -> dict[str, int]:
def pad_token_id(self) -> int: return added_vocab.copy()
return self.tokenizer.pad_token_id
def __reduce__(self):
@property return get_deepseek_v32_tokenizer, (tokenizer,)
def is_fast(self) -> bool:
return self.tokenizer.is_fast _DeepseekV32Tokenizer.__name__ = f"DSV32{tokenizer.__class__.__name__}"
@property dsv32_tokenizer.__class__ = _DeepseekV32Tokenizer
def vocab_size(self) -> int: return dsv32_tokenizer
return self.tokenizer.vocab_size
@property class DeepseekV32Tokenizer(TokenizerLike):
def max_token_id(self) -> int: @classmethod
return self.tokenizer.max_token_id def from_pretrained(cls, *args, **kwargs) -> HfTokenizer:
tokenizer = AutoTokenizer.from_pretrained(*args, **kwargs)
@property return get_cached_tokenizer(get_deepseek_v32_tokenizer(tokenizer))
def max_chars_per_token(self) -> int:
return self.tokenizer.max_chars_per_token
@property
def truncation_side(self) -> str:
return self.tokenizer.truncation_side
def __hash__(self) -> int:
return hash(id(self))
def __len__(self) -> int:
# </think> is an added token in DeepseekV32 tokenizer
return self.vocab_size + self._added_vocab_size
def __call__(
self,
text: str | list[str],
text_pair: str | None = None,
add_special_tokens: bool = True,
truncation: bool = False,
max_length: int | None = None,
) -> "BatchEncoding":
return self.tokenizer(
text,
text_pair=text_pair,
add_special_tokens=add_special_tokens,
truncation=truncation,
max_length=max_length,
)
def get_vocab(self) -> dict[str, int]:
return self.tokenizer.get_vocab()
def get_added_vocab(self) -> dict[str, int]:
return self._added_vocab.copy()
def encode(
self,
text: str,
truncation: bool | None = None,
max_length: int | None = None,
add_special_tokens: bool = True,
) -> list[int]:
return self.tokenizer.encode(
text,
truncation=truncation,
max_length=max_length,
add_special_tokens=add_special_tokens,
)
@overload
def convert_tokens_to_ids(self, tokens: str) -> int: ...
@overload
def convert_tokens_to_ids(self, tokens: list[str]) -> list[int]: ...
def convert_tokens_to_ids(self, tokens: str | list[str]) -> int | list[int]:
return self.tokenizer.convert_tokens_to_ids(tokens)
def convert_tokens_to_string(self, tokens: list[str]) -> str:
return self.tokenizer.convert_tokens_to_string(tokens)
def decode(self, ids: list[int] | int, skip_special_tokens: bool = False) -> str:
return self.tokenizer.decode(ids, skip_special_tokens=skip_special_tokens)
def convert_ids_to_tokens(
self,
ids: list[int],
skip_special_tokens: bool = False,
) -> list[str]:
return self.tokenizer.convert_ids_to_tokens(
ids, skip_special_tokens=skip_special_tokens
)
...@@ -10,7 +10,6 @@ import torch ...@@ -10,7 +10,6 @@ import torch
import vllm.envs import vllm.envs
from vllm.logger import init_logger from vllm.logger import init_logger
from vllm.sampling_params import SamplingParams from vllm.sampling_params import SamplingParams
from vllm.tokenizers.deepseek_v32 import DeepseekV32Tokenizer
from vllm.tokenizers.mistral import MistralTokenizer from vllm.tokenizers.mistral import MistralTokenizer
from vllm.utils.import_utils import LazyLoader from vllm.utils.import_utils import LazyLoader
from vllm.v1.structured_output.backend_types import ( from vllm.v1.structured_output.backend_types import (
...@@ -57,27 +56,6 @@ class XgrammarBackend(StructuredOutputBackend): ...@@ -57,27 +56,6 @@ class XgrammarBackend(StructuredOutputBackend):
stop_token_ids=stop_token_ids, stop_token_ids=stop_token_ids,
add_prefix_space=True, add_prefix_space=True,
) )
elif isinstance(self.tokenizer, DeepseekV32Tokenizer):
# copy from xgr.TokenizerInfo.from_huggingface()
# because we are using a custom tokenizer wrapper here.
vocab_dict = self.tokenizer.get_vocab()
tokenizer_vocab_size = max(len(vocab_dict), self.tokenizer.max_token_id + 1)
vocab_size = self.vocab_size or tokenizer_vocab_size
# maintain tokenizer's indexing
encoded_vocab = [""] * vocab_size
for token, idx in vocab_dict.items():
if idx < vocab_size:
encoded_vocab[idx] = token
stop_token_ids = [self.tokenizer.eos_token_id]
backend_str = self.tokenizer.tokenizer.backend_tokenizer.to_str() # type: ignore[attr-defined]
metadata = xgr.TokenizerInfo._detect_metadata_from_hf(backend_str)
tokenizer_info = xgr.TokenizerInfo(
encoded_vocab=encoded_vocab,
vocab_type=metadata["vocab_type"],
vocab_size=vocab_size,
stop_token_ids=stop_token_ids,
add_prefix_space=metadata["add_prefix_space"],
)
else: else:
tokenizer_info = xgr.TokenizerInfo.from_huggingface( tokenizer_info = xgr.TokenizerInfo.from_huggingface(
self.tokenizer, self.tokenizer,
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment