Unverified commit a96197f5, authored by Nick Hill and committed by GitHub
Browse files

[Perf] Simplify DeepseekV32 tokenizer, ensure fast detokenization used (#33855)


Signed-off-by: Nick Hill <nickhill123@gmail.com>
parent ab10d798
......@@ -11,6 +11,7 @@ from transformers import (
from vllm.tokenizers import TokenizerLike, get_tokenizer
from vllm.tokenizers.grok2 import Grok2Tokenizer
from vllm.tokenizers.hf import HfTokenizer
from vllm.tokenizers.mistral import MistralTokenizer
......@@ -42,6 +43,13 @@ def test_tokenizer_like_protocol():
assert isinstance(tokenizer, Grok2Tokenizer)
_assert_tokenizer_like(tokenizer)
tokenizer = get_tokenizer("deepseek-ai/DeepSeek-V3", tokenizer_mode="deepseek_v32")
assert isinstance(tokenizer, HfTokenizer)
# Verify it's a fast tokenizer (required for FastIncrementalDetokenizer)
assert isinstance(tokenizer, PreTrainedTokenizerFast)
assert "DSV32" in tokenizer.__class__.__name__
_assert_tokenizer_like(tokenizer)
@pytest.mark.parametrize("tokenizer_name", ["facebook/opt-125m", "gpt2"])
def test_tokenizer_revision(tokenizer_name: str):
......
......@@ -13,6 +13,7 @@ from vllm.logger import init_logger
from vllm.tokenizers import cached_get_tokenizer
from vllm.tokenizers.deepseek_v32 import DeepseekV32Tokenizer
from ..tokenizers.hf import HfTokenizer
from .inputs import DictPrompt
from .inputs.preprocess import parse_dec_only_prompt
from .params import ChatParams
......@@ -48,10 +49,10 @@ class DeepseekV32Renderer(BaseRenderer):
self._tokenizer = tokenizer
@property
def tokenizer(self) -> DeepseekV32Tokenizer | None:
def tokenizer(self) -> HfTokenizer | None:
return self._tokenizer
def get_tokenizer(self) -> DeepseekV32Tokenizer:
def get_tokenizer(self) -> HfTokenizer:
tokenizer = self.tokenizer
if tokenizer is None:
raise ValueError("Tokenizer not available when `skip_tokenizer_init=True`")
......
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import copy
from typing import Any
from pathlib import Path
from typing import Any, overload
from transformers import BatchEncoding
from transformers import AutoTokenizer
from vllm.entrypoints.chat_utils import ChatCompletionMessageParam
from . import TokenizerLike
from .deepseek_v32_encoding import encode_messages
from .hf import CachedHfTokenizer
from .protocol import TokenizerLike
class DeepseekV32Tokenizer(CachedHfTokenizer):
@classmethod
def from_pretrained(
    cls,
    path_or_repo_id: str | Path,
    *args,
    trust_remote_code: bool = False,
    revision: str | None = None,
    download_dir: str | None = None,
    **kwargs,
) -> "TokenizerLike":
    """Load the underlying tokenizer and wrap it in this class.

    Args:
        path_or_repo_id: Local path or repository id handed to the parent
            class loader.
        *args: Extra positional arguments forwarded unchanged to the
            parent class loader.
        trust_remote_code: Forwarded unchanged to the parent class loader.
        revision: Forwarded unchanged to the parent class loader.
        download_dir: Forwarded unchanged to the parent class loader.
        **kwargs: Extra keyword arguments forwarded unchanged to the
            parent class loader.

    Returns:
        An instance of ``cls`` wrapping the tokenizer returned by the
        parent class loader.
    """
    tokenizer = super().from_pretrained(
        path_or_repo_id,
        *args,
        trust_remote_code=trust_remote_code,
        revision=revision,
        download_dir=download_dir,
        **kwargs,
    )
    # Construct through ``cls`` instead of a hard-coded class name so that
    # a subclass calling this alternate constructor gets its own type back.
    return cls(tokenizer)
from .hf import HfTokenizer, get_cached_tokenizer
def __init__(self, tokenizer: TokenizerLike) -> None:
super().__init__()
self.tokenizer = tokenizer
self.name_or_path = getattr(tokenizer, "name_or_path", "")
def get_deepseek_v32_tokenizer(tokenizer: HfTokenizer) -> HfTokenizer:
"""
Wraps a tokenizer to use the custom DeepSeek V3.2 chat template encoding.
"""
dsv32_tokenizer = copy.copy(tokenizer)
self._added_vocab = self.tokenizer.get_added_vocab()
self._added_vocab_size = len(self._added_vocab)
added_vocab = tokenizer.get_added_vocab()
added_vocab_size = len(added_vocab)
tokenizer_vocab_size = tokenizer.vocab_size
class _DeepseekV32Tokenizer(tokenizer.__class__): # type: ignore
def apply_chat_template(
self,
messages: list["ChatCompletionMessageParam"],
......@@ -61,10 +41,13 @@ class DeepseekV32Tokenizer(CachedHfTokenizer):
messages.insert(0, {"role": "system"})
messages[0]["tools"] = tools # type: ignore[typeddict-unknown-key]
# Historical reasoning content is dropped when a new user message is introduced
# Historical reasoning content is dropped when a new user message
# is introduced
drop_thinking = messages[-1]["role"] == "user"
encode_config = dict(thinking_mode=thinking_mode, drop_thinking=drop_thinking)
encode_config = dict(
thinking_mode=thinking_mode, drop_thinking=drop_thinking
)
prompt_str = encode_messages(messages, **encode_config) # type: ignore
......@@ -83,109 +66,24 @@ class DeepseekV32Tokenizer(CachedHfTokenizer):
def num_special_tokens_to_add(self) -> int:
    """Return the length of the token list produced by encoding ``""``."""
    # Whatever ``encode`` emits for empty input is taken as the count.
    empty_encoding = self.encode("")
    return len(empty_encoding)
@property
def all_special_tokens(self) -> list[str]:
return self.tokenizer.all_special_tokens
@property
def all_special_ids(self) -> list[int]:
return self.tokenizer.all_special_ids
@property
def bos_token_id(self) -> int:
return self.tokenizer.bos_token_id
@property
def eos_token_id(self) -> int:
return self.tokenizer.eos_token_id
@property
def pad_token_id(self) -> int:
return self.tokenizer.pad_token_id
@property
def is_fast(self) -> bool:
return self.tokenizer.is_fast
@property
def vocab_size(self) -> int:
return self.tokenizer.vocab_size
@property
def max_token_id(self) -> int:
return self.tokenizer.max_token_id
@property
def max_chars_per_token(self) -> int:
return self.tokenizer.max_chars_per_token
@property
def truncation_side(self) -> str:
return self.tokenizer.truncation_side
def __hash__(self) -> int:
return hash(id(self))
def __len__(self) -> int:
# </think> is an added token in DeepseekV32 tokenizer
return self.vocab_size + self._added_vocab_size
def __call__(
self,
text: str | list[str],
text_pair: str | None = None,
add_special_tokens: bool = True,
truncation: bool = False,
max_length: int | None = None,
) -> "BatchEncoding":
return self.tokenizer(
text,
text_pair=text_pair,
add_special_tokens=add_special_tokens,
truncation=truncation,
max_length=max_length,
)
def get_vocab(self) -> dict[str, int]:
return self.tokenizer.get_vocab()
return tokenizer_vocab_size + added_vocab_size
def get_added_vocab(self) -> dict[str, int]:
return self._added_vocab.copy()
return added_vocab.copy()
def encode(
self,
text: str,
truncation: bool | None = None,
max_length: int | None = None,
add_special_tokens: bool = True,
) -> list[int]:
return self.tokenizer.encode(
text,
truncation=truncation,
max_length=max_length,
add_special_tokens=add_special_tokens,
)
@overload
def convert_tokens_to_ids(self, tokens: str) -> int: ...
@overload
def convert_tokens_to_ids(self, tokens: list[str]) -> list[int]: ...
def __reduce__(self):
return get_deepseek_v32_tokenizer, (tokenizer,)
def convert_tokens_to_ids(self, tokens: str | list[str]) -> int | list[int]:
return self.tokenizer.convert_tokens_to_ids(tokens)
_DeepseekV32Tokenizer.__name__ = f"DSV32{tokenizer.__class__.__name__}"
def convert_tokens_to_string(self, tokens: list[str]) -> str:
return self.tokenizer.convert_tokens_to_string(tokens)
dsv32_tokenizer.__class__ = _DeepseekV32Tokenizer
return dsv32_tokenizer
def decode(self, ids: list[int] | int, skip_special_tokens: bool = False) -> str:
return self.tokenizer.decode(ids, skip_special_tokens=skip_special_tokens)
def convert_ids_to_tokens(
self,
ids: list[int],
skip_special_tokens: bool = False,
) -> list[str]:
return self.tokenizer.convert_ids_to_tokens(
ids, skip_special_tokens=skip_special_tokens
)
class DeepseekV32Tokenizer(TokenizerLike):
    """Loader entry point for the DeepSeek V3.2 tokenizer.

    ``from_pretrained`` does not return an instance of this class; it
    returns the Hugging Face tokenizer after it has been passed through
    ``get_deepseek_v32_tokenizer`` and ``get_cached_tokenizer``.
    """

    @classmethod
    def from_pretrained(cls, *args, **kwargs) -> HfTokenizer:
        """Load a Hugging Face tokenizer and wrap it for DeepSeek V3.2.

        Args:
            *args: Positional arguments forwarded to
                ``AutoTokenizer.from_pretrained``.
            **kwargs: Keyword arguments forwarded to
                ``AutoTokenizer.from_pretrained``. A ``download_dir``
                entry is translated to ``cache_dir`` before forwarding.

        Returns:
            The result of ``get_cached_tokenizer`` applied to the
            DeepSeek V3.2 wrapped tokenizer.
        """
        # ``download_dir`` is not an argument Hugging Face understands: it
        # expects ``cache_dir`` and would otherwise pass ``download_dir``
        # on to the tokenizer constructor as an unrelated extra kwarg,
        # silently ignoring the requested download location.
        download_dir = kwargs.pop("download_dir", None)
        if download_dir is not None:
            # An explicitly supplied ``cache_dir`` still takes precedence.
            kwargs.setdefault("cache_dir", download_dir)

        tokenizer = AutoTokenizer.from_pretrained(*args, **kwargs)
        return get_cached_tokenizer(get_deepseek_v32_tokenizer(tokenizer))
......@@ -10,7 +10,6 @@ import torch
import vllm.envs
from vllm.logger import init_logger
from vllm.sampling_params import SamplingParams
from vllm.tokenizers.deepseek_v32 import DeepseekV32Tokenizer
from vllm.tokenizers.mistral import MistralTokenizer
from vllm.utils.import_utils import LazyLoader
from vllm.v1.structured_output.backend_types import (
......@@ -57,27 +56,6 @@ class XgrammarBackend(StructuredOutputBackend):
stop_token_ids=stop_token_ids,
add_prefix_space=True,
)
elif isinstance(self.tokenizer, DeepseekV32Tokenizer):
# copy from xgr.TokenizerInfo.from_huggingface()
# because we are using a custom tokenizer wrapper here.
vocab_dict = self.tokenizer.get_vocab()
tokenizer_vocab_size = max(len(vocab_dict), self.tokenizer.max_token_id + 1)
vocab_size = self.vocab_size or tokenizer_vocab_size
# maintain tokenizer's indexing
encoded_vocab = [""] * vocab_size
for token, idx in vocab_dict.items():
if idx < vocab_size:
encoded_vocab[idx] = token
stop_token_ids = [self.tokenizer.eos_token_id]
backend_str = self.tokenizer.tokenizer.backend_tokenizer.to_str() # type: ignore[attr-defined]
metadata = xgr.TokenizerInfo._detect_metadata_from_hf(backend_str)
tokenizer_info = xgr.TokenizerInfo(
encoded_vocab=encoded_vocab,
vocab_type=metadata["vocab_type"],
vocab_size=vocab_size,
stop_token_ids=stop_token_ids,
add_prefix_space=metadata["add_prefix_space"],
)
else:
tokenizer_info = xgr.TokenizerInfo.from_huggingface(
self.tokenizer,
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment