Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
a96197f5
Unverified
Commit
a96197f5
authored
Feb 07, 2026
by
Nick Hill
Committed by
GitHub
Feb 08, 2026
Browse files
[Perf] Simplify DeepseekV32 tokenizer, ensure fast detokenization used (#33855)
Signed-off-by:
Nick Hill
<
nickhill123@gmail.com
>
parent
ab10d798
Changes
4
Hide whitespace changes
Inline
Side-by-side
Showing
4 changed files
with
88 additions
and
203 deletions
+88
-203
tests/tokenizers_/test_basic.py
tests/tokenizers_/test_basic.py
+8
-0
vllm/renderers/deepseek_v32.py
vllm/renderers/deepseek_v32.py
+3
-2
vllm/tokenizers/deepseek_v32.py
vllm/tokenizers/deepseek_v32.py
+77
-179
vllm/v1/structured_output/backend_xgrammar.py
vllm/v1/structured_output/backend_xgrammar.py
+0
-22
No files found.
tests/tokenizers_/test_basic.py
View file @
a96197f5
...
...
@@ -11,6 +11,7 @@ from transformers import (
from
vllm.tokenizers
import
TokenizerLike
,
get_tokenizer
from
vllm.tokenizers.grok2
import
Grok2Tokenizer
from
vllm.tokenizers.hf
import
HfTokenizer
from
vllm.tokenizers.mistral
import
MistralTokenizer
...
...
@@ -42,6 +43,13 @@ def test_tokenizer_like_protocol():
assert
isinstance
(
tokenizer
,
Grok2Tokenizer
)
_assert_tokenizer_like
(
tokenizer
)
tokenizer
=
get_tokenizer
(
"deepseek-ai/DeepSeek-V3"
,
tokenizer_mode
=
"deepseek_v32"
)
assert
isinstance
(
tokenizer
,
HfTokenizer
)
# Verify it's a fast tokenizer (required for FastIncrementalDetokenizer)
assert
isinstance
(
tokenizer
,
PreTrainedTokenizerFast
)
assert
"DSV32"
in
tokenizer
.
__class__
.
__name__
_assert_tokenizer_like
(
tokenizer
)
@
pytest
.
mark
.
parametrize
(
"tokenizer_name"
,
[
"facebook/opt-125m"
,
"gpt2"
])
def
test_tokenizer_revision
(
tokenizer_name
:
str
):
...
...
vllm/renderers/deepseek_v32.py
View file @
a96197f5
...
...
@@ -13,6 +13,7 @@ from vllm.logger import init_logger
from
vllm.tokenizers
import
cached_get_tokenizer
from
vllm.tokenizers.deepseek_v32
import
DeepseekV32Tokenizer
from
..tokenizers.hf
import
HfTokenizer
from
.inputs
import
DictPrompt
from
.inputs.preprocess
import
parse_dec_only_prompt
from
.params
import
ChatParams
...
...
@@ -48,10 +49,10 @@ class DeepseekV32Renderer(BaseRenderer):
self
.
_tokenizer
=
tokenizer
@
property
def
tokenizer
(
self
)
->
DeepseekV32
Tokenizer
|
None
:
def
tokenizer
(
self
)
->
Hf
Tokenizer
|
None
:
return
self
.
_tokenizer
def
get_tokenizer
(
self
)
->
DeepseekV32
Tokenizer
:
def
get_tokenizer
(
self
)
->
Hf
Tokenizer
:
tokenizer
=
self
.
tokenizer
if
tokenizer
is
None
:
raise
ValueError
(
"Tokenizer not available when `skip_tokenizer_init=True`"
)
...
...
vllm/tokenizers/deepseek_v32.py
View file @
a96197f5
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import
copy
from
typing
import
Any
from
pathlib
import
Path
from
typing
import
Any
,
overload
from
transformers
import
BatchEncoding
from
transformers
import
AutoTokenizer
from
vllm.entrypoints.chat_utils
import
ChatCompletionMessageParam
from
.
import
TokenizerLike
from
.deepseek_v32_encoding
import
encode_messages
from
.hf
import
CachedHfTokenizer
from
.protocol
import
TokenizerLike
from
.hf
import
HfTokenizer
,
get_cached_tokenizer
class
DeepseekV32Tokenizer
(
CachedHfTokenizer
):
@
classmethod
def
from_pretrained
(
cls
,
path_or_repo_id
:
str
|
Path
,
*
args
,
trust_remote_code
:
bool
=
False
,
revision
:
str
|
None
=
None
,
download_dir
:
str
|
None
=
None
,
**
kwargs
,
)
->
"TokenizerLike"
:
tokenizer
=
super
().
from_pretrained
(
path_or_repo_id
,
*
args
,
trust_remote_code
=
trust_remote_code
,
revision
=
revision
,
download_dir
=
download_dir
,
def
get_deepseek_v32_tokenizer
(
tokenizer
:
HfTokenizer
)
->
HfTokenizer
:
"""
Wraps a tokenizer to use the custom DeepSeek V3.2 chat template encoding.
"""
dsv32_tokenizer
=
copy
.
copy
(
tokenizer
)
added_vocab
=
tokenizer
.
get_added_vocab
()
added_vocab_size
=
len
(
added_vocab
)
tokenizer_vocab_size
=
tokenizer
.
vocab_size
class
_DeepseekV32Tokenizer
(
tokenizer
.
__class__
):
# type: ignore
def
apply_chat_template
(
self
,
messages
:
list
[
"ChatCompletionMessageParam"
],
tools
:
list
[
dict
[
str
,
Any
]]
|
None
=
None
,
**
kwargs
,
)
return
DeepseekV32Tokenizer
(
tokenizer
)
def
__init__
(
self
,
tokenizer
:
TokenizerLike
)
->
None
:
super
().
__init__
()
self
.
tokenizer
=
tokenizer
self
.
name_or_path
=
getattr
(
tokenizer
,
"name_or_path"
,
""
)
self
.
_added_vocab
=
self
.
tokenizer
.
get_added_vocab
()
self
.
_added_vocab_size
=
len
(
self
.
_added_vocab
)
def
apply_chat_template
(
self
,
messages
:
list
[
"ChatCompletionMessageParam"
],
tools
:
list
[
dict
[
str
,
Any
]]
|
None
=
None
,
**
kwargs
,
)
->
str
|
list
[
int
]:
thinking
=
kwargs
.
get
(
"thinking"
,
False
)
enable_thinking
=
kwargs
.
get
(
"enable_thinking"
,
False
)
thinking
=
thinking
or
enable_thinking
thinking_mode
=
"thinking"
if
not
thinking
:
thinking_mode
=
"chat"
conversation
=
kwargs
.
get
(
"conversation"
,
messages
)
messages
=
conversation
.
copy
()
if
tools
is
not
None
and
len
(
tools
)
>
0
:
messages
.
insert
(
0
,
{
"role"
:
"system"
})
messages
[
0
][
"tools"
]
=
tools
# type: ignore[typeddict-unknown-key]
# Historical reasoning content is dropped when a new user message is introduced
drop_thinking
=
messages
[
-
1
][
"role"
]
==
"user"
encode_config
=
dict
(
thinking_mode
=
thinking_mode
,
drop_thinking
=
drop_thinking
)
prompt_str
=
encode_messages
(
messages
,
**
encode_config
)
# type: ignore
if
kwargs
.
get
(
"tokenize"
,
True
):
tokenizer_kwargs
=
{
k
:
kwargs
[
k
]
for
k
in
(
"truncation"
,
"max_length"
)
if
k
in
kwargs
}
return
self
.
encode
(
prompt_str
,
add_special_tokens
=
False
,
**
tokenizer_kwargs
,
)
->
str
|
list
[
int
]:
thinking
=
kwargs
.
get
(
"thinking"
,
False
)
enable_thinking
=
kwargs
.
get
(
"enable_thinking"
,
False
)
thinking
=
thinking
or
enable_thinking
thinking_mode
=
"thinking"
if
not
thinking
:
thinking_mode
=
"chat"
conversation
=
kwargs
.
get
(
"conversation"
,
messages
)
messages
=
conversation
.
copy
()
if
tools
is
not
None
and
len
(
tools
)
>
0
:
messages
.
insert
(
0
,
{
"role"
:
"system"
})
messages
[
0
][
"tools"
]
=
tools
# type: ignore[typeddict-unknown-key]
# Historical reasoning content is dropped when a new user message
# is introduced
drop_thinking
=
messages
[
-
1
][
"role"
]
==
"user"
encode_config
=
dict
(
thinking_mode
=
thinking_mode
,
drop_thinking
=
drop_thinking
)
return
prompt_str
def
num_special_tokens_to_add
(
self
)
->
int
:
return
len
(
self
.
encode
(
""
))
@
property
def
all_special_tokens
(
self
)
->
list
[
str
]:
return
self
.
tokenizer
.
all_special_tokens
@
property
def
all_special_ids
(
self
)
->
list
[
int
]:
return
self
.
tokenizer
.
all_special_ids
@
property
def
bos_token_id
(
self
)
->
int
:
return
self
.
tokenizer
.
bos_token_id
@
property
def
eos_token_id
(
self
)
->
int
:
return
self
.
tokenizer
.
eos_token_id
@
property
def
pad_token_id
(
self
)
->
int
:
return
self
.
tokenizer
.
pad_token_id
@
property
def
is_fast
(
self
)
->
bool
:
return
self
.
tokenizer
.
is_fast
@
property
def
vocab_size
(
self
)
->
int
:
return
self
.
tokenizer
.
vocab_size
@
property
def
max_token_id
(
self
)
->
int
:
return
self
.
tokenizer
.
max_token_id
@
property
def
max_chars_per_token
(
self
)
->
int
:
return
self
.
tokenizer
.
max_chars_per_token
@
property
def
truncation_side
(
self
)
->
str
:
return
self
.
tokenizer
.
truncation_side
def
__hash__
(
self
)
->
int
:
return
hash
(
id
(
self
))
def
__len__
(
self
)
->
int
:
# </think> is an added token in DeepseekV32 tokenizer
return
self
.
vocab_size
+
self
.
_added_vocab_size
def
__call__
(
self
,
text
:
str
|
list
[
str
],
text_pair
:
str
|
None
=
None
,
add_special_tokens
:
bool
=
True
,
truncation
:
bool
=
False
,
max_length
:
int
|
None
=
None
,
)
->
"BatchEncoding"
:
return
self
.
tokenizer
(
text
,
text_pair
=
text_pair
,
add_special_tokens
=
add_special_tokens
,
truncation
=
truncation
,
max_length
=
max_length
,
)
def
get_vocab
(
self
)
->
dict
[
str
,
int
]:
return
self
.
tokenizer
.
get_vocab
()
def
get_added_vocab
(
self
)
->
dict
[
str
,
int
]:
return
self
.
_added_vocab
.
copy
()
def
encode
(
self
,
text
:
str
,
truncation
:
bool
|
None
=
None
,
max_length
:
int
|
None
=
None
,
add_special_tokens
:
bool
=
True
,
)
->
list
[
int
]:
return
self
.
tokenizer
.
encode
(
text
,
truncation
=
truncation
,
max_length
=
max_length
,
add_special_tokens
=
add_special_tokens
,
)
@
overload
def
convert_tokens_to_ids
(
self
,
tokens
:
str
)
->
int
:
...
@
overload
def
convert_tokens_to_ids
(
self
,
tokens
:
list
[
str
])
->
list
[
int
]:
...
def
convert_tokens_to_ids
(
self
,
tokens
:
str
|
list
[
str
])
->
int
|
list
[
int
]:
return
self
.
tokenizer
.
convert_tokens_to_ids
(
tokens
)
def
convert_tokens_to_string
(
self
,
tokens
:
list
[
str
])
->
str
:
return
self
.
tokenizer
.
convert_tokens_to_string
(
tokens
)
def
decode
(
self
,
ids
:
list
[
int
]
|
int
,
skip_special_tokens
:
bool
=
False
)
->
str
:
return
self
.
tokenizer
.
decode
(
ids
,
skip_special_tokens
=
skip_special_tokens
)
def
convert_ids_to_tokens
(
self
,
ids
:
list
[
int
],
skip_special_tokens
:
bool
=
False
,
)
->
list
[
str
]:
return
self
.
tokenizer
.
convert_ids_to_tokens
(
ids
,
skip_special_tokens
=
skip_special_tokens
)
prompt_str
=
encode_messages
(
messages
,
**
encode_config
)
# type: ignore
if
kwargs
.
get
(
"tokenize"
,
True
):
tokenizer_kwargs
=
{
k
:
kwargs
[
k
]
for
k
in
(
"truncation"
,
"max_length"
)
if
k
in
kwargs
}
return
self
.
encode
(
prompt_str
,
add_special_tokens
=
False
,
**
tokenizer_kwargs
,
)
return
prompt_str
def
num_special_tokens_to_add
(
self
)
->
int
:
return
len
(
self
.
encode
(
""
))
def
__len__
(
self
)
->
int
:
# </think> is an added token in DeepseekV32 tokenizer
return
tokenizer_vocab_size
+
added_vocab_size
def
get_added_vocab
(
self
)
->
dict
[
str
,
int
]:
return
added_vocab
.
copy
()
def
__reduce__
(
self
):
return
get_deepseek_v32_tokenizer
,
(
tokenizer
,)
_DeepseekV32Tokenizer
.
__name__
=
f
"DSV32
{
tokenizer
.
__class__
.
__name__
}
"
dsv32_tokenizer
.
__class__
=
_DeepseekV32Tokenizer
return
dsv32_tokenizer
class
DeepseekV32Tokenizer
(
TokenizerLike
):
@
classmethod
def
from_pretrained
(
cls
,
*
args
,
**
kwargs
)
->
HfTokenizer
:
tokenizer
=
AutoTokenizer
.
from_pretrained
(
*
args
,
**
kwargs
)
return
get_cached_tokenizer
(
get_deepseek_v32_tokenizer
(
tokenizer
))
vllm/v1/structured_output/backend_xgrammar.py
View file @
a96197f5
...
...
@@ -10,7 +10,6 @@ import torch
import
vllm.envs
from
vllm.logger
import
init_logger
from
vllm.sampling_params
import
SamplingParams
from
vllm.tokenizers.deepseek_v32
import
DeepseekV32Tokenizer
from
vllm.tokenizers.mistral
import
MistralTokenizer
from
vllm.utils.import_utils
import
LazyLoader
from
vllm.v1.structured_output.backend_types
import
(
...
...
@@ -57,27 +56,6 @@ class XgrammarBackend(StructuredOutputBackend):
stop_token_ids
=
stop_token_ids
,
add_prefix_space
=
True
,
)
elif
isinstance
(
self
.
tokenizer
,
DeepseekV32Tokenizer
):
# copy from xgr.TokenizerInfo.from_huggingface()
# because we are using a custom tokenizer wrapper here.
vocab_dict
=
self
.
tokenizer
.
get_vocab
()
tokenizer_vocab_size
=
max
(
len
(
vocab_dict
),
self
.
tokenizer
.
max_token_id
+
1
)
vocab_size
=
self
.
vocab_size
or
tokenizer_vocab_size
# maintain tokenizer's indexing
encoded_vocab
=
[
""
]
*
vocab_size
for
token
,
idx
in
vocab_dict
.
items
():
if
idx
<
vocab_size
:
encoded_vocab
[
idx
]
=
token
stop_token_ids
=
[
self
.
tokenizer
.
eos_token_id
]
backend_str
=
self
.
tokenizer
.
tokenizer
.
backend_tokenizer
.
to_str
()
# type: ignore[attr-defined]
metadata
=
xgr
.
TokenizerInfo
.
_detect_metadata_from_hf
(
backend_str
)
tokenizer_info
=
xgr
.
TokenizerInfo
(
encoded_vocab
=
encoded_vocab
,
vocab_type
=
metadata
[
"vocab_type"
],
vocab_size
=
vocab_size
,
stop_token_ids
=
stop_token_ids
,
add_prefix_space
=
metadata
[
"add_prefix_space"
],
)
else
:
tokenizer_info
=
xgr
.
TokenizerInfo
.
from_huggingface
(
self
.
tokenizer
,
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment