Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
f0a28bf6
Unverified
Commit
f0a28bf6
authored
Dec 01, 2025
by
Cyrus Leung
Committed by
GitHub
Dec 01, 2025
Browse files
[Misc] Unify tokenizer registration (#29767)
Signed-off-by:
DarkLight1337
<
tlleungac@connect.ust.hk
>
parent
86e178f7
Changes
14
Show whitespace changes
Inline
Side-by-side
Showing
14 changed files
with
237 additions
and
183 deletions
+237
-183
tests/entrypoints/openai/test_tokenization.py
tests/entrypoints/openai/test_tokenization.py
+5
-5
tests/entrypoints/pooling/embed/test_online.py
tests/entrypoints/pooling/embed/test_online.py
+1
-1
tests/entrypoints/pooling/pooling/test_online.py
tests/entrypoints/pooling/pooling/test_online.py
+1
-5
tests/models/registry.py
tests/models/registry.py
+1
-1
tests/tokenizers_/test_registry.py
tests/tokenizers_/test_registry.py
+21
-4
tests/v1/entrypoints/llm/test_struct_output_generate.py
tests/v1/entrypoints/llm/test_struct_output_generate.py
+2
-7
vllm/config/model.py
vllm/config/model.py
+7
-15
vllm/engine/arg_utils.py
vllm/engine/arg_utils.py
+1
-1
vllm/entrypoints/llm.py
vllm/entrypoints/llm.py
+1
-1
vllm/tokenizers/__init__.py
vllm/tokenizers/__init__.py
+8
-2
vllm/tokenizers/hf.py
vllm/tokenizers/hf.py
+2
-0
vllm/tokenizers/mistral.py
vllm/tokenizers/mistral.py
+2
-0
vllm/tokenizers/registry.py
vllm/tokenizers/registry.py
+184
-15
vllm/transformers_utils/tokenizer.py
vllm/transformers_utils/tokenizer.py
+1
-126
No files found.
tests/entrypoints/openai/test_tokenization.py
View file @
f0a28bf6
...
...
@@ -53,7 +53,7 @@ async def test_tokenize_completions(
model_name
:
str
,
tokenizer_name
:
str
,
):
tokenizer
=
get_tokenizer
(
tokenizer_name
=
tokenizer_name
,
tokenizer_mode
=
"fast"
)
tokenizer
=
get_tokenizer
(
tokenizer_name
=
tokenizer_name
)
for
add_special
in
[
False
,
True
]:
prompt
=
"vllm1 This is a test prompt."
...
...
@@ -87,7 +87,7 @@ async def test_tokenize_chat(
model_name
:
str
,
tokenizer_name
:
str
,
):
tokenizer
=
get_tokenizer
(
tokenizer_name
=
tokenizer_name
,
tokenizer_mode
=
"fast"
)
tokenizer
=
get_tokenizer
(
tokenizer_name
=
tokenizer_name
)
for
add_generation
in
[
False
,
True
]:
for
add_special
in
[
False
,
True
]:
...
...
@@ -140,7 +140,7 @@ async def test_tokenize_chat_with_tools(
model_name
:
str
,
tokenizer_name
:
str
,
):
tokenizer
=
get_tokenizer
(
tokenizer_name
=
tokenizer_name
,
tokenizer_mode
=
"fast"
)
tokenizer
=
get_tokenizer
(
tokenizer_name
=
tokenizer_name
)
for
add_generation
in
[
False
,
True
]:
for
add_special
in
[
False
,
True
]:
...
...
@@ -210,7 +210,7 @@ async def test_tokenize_with_return_token_strs(
model_name
:
str
,
tokenizer_name
:
str
,
):
tokenizer
=
get_tokenizer
(
tokenizer_name
=
tokenizer_name
,
tokenizer_mode
=
"fast"
)
tokenizer
=
get_tokenizer
(
tokenizer_name
=
tokenizer_name
)
prompt
=
"This is a token_strs test prompt! vllm1"
response
=
requests
.
post
(
...
...
@@ -240,7 +240,7 @@ async def test_detokenize(
model_name
:
str
,
tokenizer_name
:
str
,
):
tokenizer
=
get_tokenizer
(
tokenizer_name
=
tokenizer_name
,
tokenizer_mode
=
"fast"
)
tokenizer
=
get_tokenizer
(
tokenizer_name
=
tokenizer_name
)
prompt
=
"This is a test prompt. vllm1"
tokens
=
tokenizer
.
encode
(
prompt
,
add_special_tokens
=
False
)
...
...
tests/entrypoints/pooling/embed/test_online.py
View file @
f0a28bf6
...
...
@@ -197,7 +197,7 @@ async def test_conversation_embedding(
chat_response
.
raise_for_status
()
chat_embeddings
=
EmbeddingResponse
.
model_validate
(
chat_response
.
json
())
tokenizer
=
get_tokenizer
(
tokenizer_name
=
model_name
,
tokenizer_mode
=
"fast"
)
tokenizer
=
get_tokenizer
(
tokenizer_name
=
model_name
)
prompt
=
tokenizer
.
apply_chat_template
(
messages
,
chat_template
=
DUMMY_CHAT_TEMPLATE
,
...
...
tests/entrypoints/pooling/pooling/test_online.py
View file @
f0a28bf6
...
...
@@ -158,11 +158,7 @@ async def test_conversation_pooling(server: RemoteOpenAIServer, model_name: str)
chat_response
.
raise_for_status
()
chat_poolings
=
PoolingResponse
.
model_validate
(
chat_response
.
json
())
tokenizer
=
get_tokenizer
(
tokenizer_name
=
model_name
,
tokenizer_mode
=
"fast"
,
trust_remote_code
=
True
,
)
tokenizer
=
get_tokenizer
(
tokenizer_name
=
model_name
,
trust_remote_code
=
True
)
prompt
=
tokenizer
.
apply_chat_template
(
messages
,
chat_template
=
DUMMY_CHAT_TEMPLATE
,
...
...
tests/models/registry.py
View file @
f0a28bf6
...
...
@@ -23,7 +23,7 @@ class _HfExamplesInfo:
tokenizer
:
str
|
None
=
None
"""Set the tokenizer to load for this architecture."""
tokenizer_mode
:
TokenizerMode
=
"auto"
tokenizer_mode
:
TokenizerMode
|
str
=
"auto"
"""Set the tokenizer type for this architecture."""
speculative_model
:
str
|
None
=
None
...
...
tests/tokenizers_/test_registry.py
View file @
f0a28bf6
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from
pathlib
import
Path
from
vllm.tokenizers
import
TokenizerLike
,
TokenizerRegistry
from
vllm.transformers_utils.tokenizer
import
get_tokenizer
class
TestTokenizer
(
TokenizerLike
):
@
classmethod
def
from_pretrained
(
cls
,
*
args
,
**
kwargs
)
->
"TestTokenizer"
:
return
TestTokenizer
()
# type: ignore
def
from_pretrained
(
cls
,
path_or_repo_id
:
str
|
Path
,
*
args
,
trust_remote_code
:
bool
=
False
,
revision
:
str
|
None
=
None
,
download_dir
:
str
|
None
=
None
,
**
kwargs
,
)
->
"TestTokenizer"
:
return
TestTokenizer
(
path_or_repo_id
)
# type: ignore
def
__init__
(
self
,
path_or_repo_id
:
str
|
Path
)
->
None
:
super
().
__init__
()
self
.
path_or_repo_id
=
path_or_repo_id
@
property
def
bos_token_id
(
self
)
->
int
:
...
...
@@ -29,14 +44,16 @@ class TestTokenizer(TokenizerLike):
def
test_customized_tokenizer
():
TokenizerRegistry
.
register
(
"test_tokenizer"
,
__name__
,
TestTokenizer
.
__name__
)
tokenizer
=
TokenizerRegistry
.
get_tokenizer
(
"test_tokenizer"
)
tokenizer
=
TokenizerRegistry
.
get_tokenizer
(
"test_tokenizer"
,
"abc"
)
assert
isinstance
(
tokenizer
,
TestTokenizer
)
assert
tokenizer
.
path_or_repo_id
==
"abc"
assert
tokenizer
.
bos_token_id
==
0
assert
tokenizer
.
eos_token_id
==
1
assert
tokenizer
.
pad_token_id
==
2
tokenizer
=
get_tokenizer
(
"
test_tokenizer
"
,
tokenizer_mode
=
"
custom
"
)
tokenizer
=
get_tokenizer
(
"
abc
"
,
tokenizer_mode
=
"
test_tokenizer
"
)
assert
isinstance
(
tokenizer
,
TestTokenizer
)
assert
tokenizer
.
path_or_repo_id
==
"abc"
assert
tokenizer
.
bos_token_id
==
0
assert
tokenizer
.
eos_token_id
==
1
assert
tokenizer
.
pad_token_id
==
2
tests/v1/entrypoints/llm/test_struct_output_generate.py
View file @
f0a28bf6
...
...
@@ -4,7 +4,7 @@
import
json
from
enum
import
Enum
from
typing
import
TYPE_CHECKING
,
Any
from
typing
import
Any
import
jsonschema
import
pytest
...
...
@@ -24,11 +24,6 @@ from vllm.sampling_params import (
StructuredOutputsParams
,
)
if
TYPE_CHECKING
:
from
vllm.config.model
import
TokenizerMode
else
:
TokenizerMode
=
str
NGRAM_SPEC_CONFIG
=
{
"model"
:
"[ngram]"
,
"num_speculative_tokens"
:
5
,
...
...
@@ -627,7 +622,7 @@ Make the response as short as possible.
)
def
test_structured_output_with_reasoning_matrices
(
backend
:
str
,
tokenizer_mode
:
TokenizerMode
,
tokenizer_mode
:
str
,
reasoning_parser
:
str
,
model_name
:
str
,
speculative_config
:
dict
[
str
,
Any
]
|
None
,
...
...
vllm/config/model.py
View file @
f0a28bf6
...
...
@@ -86,7 +86,7 @@ TaskOption = Literal[
"transcription"
,
"draft"
,
]
TokenizerMode
=
Literal
[
"auto"
,
"hf"
,
"slow"
,
"mistral"
,
"custom"
]
TokenizerMode
=
Literal
[
"auto"
,
"hf"
,
"slow"
,
"mistral"
]
ModelDType
=
Literal
[
"auto"
,
"half"
,
"float16"
,
"bfloat16"
,
"float"
,
"float32"
]
LogprobsMode
=
Literal
[
"raw_logits"
,
"raw_logprobs"
,
"processed_logits"
,
"processed_logprobs"
...
...
@@ -137,13 +137,13 @@ class ModelConfig:
tokenizer
:
SkipValidation
[
str
]
=
None
# type: ignore
"""Name or path of the Hugging Face tokenizer to use. If unspecified, model
name or path will be used."""
tokenizer_mode
:
TokenizerMode
=
"auto"
tokenizer_mode
:
TokenizerMode
|
str
=
"auto"
"""Tokenizer mode:
\n
- "auto" will use "hf" tokenizer if Mistral's tokenizer is not available.
\n
- "hf" will use the fast tokenizer if available.
\n
- "slow" will always use the slow tokenizer.
\n
- "mistral" will always use the tokenizer from `mistral_common`.
\n
-
"custom" will use --tokenizer to select the preregistered tokenizer
."""
-
Other custom values can be supported via plugins
."""
trust_remote_code
:
bool
=
False
"""Trust remote code (e.g., from HuggingFace) when downloading the model
and tokenizer."""
...
...
@@ -708,9 +708,6 @@ class ModelConfig:
# can be correctly capped to sliding window size
self
.
hf_text_config
.
sliding_window
=
None
if
not
self
.
skip_tokenizer_init
:
self
.
_verify_tokenizer_mode
()
# Avoid running try_verify_and_update_config multiple times
self
.
config_updated
=
False
...
...
@@ -718,6 +715,10 @@ class ModelConfig:
self
.
_verify_cuda_graph
()
self
.
_verify_bnb_config
()
@
field_validator
(
"tokenizer_mode"
,
mode
=
"after"
)
def
_lowercase_tokenizer_mode
(
cls
,
tokenizer_mode
:
str
)
->
str
:
return
tokenizer_mode
.
lower
()
@
field_validator
(
"quantization"
,
mode
=
"before"
)
@
classmethod
def
validate_quantization_before
(
cls
,
value
:
Any
)
->
Any
:
...
...
@@ -829,15 +830,6 @@ class ModelConfig:
model
,
_
=
split_remote_gguf
(
model
)
return
get_sentence_transformer_tokenizer_config
(
model
,
self
.
revision
)
def
_verify_tokenizer_mode
(
self
)
->
None
:
tokenizer_mode
=
cast
(
TokenizerMode
,
self
.
tokenizer_mode
.
lower
())
if
tokenizer_mode
not
in
get_args
(
TokenizerMode
):
raise
ValueError
(
f
"Unknown tokenizer mode:
{
self
.
tokenizer_mode
}
. Must be "
f
"one of
{
get_args
(
TokenizerMode
)
}
."
)
self
.
tokenizer_mode
=
tokenizer_mode
def
_get_default_runner_type
(
self
,
architectures
:
list
[
str
],
...
...
vllm/engine/arg_utils.py
View file @
f0a28bf6
...
...
@@ -360,7 +360,7 @@ class EngineArgs:
task
:
TaskOption
|
None
=
ModelConfig
.
task
skip_tokenizer_init
:
bool
=
ModelConfig
.
skip_tokenizer_init
enable_prompt_embeds
:
bool
=
ModelConfig
.
enable_prompt_embeds
tokenizer_mode
:
TokenizerMode
=
ModelConfig
.
tokenizer_mode
tokenizer_mode
:
TokenizerMode
|
str
=
ModelConfig
.
tokenizer_mode
trust_remote_code
:
bool
=
ModelConfig
.
trust_remote_code
allowed_local_media_path
:
str
=
ModelConfig
.
allowed_local_media_path
allowed_media_domains
:
list
[
str
]
|
None
=
ModelConfig
.
allowed_media_domains
...
...
vllm/entrypoints/llm.py
View file @
f0a28bf6
...
...
@@ -188,7 +188,7 @@ class LLM:
runner
:
RunnerOption
=
"auto"
,
convert
:
ConvertOption
=
"auto"
,
tokenizer
:
str
|
None
=
None
,
tokenizer_mode
:
TokenizerMode
=
"auto"
,
tokenizer_mode
:
TokenizerMode
|
str
=
"auto"
,
skip_tokenizer_init
:
bool
=
False
,
trust_remote_code
:
bool
=
False
,
allowed_local_media_path
:
str
=
""
,
...
...
vllm/tokenizers/__init__.py
View file @
f0a28bf6
...
...
@@ -4,6 +4,12 @@
from
.hf
import
HfTokenizer
from
.mistral
import
MistralTokenizer
from
.protocol
import
TokenizerLike
from
.registry
import
TokenizerRegistry
from
.registry
import
TokenizerRegistry
,
get_tokenizer
__all__
=
[
"TokenizerLike"
,
"HfTokenizer"
,
"MistralTokenizer"
,
"TokenizerRegistry"
]
__all__
=
[
"TokenizerLike"
,
"HfTokenizer"
,
"MistralTokenizer"
,
"TokenizerRegistry"
,
"get_tokenizer"
,
]
vllm/tokenizers/hf.py
View file @
f0a28bf6
...
...
@@ -10,6 +10,7 @@ from transformers import AutoTokenizer
from
vllm.transformers_utils.config
import
get_sentence_transformer_tokenizer_config
from
.protocol
import
TokenizerLike
from
.registry
import
TokenizerRegistry
if
TYPE_CHECKING
:
from
transformers
import
PreTrainedTokenizer
,
PreTrainedTokenizerFast
...
...
@@ -67,6 +68,7 @@ def get_cached_tokenizer(
return
cached_tokenizer
# type: ignore
@
TokenizerRegistry
.
register
(
"hf"
)
class
HfTokenizer
(
TokenizerLike
):
@
classmethod
def
from_pretrained
(
...
...
vllm/tokenizers/mistral.py
View file @
f0a28bf6
...
...
@@ -6,6 +6,7 @@ from typing import TYPE_CHECKING, Any, cast
from
vllm.logger
import
init_logger
from
.protocol
import
TokenizerLike
from
.registry
import
TokenizerRegistry
if
TYPE_CHECKING
:
from
mistral_common.protocol.instruct.request
import
(
...
...
@@ -165,6 +166,7 @@ def _tekken_token_to_id(tokenizer: "Tekkenizer", t: str | bytes) -> int:
return
tokenizer
.
unk_id
@
TokenizerRegistry
.
register
(
"mistral"
)
class
MistralTokenizer
(
TokenizerLike
):
@
classmethod
def
from_pretrained
(
...
...
vllm/tokenizers/registry.py
View file @
f0a28bf6
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import
importlib
import
importlib.util
from
collections.abc
import
Callable
from
pathlib
import
Path
from
typing
import
TypeVar
,
overload
import
huggingface_hub
import
vllm.envs
as
envs
from
vllm.logger
import
init_logger
from
vllm.transformers_utils.gguf_utils
import
get_gguf_file_path_from_hf
from
vllm.transformers_utils.repo_utils
import
list_filtered_repo_files
from
vllm.transformers_utils.utils
import
(
check_gguf_file
,
is_gguf
,
is_remote_gguf
,
split_remote_gguf
,
)
from
vllm.utils.import_utils
import
resolve_obj_by_qualname
from
.protocol
import
TokenizerLike
logger
=
init_logger
(
__name__
)
_T
=
TypeVar
(
"_T"
,
bound
=
type
[
TokenizerLike
])
class
TokenizerRegistry
:
# Tokenizer name -> (tokenizer module, tokenizer class)
REGISTRY
:
dict
[
str
,
tuple
[
str
,
str
]]
=
{}
# Tokenizer name ->
tokenizer_cls or
(tokenizer module, tokenizer class)
REGISTRY
:
dict
[
str
,
type
[
TokenizerLike
]
|
tuple
[
str
,
str
]]
=
{}
# In-tree tokenizers
@
staticmethod
def
register
(
name
:
str
,
module
:
str
,
class_name
:
str
)
->
None
:
TokenizerRegistry
.
REGISTRY
[
name
]
=
(
module
,
class_name
)
@
overload
def
register
(
tokenizer_mode
:
str
)
->
Callable
[[
_T
],
_T
]:
...
# OOT tokenizers
@
staticmethod
def
get_tokenizer
(
tokenizer_name
:
str
,
@
overload
def
register
(
tokenizer_mode
:
str
,
module
:
str
,
class_name
:
str
)
->
None
:
...
@
staticmethod
def
register
(
tokenizer_mode
:
str
,
module
:
str
|
None
=
None
,
class_name
:
str
|
None
=
None
,
)
->
Callable
[[
_T
],
_T
]
|
None
:
# In-tree tokenizers
if
module
is
None
or
class_name
is
None
:
def
wrapper
(
tokenizer_cls
:
_T
)
->
_T
:
assert
tokenizer_mode
not
in
TokenizerRegistry
.
REGISTRY
TokenizerRegistry
.
REGISTRY
[
tokenizer_mode
]
=
tokenizer_cls
return
tokenizer_cls
return
wrapper
# OOT tokenizers
if
tokenizer_mode
in
TokenizerRegistry
.
REGISTRY
:
logger
.
warning
(
"%s.%s is already registered for tokenizer_mode=%r. "
"It is overwritten by the new one."
,
module
,
class_name
,
tokenizer_mode
,
)
TokenizerRegistry
.
REGISTRY
[
tokenizer_mode
]
=
(
module
,
class_name
)
return
None
@
staticmethod
def
get_tokenizer
(
tokenizer_mode
:
str
,
*
args
,
**
kwargs
)
->
"TokenizerLike"
:
if
tokenizer_mode
not
in
TokenizerRegistry
.
REGISTRY
:
raise
ValueError
(
f
"No tokenizer registered for
{
tokenizer_mode
=
!
r
}
."
)
item
=
TokenizerRegistry
.
REGISTRY
[
tokenizer_mode
]
if
isinstance
(
item
,
type
):
return
item
.
from_pretrained
(
*
args
,
**
kwargs
)
module
,
class_name
=
item
logger
.
debug_once
(
f
"Loading
{
class_name
}
for
{
tokenizer_mode
=
!
r
}
"
)
class_
=
resolve_obj_by_qualname
(
f
"
{
module
}
.
{
class_name
}
"
)
return
class_
.
from_pretrained
(
*
args
,
**
kwargs
)
def
get_tokenizer
(
tokenizer_name
:
str
|
Path
,
*
args
,
tokenizer_mode
:
str
=
"auto"
,
trust_remote_code
:
bool
=
False
,
revision
:
str
|
None
=
None
,
download_dir
:
str
|
None
=
None
,
**
kwargs
,
)
->
"TokenizerLike"
:
tokenizer_cls
=
TokenizerRegistry
.
REGISTRY
.
get
(
tokenizer_name
)
if
tokenizer_cls
is
None
:
raise
ValueError
(
f
"Tokenizer
{
tokenizer_name
}
not found."
)
)
->
TokenizerLike
:
"""Gets a tokenizer for the given model name via HuggingFace or ModelScope."""
if
envs
.
VLLM_USE_MODELSCOPE
:
# download model from ModelScope hub,
# lazy import so that modelscope is not required for normal use.
from
modelscope.hub.snapshot_download
import
snapshot_download
tokenizer_module
=
importlib
.
import_module
(
tokenizer_cls
[
0
])
class_
=
getattr
(
tokenizer_module
,
tokenizer_cls
[
1
])
return
class_
.
from_pretrained
(
*
args
,
**
kwargs
)
# avoid circular import
from
vllm.model_executor.model_loader.weight_utils
import
get_lock
# Only set the tokenizer here, model will be downloaded on the workers.
if
not
Path
(
tokenizer_name
).
exists
():
# Use file lock to prevent multiple processes from
# downloading the same file at the same time.
with
get_lock
(
tokenizer_name
,
download_dir
):
tokenizer_path
=
snapshot_download
(
model_id
=
str
(
tokenizer_name
),
cache_dir
=
download_dir
,
revision
=
revision
,
local_files_only
=
huggingface_hub
.
constants
.
HF_HUB_OFFLINE
,
# Ignore weights - we only need the tokenizer.
ignore_file_pattern
=
[
".*.pt"
,
".*.safetensors"
,
".*.bin"
],
)
tokenizer_name
=
tokenizer_path
if
tokenizer_mode
==
"slow"
:
if
kwargs
.
get
(
"use_fast"
,
False
):
raise
ValueError
(
"Cannot use the fast tokenizer in slow tokenizer mode."
)
tokenizer_mode
=
"hf"
kwargs
[
"use_fast"
]
=
False
if
"truncation_side"
not
in
kwargs
:
kwargs
[
"truncation_side"
]
=
"left"
# Separate model folder from file path for GGUF models
if
is_gguf
(
tokenizer_name
):
if
check_gguf_file
(
tokenizer_name
):
kwargs
[
"gguf_file"
]
=
Path
(
tokenizer_name
).
name
tokenizer_name
=
Path
(
tokenizer_name
).
parent
elif
is_remote_gguf
(
tokenizer_name
):
tokenizer_name
,
quant_type
=
split_remote_gguf
(
tokenizer_name
)
# Get the HuggingFace Hub path for the GGUF file
gguf_file
=
get_gguf_file_path_from_hf
(
tokenizer_name
,
quant_type
,
revision
=
revision
,
)
kwargs
[
"gguf_file"
]
=
gguf_file
# Try to use official Mistral tokenizer if possible
if
tokenizer_mode
==
"auto"
and
importlib
.
util
.
find_spec
(
"mistral_common"
):
allow_patterns
=
[
"tekken.json"
,
"tokenizer.model.v*"
]
files_list
=
list_filtered_repo_files
(
model_name_or_path
=
str
(
tokenizer_name
),
allow_patterns
=
allow_patterns
,
revision
=
revision
,
)
if
len
(
files_list
)
>
0
:
tokenizer_mode
=
"mistral"
# Fallback to HF tokenizer
if
tokenizer_mode
==
"auto"
:
tokenizer_mode
=
"hf"
tokenizer_args
=
(
tokenizer_name
,
*
args
)
tokenizer_kwargs
=
dict
(
trust_remote_code
=
trust_remote_code
,
revision
=
revision
,
download_dir
=
download_dir
,
**
kwargs
,
)
if
tokenizer_mode
==
"custom"
:
logger
.
warning_once
(
"TokenizerRegistry now uses `tokenizer_mode` as the registry key "
"instead of `tokenizer_name`. "
"Please update the definition of `.from_pretrained` in "
"your custom tokenizer to accept `args=%s`, `kwargs=%s`. "
"Then, you can pass `tokenizer_mode=%r` instead of "
"`tokenizer_mode='custom'` when initializing vLLM."
,
tokenizer_args
,
str
(
tokenizer_kwargs
),
tokenizer_mode
,
)
tokenizer_mode
=
str
(
tokenizer_name
)
tokenizer
=
TokenizerRegistry
.
get_tokenizer
(
tokenizer_mode
,
*
tokenizer_args
,
**
tokenizer_kwargs
,
)
if
not
tokenizer
.
is_fast
:
logger
.
warning
(
"Using a slow tokenizer. This might cause a significant "
"slowdown. Consider using a fast tokenizer instead."
)
return
tokenizer
vllm/transformers_utils/tokenizer.py
View file @
f0a28bf6
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import
importlib.util
import
os
import
warnings
from
functools
import
lru_cache
from
pathlib
import
Path
from
typing
import
TYPE_CHECKING
,
Any
import
huggingface_hub
from
typing_extensions
import
assert_never
from
vllm
import
envs
from
vllm.logger
import
init_logger
from
vllm.tokenizers
import
(
HfTokenizer
,
MistralTokenizer
,
TokenizerLike
,
TokenizerRegistry
,
)
from
.gguf_utils
import
get_gguf_file_path_from_hf
from
.repo_utils
import
list_filtered_repo_files
from
.utils
import
check_gguf_file
,
is_gguf
,
is_remote_gguf
,
split_remote_gguf
from
vllm.tokenizers
import
TokenizerLike
,
get_tokenizer
if
TYPE_CHECKING
:
from
vllm.config
import
ModelConfig
...
...
@@ -108,117 +94,6 @@ def encode_tokens(
return
tokenizer
.
encode
(
text
,
**
kw_args
)
def
get_tokenizer
(
tokenizer_name
:
str
|
Path
,
*
args
,
tokenizer_mode
:
str
=
"auto"
,
trust_remote_code
:
bool
=
False
,
revision
:
str
|
None
=
None
,
download_dir
:
str
|
None
=
None
,
**
kwargs
,
)
->
TokenizerLike
:
"""Gets a tokenizer for the given model name via HuggingFace or ModelScope."""
if
envs
.
VLLM_USE_MODELSCOPE
:
# download model from ModelScope hub,
# lazy import so that modelscope is not required for normal use.
# pylint: disable=C.
from
modelscope.hub.snapshot_download
import
snapshot_download
# avoid circuit import
from
vllm.model_executor.model_loader.weight_utils
import
get_lock
# Only set the tokenizer here, model will be downloaded on the workers.
if
not
os
.
path
.
exists
(
tokenizer_name
):
# Use file lock to prevent multiple processes from
# downloading the same file at the same time.
with
get_lock
(
tokenizer_name
,
download_dir
):
tokenizer_path
=
snapshot_download
(
model_id
=
tokenizer_name
,
cache_dir
=
download_dir
,
revision
=
revision
,
local_files_only
=
huggingface_hub
.
constants
.
HF_HUB_OFFLINE
,
# Ignore weights - we only need the tokenizer.
ignore_file_pattern
=
[
".*.pt"
,
".*.safetensors"
,
".*.bin"
],
)
tokenizer_name
=
tokenizer_path
if
tokenizer_mode
==
"slow"
:
if
kwargs
.
get
(
"use_fast"
,
False
):
raise
ValueError
(
"Cannot use the fast tokenizer in slow tokenizer mode."
)
kwargs
[
"use_fast"
]
=
False
if
"truncation_side"
not
in
kwargs
:
kwargs
[
"truncation_side"
]
=
"left"
# Separate model folder from file path for GGUF models
if
is_gguf
(
tokenizer_name
):
if
check_gguf_file
(
tokenizer_name
):
kwargs
[
"gguf_file"
]
=
Path
(
tokenizer_name
).
name
tokenizer_name
=
Path
(
tokenizer_name
).
parent
elif
is_remote_gguf
(
tokenizer_name
):
tokenizer_name
,
quant_type
=
split_remote_gguf
(
tokenizer_name
)
# Get the HuggingFace Hub path for the GGUF file
gguf_file
=
get_gguf_file_path_from_hf
(
tokenizer_name
,
quant_type
,
revision
=
revision
,
)
kwargs
[
"gguf_file"
]
=
gguf_file
# if `tokenizer_mode` == "auto", check if tokenizer can be loaded via Mistral format
# first to use official Mistral tokenizer if possible.
mistral_common_installed
=
importlib
.
util
.
find_spec
(
"mistral_common"
)
is
not
None
if
tokenizer_mode
==
"auto"
and
mistral_common_installed
:
allow_patterns
=
[
"tekken.json"
,
"tokenizer.model.v*"
]
files_list
=
list_filtered_repo_files
(
model_name_or_path
=
str
(
tokenizer_name
),
allow_patterns
=
allow_patterns
,
revision
=
revision
,
)
if
len
(
files_list
)
>
0
:
tokenizer_mode
=
"mistral"
tokenizer
:
TokenizerLike
if
tokenizer_mode
==
"mistral"
:
logger
.
debug_once
(
f
"Loading MistralTokenizer from
{
tokenizer_name
}
"
)
tokenizer
=
MistralTokenizer
.
from_pretrained
(
tokenizer_name
,
*
args
,
trust_remote_code
=
trust_remote_code
,
revision
=
revision
,
download_dir
=
download_dir
,
**
kwargs
,
)
elif
tokenizer_mode
==
"custom"
:
logger
.
debug_once
(
f
"Loading CustomTokenizer from
{
tokenizer_name
}
"
)
tokenizer
=
TokenizerRegistry
.
get_tokenizer
(
str
(
tokenizer_name
),
*
args
,
trust_remote_code
=
trust_remote_code
,
revision
=
revision
,
download_dir
=
download_dir
,
**
kwargs
,
)
else
:
logger
.
debug_once
(
f
"Loading HfTokenizer from
{
tokenizer_name
}
"
)
tokenizer
=
HfTokenizer
.
from_pretrained
(
tokenizer_name
,
*
args
,
trust_remote_code
=
trust_remote_code
,
revision
=
revision
,
download_dir
=
download_dir
,
**
kwargs
,
)
if
not
tokenizer
.
is_fast
:
logger
.
warning
(
"Using a slow tokenizer. This might cause a significant "
"slowdown. Consider using a fast tokenizer instead."
)
return
tokenizer
cached_get_tokenizer
=
lru_cache
(
get_tokenizer
)
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment