Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
e83b7e37
Unverified
Commit
e83b7e37
authored
Dec 07, 2025
by
Cyrus Leung
Committed by
GitHub
Dec 07, 2025
Browse files
Revert "[Renderer] Separate out `RendererConfig` from `ModelConfig` (#30145)" (#30199)
parent
27f4c2fd
Changes
105
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
139 additions
and
285 deletions
+139
-285
vllm/config/renderer.py
vllm/config/renderer.py
+0
-109
vllm/config/speculative.py
vllm/config/speculative.py
+5
-0
vllm/config/vllm.py
vllm/config/vllm.py
+10
-15
vllm/engine/arg_utils.py
vllm/engine/arg_utils.py
+40
-59
vllm/engine/protocol.py
vllm/engine/protocol.py
+1
-2
vllm/entrypoints/chat_utils.py
vllm/entrypoints/chat_utils.py
+34
-45
vllm/entrypoints/llm.py
vllm/entrypoints/llm.py
+6
-8
vllm/entrypoints/openai/api_server.py
vllm/entrypoints/openai/api_server.py
+1
-1
vllm/entrypoints/openai/serving_completion.py
vllm/entrypoints/openai/serving_completion.py
+1
-1
vllm/entrypoints/openai/serving_engine.py
vllm/entrypoints/openai/serving_engine.py
+5
-6
vllm/entrypoints/openai/serving_models.py
vllm/entrypoints/openai/serving_models.py
+0
-1
vllm/entrypoints/openai/speech_to_text.py
vllm/entrypoints/openai/speech_to_text.py
+5
-5
vllm/entrypoints/pooling/pooling/serving.py
vllm/entrypoints/pooling/pooling/serving.py
+1
-1
vllm/entrypoints/pooling/score/serving.py
vllm/entrypoints/pooling/score/serving.py
+3
-1
vllm/entrypoints/score_utils.py
vllm/entrypoints/score_utils.py
+5
-8
vllm/entrypoints/utils.py
vllm/entrypoints/utils.py
+4
-4
vllm/inputs/preprocess.py
vllm/inputs/preprocess.py
+4
-5
vllm/model_executor/models/adapters.py
vllm/model_executor/models/adapters.py
+10
-10
vllm/model_executor/models/deepseek_ocr.py
vllm/model_executor/models/deepseek_ocr.py
+2
-2
vllm/model_executor/models/deepseek_vl2.py
vllm/model_executor/models/deepseek_vl2.py
+2
-2
No files found.
vllm/config/renderer.py
deleted
100644 → 0
View file @
27f4c2fd
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from
typing
import
Any
,
Literal
from
pydantic
import
Field
,
SkipValidation
from
pydantic.dataclasses
import
dataclass
from
vllm.config.model
import
ModelConfig
from
vllm.config.utils
import
config
from
vllm.transformers_utils.gguf_utils
import
is_gguf
from
vllm.transformers_utils.runai_utils
import
ObjectStorageModel
,
is_runai_obj_uri
from
vllm.transformers_utils.utils
import
maybe_model_redirect
TokenizerMode
=
Literal
[
"auto"
,
"hf"
,
"slow"
,
"mistral"
,
"deepseek_v32"
]
@
config
@
dataclass
class
RendererConfig
:
"""Configuration for the renderer."""
# NOTE: In reality, this is a required argument.
# We provide a dummy default value here to generate the CLI args.
model_config
:
SkipValidation
[
ModelConfig
]
=
None
# type: ignore
"""Provides model context to the renderer."""
tokenizer
:
str
=
""
"""Name or path of the Hugging Face tokenizer to use. If unspecified, model
name or path will be used."""
tokenizer_mode
:
TokenizerMode
|
str
=
"auto"
"""Tokenizer mode:
\n
- "auto" will use the tokenizer from `mistral_common` for Mistral models
if available, otherwise it will use the "hf" tokenizer.
\n
- "hf" will use the fast tokenizer if available.
\n
- "slow" will always use the slow tokenizer.
\n
- "mistral" will always use the tokenizer from `mistral_common`.
\n
- "deepseek_v32" will always use the tokenizer from `deepseek_v32`.
\n
- Other custom values can be supported via plugins."""
tokenizer_revision
:
str
|
None
=
None
"""The specific revision to use for the tokenizer on the Hugging Face Hub.
It can be a branch name, a tag name, or a commit id. If unspecified, will
use the default version."""
skip_tokenizer_init
:
bool
=
False
"""Skip initialization of tokenizer and detokenizer. Expects valid
`prompt_token_ids` and `None` for prompt from the input. The generated
output will contain token ids."""
io_processor_plugin
:
str
|
None
=
None
"""IOProcessor plugin name to load at model startup."""
media_io_kwargs
:
dict
[
str
,
dict
[
str
,
Any
]]
=
Field
(
default_factory
=
dict
)
"""Additional args passed to process media inputs, keyed by modalities.
For example, to set num_frames for video, set
`--media-io-kwargs '{"video": {"num_frames": 40} }'`"""
allowed_local_media_path
:
str
=
""
"""Allowing API requests to read local images or videos from directories
specified by the server file system. This is a security risk. Should only
be enabled in trusted environments."""
allowed_media_domains
:
list
[
str
]
|
None
=
None
"""If set, only media URLs that belong to this domain can be used for
multi-modal inputs. """
@
property
def
trust_remote_code
(
self
)
->
bool
:
return
self
.
model_config
.
trust_remote_code
def
__post_init__
(
self
)
->
None
:
model_config
=
self
.
model_config
# The tokenizer is consistent with the model by default.
if
not
self
.
tokenizer
:
self
.
tokenizer
=
(
ModelConfig
.
model
if
model_config
is
None
else
model_config
.
original_model
)
if
not
self
.
tokenizer_revision
:
self
.
tokenizer_revision
=
(
ModelConfig
.
revision
if
model_config
is
None
else
model_config
.
revision
)
self
.
original_tokenizer
=
self
.
tokenizer
self
.
tokenizer
=
maybe_model_redirect
(
self
.
original_tokenizer
)
self
.
maybe_pull_tokenizer_for_runai
(
self
.
tokenizer
)
# Multimodal GGUF models must use original repo for mm processing
is_multimodal_model
=
(
ModelConfig
.
is_multimodal_model
if
model_config
is
None
else
model_config
.
is_multimodal_model
)
if
is_gguf
(
self
.
tokenizer
)
and
is_multimodal_model
:
raise
ValueError
(
"Loading a multimodal GGUF model needs to use original "
"tokenizer. Please specify the unquantized hf model's "
"repo name or path using the --tokenizer argument."
)
def
maybe_pull_tokenizer_for_runai
(
self
,
tokenizer
:
str
)
->
None
:
"""Pull tokenizer from Object Storage to temporary directory when needed."""
if
not
is_runai_obj_uri
(
tokenizer
):
return
object_storage_tokenizer
=
ObjectStorageModel
(
url
=
tokenizer
)
object_storage_tokenizer
.
pull_files
(
tokenizer
,
ignore_pattern
=
[
"*.pt"
,
"*.safetensors"
,
"*.bin"
,
"*.tensors"
,
"*.pth"
],
)
self
.
tokenizer
=
object_storage_tokenizer
.
dir
vllm/config/speculative.py
View file @
e83b7e37
...
...
@@ -322,11 +322,16 @@ class SpeculativeConfig:
self
.
draft_model_config
=
ModelConfig
(
model
=
self
.
model
,
runner
=
"draft"
,
tokenizer
=
self
.
target_model_config
.
tokenizer
,
tokenizer_mode
=
self
.
target_model_config
.
tokenizer_mode
,
trust_remote_code
=
self
.
target_model_config
.
trust_remote_code
,
allowed_local_media_path
=
self
.
target_model_config
.
allowed_local_media_path
,
allowed_media_domains
=
self
.
target_model_config
.
allowed_media_domains
,
dtype
=
self
.
target_model_config
.
dtype
,
seed
=
self
.
target_model_config
.
seed
,
revision
=
self
.
revision
,
code_revision
=
self
.
code_revision
,
tokenizer_revision
=
self
.
target_model_config
.
tokenizer_revision
,
spec_target_max_model_len
=
self
.
target_model_config
.
max_model_len
,
quantization
=
self
.
quantization
,
enforce_eager
=
self
.
target_model_config
.
enforce_eager
,
...
...
vllm/config/vllm.py
View file @
e83b7e37
...
...
@@ -39,7 +39,6 @@ from .lora import LoRAConfig
from
.model
import
ModelConfig
from
.observability
import
ObservabilityConfig
from
.parallel
import
ParallelConfig
from
.renderer
import
RendererConfig
from
.scheduler
import
SchedulerConfig
from
.speculative
import
SpeculativeConfig
from
.structured_outputs
import
StructuredOutputsConfig
...
...
@@ -182,8 +181,6 @@ class VllmConfig:
# try to download a model
model_config
:
ModelConfig
=
Field
(
default
=
None
)
"""Model configuration."""
renderer_config
:
RendererConfig
=
Field
(
default_factory
=
RendererConfig
)
"""Renderer configuration."""
cache_config
:
CacheConfig
=
Field
(
default_factory
=
CacheConfig
)
"""Cache configuration."""
parallel_config
:
ParallelConfig
=
Field
(
default_factory
=
ParallelConfig
)
...
...
@@ -744,7 +741,7 @@ class VllmConfig:
from
vllm.multimodal
import
MULTIMODAL_REGISTRY
self
.
scheduler_config
.
max_num_encoder_input_tokens
=
(
MULTIMODAL_REGISTRY
.
get_encdec_max_encoder_len
(
self
.
renderer
_config
)
MULTIMODAL_REGISTRY
.
get_encdec_max_encoder_len
(
self
.
model
_config
)
)
logger
.
debug
(
"Encoder-decoder model detected: setting "
...
...
@@ -1189,13 +1186,11 @@ class VllmConfig:
computed_compile_ranges_split_points
)
def
recalculate_max_model_len
(
self
,
original_max_model_len
:
int
|
None
)
->
None
:
# Can only be called during try_verify_and_update_config
self
.
model_config
.
recalculate_max_model_len
(
original_max_model_len
,
tokenizer
=
self
.
renderer_config
.
tokenizer
,
tokenizer_revision
=
self
.
renderer_config
.
tokenizer_revision
,
)
def
recalculate_max_model_len
(
self
,
max_model_len
:
int
):
# Can only be called in try_verify_and_update_config
model_config
=
self
.
model_config
max_model_len
=
model_config
.
get_and_verify_max_len
(
max_model_len
)
self
.
model_config
.
max_model_len
=
max_model_len
def
try_verify_and_update_config
(
self
):
if
self
.
model_config
is
None
:
...
...
@@ -1269,11 +1264,11 @@ class VllmConfig:
return
(
f
"model=
{
self
.
model_config
.
model
!
r
}
, "
f
"speculative_config=
{
self
.
speculative_config
!
r
}
, "
f
"tokenizer=
{
self
.
renderer
_config
.
tokenizer
!
r
}
, "
f
"skip_tokenizer_init=
{
self
.
renderer
_config
.
skip_tokenizer_init
}
, "
f
"tokenizer_mode=
{
self
.
renderer
_config
.
tokenizer_mode
}
, "
f
"tokenizer=
{
self
.
model
_config
.
tokenizer
!
r
}
, "
f
"skip_tokenizer_init=
{
self
.
model
_config
.
skip_tokenizer_init
}
, "
f
"tokenizer_mode=
{
self
.
model
_config
.
tokenizer_mode
}
, "
f
"revision=
{
self
.
model_config
.
revision
}
, "
f
"tokenizer_revision=
{
self
.
renderer
_config
.
tokenizer_revision
}
, "
f
"tokenizer_revision=
{
self
.
model
_config
.
tokenizer_revision
}
, "
f
"trust_remote_code=
{
self
.
model_config
.
trust_remote_code
}
, "
f
"dtype=
{
self
.
model_config
.
dtype
}
, "
f
"max_seq_len=
{
self
.
model_config
.
max_model_len
}
, "
...
...
vllm/engine/arg_utils.py
View file @
e83b7e37
...
...
@@ -71,11 +71,11 @@ from vllm.config.model import (
ModelDType
,
RunnerOption
,
TaskOption
,
TokenizerMode
,
)
from
vllm.config.multimodal
import
MMCacheType
,
MMEncoderTPMode
from
vllm.config.observability
import
DetailedTraceModules
from
vllm.config.parallel
import
DistributedExecutorBackend
,
ExpertPlacementStrategy
from
vllm.config.renderer
import
RendererConfig
,
TokenizerMode
from
vllm.config.scheduler
import
SchedulerPolicy
from
vllm.config.utils
import
get_field
from
vllm.config.vllm
import
OptimizationLevel
...
...
@@ -355,12 +355,17 @@ class EngineArgs:
model
:
str
=
ModelConfig
.
model
served_model_name
:
str
|
list
[
str
]
|
None
=
ModelConfig
.
served_model_name
tokenizer
:
str
|
None
=
ModelConfig
.
tokenizer
hf_config_path
:
str
|
None
=
ModelConfig
.
hf_config_path
runner
:
RunnerOption
=
ModelConfig
.
runner
convert
:
ConvertOption
=
ModelConfig
.
convert
task
:
TaskOption
|
None
=
ModelConfig
.
task
skip_tokenizer_init
:
bool
=
ModelConfig
.
skip_tokenizer_init
enable_prompt_embeds
:
bool
=
ModelConfig
.
enable_prompt_embeds
tokenizer_mode
:
TokenizerMode
|
str
=
ModelConfig
.
tokenizer_mode
trust_remote_code
:
bool
=
ModelConfig
.
trust_remote_code
allowed_local_media_path
:
str
=
ModelConfig
.
allowed_local_media_path
allowed_media_domains
:
list
[
str
]
|
None
=
ModelConfig
.
allowed_media_domains
download_dir
:
str
|
None
=
LoadConfig
.
download_dir
safetensors_load_strategy
:
str
=
LoadConfig
.
safetensors_load_strategy
load_format
:
str
|
LoadFormats
=
LoadConfig
.
load_format
...
...
@@ -444,6 +449,7 @@ class EngineArgs:
code_revision
:
str
|
None
=
ModelConfig
.
code_revision
hf_token
:
bool
|
str
|
None
=
ModelConfig
.
hf_token
hf_overrides
:
HfOverrides
=
get_field
(
ModelConfig
,
"hf_overrides"
)
tokenizer_revision
:
str
|
None
=
ModelConfig
.
tokenizer_revision
quantization
:
QuantizationMethods
|
None
=
ModelConfig
.
quantization
enforce_eager
:
bool
=
ModelConfig
.
enforce_eager
disable_custom_all_reduce
:
bool
=
ParallelConfig
.
disable_custom_all_reduce
...
...
@@ -452,6 +458,9 @@ class EngineArgs:
)
enable_mm_embeds
:
bool
=
MultiModalConfig
.
enable_mm_embeds
interleave_mm_strings
:
bool
=
MultiModalConfig
.
interleave_mm_strings
media_io_kwargs
:
dict
[
str
,
dict
[
str
,
Any
]]
=
get_field
(
MultiModalConfig
,
"media_io_kwargs"
)
mm_processor_kwargs
:
dict
[
str
,
Any
]
|
None
=
MultiModalConfig
.
mm_processor_kwargs
disable_mm_preprocessor_cache
:
bool
=
False
# DEPRECATED
mm_processor_cache_gb
:
float
=
MultiModalConfig
.
mm_processor_cache_gb
...
...
@@ -465,19 +474,9 @@ class EngineArgs:
mm_encoder_attn_backend
:
AttentionBackendEnum
|
str
|
None
=
(
MultiModalConfig
.
mm_encoder_attn_backend
)
io_processor_plugin
:
str
|
None
=
None
skip_mm_profiling
:
bool
=
MultiModalConfig
.
skip_mm_profiling
video_pruning_rate
:
float
=
MultiModalConfig
.
video_pruning_rate
# Renderer fields
tokenizer
:
str
|
None
=
None
tokenizer_mode
:
TokenizerMode
|
str
=
RendererConfig
.
tokenizer_mode
tokenizer_revision
:
str
|
None
=
RendererConfig
.
tokenizer_revision
skip_tokenizer_init
:
bool
=
RendererConfig
.
skip_tokenizer_init
io_processor_plugin
:
str
|
None
=
None
media_io_kwargs
:
dict
[
str
,
dict
[
str
,
Any
]]
=
get_field
(
RendererConfig
,
"media_io_kwargs"
)
allowed_local_media_path
:
str
=
RendererConfig
.
allowed_local_media_path
allowed_media_domains
:
list
[
str
]
|
None
=
RendererConfig
.
allowed_media_domains
# LoRA fields
enable_lora
:
bool
=
False
max_loras
:
int
=
LoRAConfig
.
max_loras
...
...
@@ -628,14 +627,25 @@ class EngineArgs:
model_group
.
add_argument
(
"--runner"
,
**
model_kwargs
[
"runner"
])
model_group
.
add_argument
(
"--convert"
,
**
model_kwargs
[
"convert"
])
model_group
.
add_argument
(
"--task"
,
**
model_kwargs
[
"task"
],
deprecated
=
True
)
model_group
.
add_argument
(
"--tokenizer"
,
**
model_kwargs
[
"tokenizer"
])
model_group
.
add_argument
(
"--tokenizer-mode"
,
**
model_kwargs
[
"tokenizer_mode"
])
model_group
.
add_argument
(
"--trust-remote-code"
,
**
model_kwargs
[
"trust_remote_code"
]
)
model_group
.
add_argument
(
"--dtype"
,
**
model_kwargs
[
"dtype"
])
model_group
.
add_argument
(
"--seed"
,
**
model_kwargs
[
"seed"
])
model_group
.
add_argument
(
"--hf-config-path"
,
**
model_kwargs
[
"hf_config_path"
])
model_group
.
add_argument
(
"--allowed-local-media-path"
,
**
model_kwargs
[
"allowed_local_media_path"
]
)
model_group
.
add_argument
(
"--allowed-media-domains"
,
**
model_kwargs
[
"allowed_media_domains"
]
)
model_group
.
add_argument
(
"--revision"
,
**
model_kwargs
[
"revision"
])
model_group
.
add_argument
(
"--code-revision"
,
**
model_kwargs
[
"code_revision"
])
model_group
.
add_argument
(
"--tokenizer-revision"
,
**
model_kwargs
[
"tokenizer_revision"
]
)
model_group
.
add_argument
(
"--max-model-len"
,
**
model_kwargs
[
"max_model_len"
])
model_group
.
add_argument
(
"--quantization"
,
"-q"
,
**
model_kwargs
[
"quantization"
])
model_group
.
add_argument
(
"--enforce-eager"
,
**
model_kwargs
[
"enforce_eager"
])
...
...
@@ -647,6 +657,9 @@ class EngineArgs:
model_group
.
add_argument
(
"--disable-cascade-attn"
,
**
model_kwargs
[
"disable_cascade_attn"
]
)
model_group
.
add_argument
(
"--skip-tokenizer-init"
,
**
model_kwargs
[
"skip_tokenizer_init"
]
)
model_group
.
add_argument
(
"--enable-prompt-embeds"
,
**
model_kwargs
[
"enable_prompt_embeds"
]
)
...
...
@@ -685,34 +698,8 @@ class EngineArgs:
model_group
.
add_argument
(
"--logits-processors"
,
**
model_kwargs
[
"logits_processors"
]
)
# Renderer arguments
renderer_kwargs
=
get_kwargs
(
RendererConfig
)
renderer_group
=
parser
.
add_argument_group
(
title
=
"RendererConfig"
,
description
=
RendererConfig
.
__doc__
,
)
renderer_group
.
add_argument
(
"--tokenizer"
,
**
renderer_kwargs
[
"tokenizer"
])
renderer_group
.
add_argument
(
"--tokenizer-mode"
,
**
renderer_kwargs
[
"tokenizer_mode"
]
)
renderer_group
.
add_argument
(
"--tokenizer-revision"
,
**
renderer_kwargs
[
"tokenizer_revision"
]
)
renderer_group
.
add_argument
(
"--skip-tokenizer-init"
,
**
renderer_kwargs
[
"skip_tokenizer_init"
]
)
renderer_group
.
add_argument
(
"--media-io-kwargs"
,
**
renderer_kwargs
[
"media_io_kwargs"
]
)
renderer_group
.
add_argument
(
"--allowed-local-media-path"
,
**
renderer_kwargs
[
"allowed_local_media_path"
]
)
renderer_group
.
add_argument
(
"--allowed-media-domains"
,
**
renderer_kwargs
[
"allowed_media_domains"
]
)
renderer_group
.
add_argument
(
"--io-processor-plugin"
,
**
renderer_kwargs
[
"io_processor_plugin"
]
model_group
.
add_argument
(
"--io-processor-plugin"
,
**
model_kwargs
[
"io_processor_plugin"
]
)
# Model loading arguments
...
...
@@ -962,6 +949,9 @@ class EngineArgs:
multimodal_group
.
add_argument
(
"--enable-mm-embeds"
,
**
multimodal_kwargs
[
"enable_mm_embeds"
]
)
multimodal_group
.
add_argument
(
"--media-io-kwargs"
,
**
multimodal_kwargs
[
"media_io_kwargs"
]
)
multimodal_group
.
add_argument
(
"--mm-processor-kwargs"
,
**
multimodal_kwargs
[
"mm_processor_kwargs"
]
)
...
...
@@ -1265,13 +1255,18 @@ class EngineArgs:
runner
=
self
.
runner
,
convert
=
self
.
convert
,
task
=
self
.
task
,
tokenizer
=
self
.
tokenizer
,
tokenizer_mode
=
self
.
tokenizer_mode
,
trust_remote_code
=
self
.
trust_remote_code
,
allowed_local_media_path
=
self
.
allowed_local_media_path
,
allowed_media_domains
=
self
.
allowed_media_domains
,
dtype
=
self
.
dtype
,
seed
=
self
.
seed
,
revision
=
self
.
revision
,
code_revision
=
self
.
code_revision
,
hf_token
=
self
.
hf_token
,
hf_overrides
=
self
.
hf_overrides
,
tokenizer_revision
=
self
.
tokenizer_revision
,
max_model_len
=
self
.
max_model_len
,
quantization
=
self
.
quantization
,
enforce_eager
=
self
.
enforce_eager
,
...
...
@@ -1279,11 +1274,13 @@ class EngineArgs:
logprobs_mode
=
self
.
logprobs_mode
,
disable_sliding_window
=
self
.
disable_sliding_window
,
disable_cascade_attn
=
self
.
disable_cascade_attn
,
skip_tokenizer_init
=
self
.
skip_tokenizer_init
,
enable_prompt_embeds
=
self
.
enable_prompt_embeds
,
served_model_name
=
self
.
served_model_name
,
limit_mm_per_prompt
=
self
.
limit_mm_per_prompt
,
enable_mm_embeds
=
self
.
enable_mm_embeds
,
interleave_mm_strings
=
self
.
interleave_mm_strings
,
media_io_kwargs
=
self
.
media_io_kwargs
,
skip_mm_profiling
=
self
.
skip_mm_profiling
,
config_format
=
self
.
config_format
,
mm_processor_kwargs
=
self
.
mm_processor_kwargs
,
...
...
@@ -1301,6 +1298,7 @@ class EngineArgs:
override_attention_dtype
=
self
.
override_attention_dtype
,
logits_processors
=
self
.
logits_processors
,
video_pruning_rate
=
self
.
video_pruning_rate
,
io_processor_plugin
=
self
.
io_processor_plugin
,
)
def
validate_tensorizer_args
(
self
):
...
...
@@ -1396,25 +1394,9 @@ class EngineArgs:
)
model_config
=
self
.
create_model_config
()
renderer_config
=
RendererConfig
(
model_config
=
model_config
,
tokenizer
=
self
.
tokenizer
or
""
,
tokenizer_mode
=
self
.
tokenizer_mode
,
tokenizer_revision
=
self
.
tokenizer_revision
,
skip_tokenizer_init
=
self
.
skip_tokenizer_init
,
io_processor_plugin
=
self
.
io_processor_plugin
,
media_io_kwargs
=
self
.
media_io_kwargs
,
allowed_local_media_path
=
self
.
allowed_local_media_path
,
allowed_media_domains
=
self
.
allowed_media_domains
,
)
model_config
.
recalculate_max_model_len
(
model_config
.
original_max_model_len
,
tokenizer
=
renderer_config
.
tokenizer
,
tokenizer_revision
=
renderer_config
.
tokenizer_revision
,
)
self
.
model
=
model_config
.
model
self
.
tokenizer
=
model_config
.
tokenizer
self
.
_check_feature_supported
(
model_config
)
self
.
_set_default_chunked_prefill_and_prefix_caching_args
(
model_config
)
self
.
_set_default_max_num_seqs_and_batched_tokens_args
(
...
...
@@ -1786,7 +1768,6 @@ class EngineArgs:
)
config
=
VllmConfig
(
model_config
=
model_config
,
renderer_config
=
renderer_config
,
cache_config
=
cache_config
,
parallel_config
=
parallel_config
,
scheduler_config
=
scheduler_config
,
...
...
vllm/engine/protocol.py
View file @
e83b7e37
...
...
@@ -5,7 +5,7 @@ from abc import ABC, abstractmethod
from
collections.abc
import
AsyncGenerator
,
Iterable
,
Mapping
from
typing
import
Any
from
vllm.config
import
ModelConfig
,
RendererConfig
,
VllmConfig
from
vllm.config
import
ModelConfig
,
VllmConfig
from
vllm.inputs.data
import
PromptType
from
vllm.lora.request
import
LoRARequest
from
vllm.outputs
import
PoolingRequestOutput
,
RequestOutput
...
...
@@ -22,7 +22,6 @@ class EngineClient(ABC):
"""Protocol class for Clients to Engine"""
vllm_config
:
VllmConfig
renderer_config
:
RendererConfig
model_config
:
ModelConfig
input_processor
:
InputProcessor
io_processor
:
IOProcessor
|
None
...
...
vllm/entrypoints/chat_utils.py
View file @
e83b7e37
...
...
@@ -44,7 +44,7 @@ from transformers import PreTrainedTokenizer, PreTrainedTokenizerFast, Processor
from
typing_extensions
import
Required
,
TypedDict
from
vllm
import
envs
from
vllm.config
import
ModelConfig
,
RendererConfig
from
vllm.config
import
ModelConfig
from
vllm.logger
import
init_logger
from
vllm.model_executor.models
import
SupportsMultiModal
from
vllm.multimodal
import
MULTIMODAL_REGISTRY
,
MultiModalDataDict
,
MultiModalUUIDDict
...
...
@@ -452,10 +452,9 @@ This is needed because `lru_cache` does not cache when an exception happens.
def
_try_get_processor_chat_template
(
tokenizer
:
PreTrainedTokenizer
|
PreTrainedTokenizerFast
,
*
,
trust_remote_code
:
bool
,
model_config
:
ModelConfig
,
)
->
str
|
None
:
cache_key
=
(
tokenizer
.
name_or_path
,
trust_remote_code
)
cache_key
=
(
tokenizer
.
name_or_path
,
model_config
.
trust_remote_code
)
if
cache_key
in
_PROCESSOR_CHAT_TEMPLATES
:
return
_PROCESSOR_CHAT_TEMPLATES
[
cache_key
]
...
...
@@ -467,7 +466,7 @@ def _try_get_processor_chat_template(
PreTrainedTokenizerFast
,
ProcessorMixin
,
),
trust_remote_code
=
trust_remote_code
,
trust_remote_code
=
model_config
.
trust_remote_code
,
)
if
(
isinstance
(
processor
,
ProcessorMixin
)
...
...
@@ -500,10 +499,7 @@ def resolve_hf_chat_template(
# 2nd priority: AutoProcessor chat template, unless tool calling is enabled
if
tools
is
None
:
chat_template
=
_try_get_processor_chat_template
(
tokenizer
,
trust_remote_code
=
model_config
.
trust_remote_code
,
)
chat_template
=
_try_get_processor_chat_template
(
tokenizer
,
model_config
)
if
chat_template
is
not
None
:
return
chat_template
...
...
@@ -517,10 +513,10 @@ def resolve_hf_chat_template(
exc_info
=
True
,
)
# 4th priority: Predefined fallbacks
]
# 4th priority: Predefined fallbacks
path
=
get_chat_template_fallback_path
(
model_type
=
model_config
.
hf_config
.
model_type
,
tokenizer_name_or_path
=
tokenizer
.
name_or_path
,
tokenizer_name_or_path
=
model_config
.
tokenizer
,
)
if
path
is
not
None
:
logger
.
info_once
(
...
...
@@ -542,14 +538,14 @@ def _resolve_chat_template_content_format(
tools
:
list
[
dict
[
str
,
Any
]]
|
None
,
tokenizer
:
TokenizerLike
|
None
,
*
,
renderer
_config
:
Renderer
Config
,
model
_config
:
Model
Config
,
)
->
_ChatTemplateContentFormat
:
if
isinstance
(
tokenizer
,
(
PreTrainedTokenizer
,
PreTrainedTokenizerFast
)):
hf_chat_template
=
resolve_hf_chat_template
(
tokenizer
,
chat_template
=
chat_template
,
tools
=
tools
,
model_config
=
renderer_config
.
model_config
,
model_config
=
model_config
,
)
else
:
hf_chat_template
=
None
...
...
@@ -599,7 +595,7 @@ def resolve_chat_template_content_format(
given_format
:
ChatTemplateContentFormatOption
,
tokenizer
:
TokenizerLike
|
None
,
*
,
renderer
_config
:
Renderer
Config
,
model
_config
:
Model
Config
,
)
->
_ChatTemplateContentFormat
:
if
given_format
!=
"auto"
:
return
given_format
...
...
@@ -608,7 +604,7 @@ def resolve_chat_template_content_format(
chat_template
,
tools
,
tokenizer
,
renderer_config
=
renderer
_config
,
model_config
=
model
_config
,
)
_log_chat_template_content_format
(
...
...
@@ -631,32 +627,32 @@ class BaseMultiModalItemTracker(ABC, Generic[_T]):
maximum per prompt.
"""
def
__init__
(
self
,
renderer
_config
:
Renderer
Config
):
def
__init__
(
self
,
model
_config
:
Model
Config
):
super
().
__init__
()
self
.
_
renderer
_config
=
renderer
_config
self
.
_
model
_config
=
model
_config
self
.
_items_by_modality
=
defaultdict
[
str
,
list
[
_T
|
None
]](
list
)
self
.
_uuids_by_modality
=
defaultdict
[
str
,
list
[
str
|
None
]](
list
)
@
property
def
renderer
_config
(
self
)
->
Renderer
Config
:
return
self
.
_
renderer
_config
def
model
_config
(
self
)
->
Model
Config
:
return
self
.
_
model
_config
@
cached_property
def
model_cls
(
self
)
->
type
[
SupportsMultiModal
]:
from
vllm.model_executor.model_loader
import
get_model_cls
model_cls
=
get_model_cls
(
self
.
renderer_config
.
model_config
)
model_cls
=
get_model_cls
(
self
.
model_config
)
return
cast
(
type
[
SupportsMultiModal
],
model_cls
)
@
property
def
allowed_local_media_path
(
self
):
return
self
.
_
renderer
_config
.
allowed_local_media_path
return
self
.
_
model
_config
.
allowed_local_media_path
@
property
def
allowed_media_domains
(
self
):
return
self
.
_
renderer
_config
.
allowed_media_domains
return
self
.
_
model
_config
.
allowed_media_domains
@
property
def
mm_registry
(
self
):
...
...
@@ -664,7 +660,7 @@ class BaseMultiModalItemTracker(ABC, Generic[_T]):
@
cached_property
def
mm_processor
(
self
):
return
self
.
mm_registry
.
create_processor
(
self
.
renderer
_config
)
return
self
.
mm_registry
.
create_processor
(
self
.
model
_config
)
def
add
(
self
,
...
...
@@ -855,20 +851,19 @@ class MultiModalContentParser(BaseMultiModalContentParser):
super
().
__init__
()
self
.
_tracker
=
tracker
multimodal_config
=
self
.
_tracker
.
model_config
.
multimodal_config
media_io_kwargs
=
getattr
(
multimodal_config
,
"media_io_kwargs"
,
None
)
self
.
_connector
:
MediaConnector
=
MEDIA_CONNECTOR_REGISTRY
.
load
(
envs
.
VLLM_MEDIA_CONNECTOR
,
media_io_kwargs
=
self
.
renderer_config
.
media_io_kwargs
,
media_io_kwargs
=
media_io_kwargs
,
allowed_local_media_path
=
tracker
.
allowed_local_media_path
,
allowed_media_domains
=
tracker
.
allowed_media_domains
,
)
@
property
def
renderer_config
(
self
)
->
RendererConfig
:
return
self
.
_tracker
.
renderer_config
@
property
def
model_config
(
self
)
->
ModelConfig
:
return
self
.
renderer_config
.
model_config
return
self
.
_tracker
.
model_config
def
parse_image
(
self
,
image_url
:
str
|
None
,
uuid
:
str
|
None
=
None
)
->
None
:
image
=
self
.
_connector
.
fetch_image
(
image_url
)
if
image_url
else
None
...
...
@@ -968,20 +963,18 @@ class AsyncMultiModalContentParser(BaseMultiModalContentParser):
super
().
__init__
()
self
.
_tracker
=
tracker
multimodal_config
=
self
.
_tracker
.
model_config
.
multimodal_config
media_io_kwargs
=
getattr
(
multimodal_config
,
"media_io_kwargs"
,
None
)
self
.
_connector
:
MediaConnector
=
MEDIA_CONNECTOR_REGISTRY
.
load
(
envs
.
VLLM_MEDIA_CONNECTOR
,
media_io_kwargs
=
self
.
renderer_config
.
media_io_kwargs
,
media_io_kwargs
=
media_io_kwargs
,
allowed_local_media_path
=
tracker
.
allowed_local_media_path
,
allowed_media_domains
=
tracker
.
allowed_media_domains
,
)
@
property
def
renderer_config
(
self
)
->
RendererConfig
:
return
self
.
_tracker
.
renderer_config
@
property
def
model_config
(
self
)
->
ModelConfig
:
return
self
.
renderer_config
.
model_config
return
self
.
_tracker
.
model_config
def
parse_image
(
self
,
image_url
:
str
|
None
,
uuid
:
str
|
None
=
None
)
->
None
:
image_coro
=
self
.
_connector
.
fetch_image_async
(
image_url
)
if
image_url
else
None
...
...
@@ -1611,17 +1604,15 @@ def _postprocess_messages(messages: list[ConversationMessage]) -> None:
def
parse_chat_messages
(
messages
:
list
[
ChatCompletionMessageParam
],
renderer
_config
:
Renderer
Config
,
model
_config
:
Model
Config
,
content_format
:
_ChatTemplateContentFormat
,
)
->
tuple
[
list
[
ConversationMessage
],
MultiModalDataDict
|
None
,
MultiModalUUIDDict
|
None
,
]:
model_config
=
renderer_config
.
model_config
conversation
:
list
[
ConversationMessage
]
=
[]
mm_tracker
=
MultiModalItemTracker
(
renderer
_config
)
mm_tracker
=
MultiModalItemTracker
(
model
_config
)
for
msg
in
messages
:
sub_messages
=
_parse_chat_message_content
(
...
...
@@ -1644,17 +1635,15 @@ def parse_chat_messages(
def
parse_chat_messages_futures
(
messages
:
list
[
ChatCompletionMessageParam
],
renderer
_config
:
Renderer
Config
,
model
_config
:
Model
Config
,
content_format
:
_ChatTemplateContentFormat
,
)
->
tuple
[
list
[
ConversationMessage
],
Awaitable
[
MultiModalDataDict
|
None
],
MultiModalUUIDDict
|
None
,
]:
model_config
=
renderer_config
.
model_config
conversation
:
list
[
ConversationMessage
]
=
[]
mm_tracker
=
AsyncMultiModalItemTracker
(
renderer
_config
)
mm_tracker
=
AsyncMultiModalItemTracker
(
model
_config
)
for
msg
in
messages
:
sub_messages
=
_parse_chat_message_content
(
...
...
@@ -1759,14 +1748,14 @@ def apply_hf_chat_template(
chat_template
:
str
|
None
,
tools
:
list
[
dict
[
str
,
Any
]]
|
None
,
*
,
renderer
_config
:
Renderer
Config
,
model
_config
:
Model
Config
,
**
kwargs
:
Any
,
)
->
str
:
hf_chat_template
=
resolve_hf_chat_template
(
tokenizer
,
chat_template
=
chat_template
,
tools
=
tools
,
model_config
=
renderer_config
.
model_config
,
model_config
=
model_config
,
)
if
hf_chat_template
is
None
:
...
...
vllm/entrypoints/llm.py
View file @
e83b7e37
...
...
@@ -29,8 +29,8 @@ from vllm.config.model import (
HfOverrides
,
ModelDType
,
RunnerOption
,
TokenizerMode
,
)
from
vllm.config.renderer
import
TokenizerMode
from
vllm.engine.arg_utils
import
EngineArgs
from
vllm.entrypoints.chat_utils
import
(
ChatCompletionMessageParam
,
...
...
@@ -343,7 +343,6 @@ class LLM:
logger
.
info
(
"Supported tasks: %s"
,
supported_tasks
)
self
.
supported_tasks
=
supported_tasks
self
.
renderer_config
=
self
.
llm_engine
.
renderer_config
self
.
model_config
=
self
.
llm_engine
.
model_config
self
.
input_processor
=
self
.
llm_engine
.
input_processor
self
.
io_processor
=
self
.
llm_engine
.
io_processor
...
...
@@ -809,13 +808,13 @@ class LLM:
list_of_messages
=
[
cast
(
list
[
ChatCompletionMessageParam
],
messages
)]
tokenizer
=
self
.
get_tokenizer
()
renderer
_config
=
self
.
renderer
_config
model
_config
=
self
.
model
_config
resolved_content_format
=
resolve_chat_template_content_format
(
chat_template
,
tools
,
chat_template_content_format
,
tokenizer
,
renderer_config
=
renderer
_config
,
model_config
=
model
_config
,
)
_chat_template_kwargs
:
dict
[
str
,
Any
]
=
dict
(
...
...
@@ -834,7 +833,7 @@ class LLM:
# the chat message parsing for it.
conversation
,
mm_data
,
mm_uuids
=
parse_chat_messages
(
msgs
,
renderer
_config
,
model
_config
,
content_format
=
resolved_content_format
,
)
...
...
@@ -848,7 +847,7 @@ class LLM:
prompt_str
=
apply_hf_chat_template
(
tokenizer
=
tokenizer
,
conversation
=
conversation
,
renderer_config
=
renderer
_config
,
model_config
=
model
_config
,
**
_chat_template_kwargs
,
)
# Special tokens are already included in chat templates so
...
...
@@ -1291,7 +1290,6 @@ class LLM:
lora_request
:
list
[
LoRARequest
]
|
LoRARequest
|
None
=
None
,
tokenization_kwargs
:
dict
[
str
,
Any
]
|
None
=
None
,
)
->
list
[
ScoringRequestOutput
]:
renderer_config
=
self
.
renderer_config
model_config
=
self
.
model_config
if
isinstance
(
tokenizer
,
MistralTokenizer
):
...
...
@@ -1319,7 +1317,7 @@ class LLM:
for
q
,
d
in
input_pairs
:
_
,
engine_prompt
=
get_score_prompt
(
renderer_config
=
renderer
_config
,
model_config
=
model
_config
,
data_1
=
q
,
data_2
=
d
,
tokenizer
=
tokenizer
,
...
...
vllm/entrypoints/openai/api_server.py
View file @
e83b7e37
...
...
@@ -1099,7 +1099,7 @@ async def init_app_state(
logger
.
info
(
"Supported tasks: %s"
,
supported_tasks
)
resolved_chat_template
=
await
process_chat_template
(
args
.
chat_template
,
engine_client
,
vllm_config
.
renderer
_config
args
.
chat_template
,
engine_client
,
vllm_config
.
model
_config
)
if
args
.
tool_server
==
"demo"
:
...
...
vllm/entrypoints/openai/serving_completion.py
View file @
e83b7e37
...
...
@@ -122,7 +122,7 @@ class OpenAIServingCompletion(OpenAIServing):
try
:
lora_request
=
self
.
_maybe_get_adapters
(
request
)
if
self
.
renderer
_config
.
skip_tokenizer_init
:
if
self
.
model
_config
.
skip_tokenizer_init
:
tokenizer
=
None
else
:
tokenizer
=
await
self
.
engine_client
.
get_tokenizer
()
...
...
vllm/entrypoints/openai/serving_engine.py
View file @
e83b7e37
...
...
@@ -291,7 +291,6 @@ class OpenAIServing:
self
.
input_processor
=
self
.
models
.
input_processor
self
.
io_processor
=
self
.
models
.
io_processor
self
.
renderer_config
=
self
.
models
.
renderer_config
self
.
model_config
=
self
.
models
.
model_config
self
.
max_model_len
=
self
.
model_config
.
max_model_len
...
...
@@ -1101,18 +1100,18 @@ class OpenAIServing:
Sequence
[
RequestPrompt
],
list
[
EngineTokensPrompt
],
]:
renderer
_config
=
self
.
renderer
_config
model
_config
=
self
.
model
_config
resolved_content_format
=
resolve_chat_template_content_format
(
chat_template
,
tool_dicts
,
chat_template_content_format
,
tokenizer
,
renderer_config
=
renderer
_config
,
model_config
=
model
_config
,
)
conversation
,
mm_data_future
,
mm_uuids
=
parse_chat_messages_futures
(
messages
,
renderer
_config
,
model
_config
,
content_format
=
resolved_content_format
,
)
...
...
@@ -1139,14 +1138,14 @@ class OpenAIServing:
request_prompt
=
tokenizer
.
apply_chat_template
(
conversation
=
conversation
,
messages
=
messages
,
model_config
=
renderer_config
.
model_config
,
model_config
=
model_config
,
**
_chat_template_kwargs
,
)
else
:
request_prompt
=
apply_hf_chat_template
(
tokenizer
=
tokenizer
,
conversation
=
conversation
,
renderer_config
=
renderer
_config
,
model_config
=
model
_config
,
**
_chat_template_kwargs
,
)
...
...
vllm/entrypoints/openai/serving_models.py
View file @
e83b7e37
...
...
@@ -71,7 +71,6 @@ class OpenAIServingModels:
self
.
input_processor
=
self
.
engine_client
.
input_processor
self
.
io_processor
=
self
.
engine_client
.
io_processor
self
.
renderer_config
=
self
.
engine_client
.
renderer_config
self
.
model_config
=
self
.
engine_client
.
model_config
self
.
max_model_len
=
self
.
model_config
.
max_model_len
...
...
vllm/entrypoints/openai/speech_to_text.py
View file @
e83b7e37
...
...
@@ -91,7 +91,7 @@ class OpenAISpeechToText(OpenAIServing):
self
.
task_type
=
task_type
self
.
asr_config
=
self
.
model_cls
.
get_speech_to_text_config
(
self
.
renderer
_config
,
task_type
self
.
model
_config
,
task_type
)
self
.
enable_force_include_usage
=
enable_force_include_usage
...
...
@@ -101,8 +101,8 @@ class OpenAISpeechToText(OpenAIServing):
self
.
tokenizer
=
cast
(
PreTrainedTokenizerBase
,
get_tokenizer
(
tokenizer_name
=
self
.
renderer
_config
.
tokenizer
,
tokenizer_mode
=
self
.
renderer
_config
.
tokenizer_mode
,
tokenizer_name
=
self
.
model
_config
.
tokenizer
,
tokenizer_mode
=
self
.
model
_config
.
tokenizer_mode
,
),
)
...
...
@@ -154,7 +154,7 @@ class OpenAISpeechToText(OpenAIServing):
prompt
=
self
.
model_cls
.
get_generation_prompt
(
audio
=
chunk
,
stt_config
=
self
.
asr_config
,
renderer
_config
=
self
.
renderer
_config
,
model
_config
=
self
.
model
_config
,
language
=
language
,
task_type
=
self
.
task_type
,
request_prompt
=
request
.
prompt
,
...
...
@@ -428,7 +428,7 @@ class OpenAISpeechToText(OpenAIServing):
if
res
.
prompt_token_ids
is
not
None
:
num_prompt_tokens
=
len
(
res
.
prompt_token_ids
)
if
audio_tokens
:
=
self
.
model_cls
.
get_num_audio_tokens
(
audio_duration_s
,
self
.
asr_config
,
self
.
renderer
_config
audio_duration_s
,
self
.
asr_config
,
self
.
model
_config
):
num_prompt_tokens
+=
audio_tokens
...
...
vllm/entrypoints/pooling/pooling/serving.py
View file @
e83b7e37
...
...
@@ -94,7 +94,7 @@ class OpenAIServingPooling(OpenAIServing):
try
:
lora_request
=
self
.
_maybe_get_adapters
(
request
)
if
self
.
renderer
_config
.
skip_tokenizer_init
:
if
self
.
model
_config
.
skip_tokenizer_init
:
tokenizer
=
None
else
:
tokenizer
=
await
self
.
engine_client
.
get_tokenizer
()
...
...
vllm/entrypoints/pooling/score/serving.py
View file @
e83b7e37
...
...
@@ -160,8 +160,10 @@ class ServingScores(OpenAIServing):
data_1
:
str
|
ScoreContentPartParam
,
data_2
:
str
|
ScoreContentPartParam
,
)
->
tuple
[
str
,
TokensPrompt
]:
model_config
=
self
.
model_config
full_prompt
,
engine_prompt
=
get_score_prompt
(
renderer_config
=
self
.
renderer
_config
,
model_config
=
model
_config
,
data_1
=
data_1
,
data_2
=
data_2
,
tokenizer
=
tokenizer
,
...
...
vllm/entrypoints/score_utils.py
View file @
e83b7e37
...
...
@@ -5,7 +5,7 @@ from typing import Any, TypeAlias, cast
from
torch.nn
import
CosineSimilarity
from
typing_extensions
import
Required
,
TypedDict
from
vllm.config
import
ModelConfig
,
RendererConfig
from
vllm.config
import
ModelConfig
from
vllm.entrypoints.chat_utils
import
(
BaseMultiModalItemTracker
,
ChatCompletionContentPartImageEmbedsParam
,
...
...
@@ -88,9 +88,9 @@ def _validate_score_input_lens(
def
parse_score_data
(
data_1
:
str
|
ScoreContentPartParam
,
data_2
:
str
|
ScoreContentPartParam
,
renderer
_config
:
Renderer
Config
,
model
_config
:
Model
Config
,
)
->
tuple
[
str
,
str
,
MultiModalDataDict
|
None
]:
mm_tracker
=
MultiModalItemTracker
(
renderer
_config
)
mm_tracker
=
MultiModalItemTracker
(
model
_config
)
content_1
=
_parse_score_content
(
data_1
,
mm_tracker
)
content_2
=
_parse_score_content
(
data_2
,
mm_tracker
)
...
...
@@ -176,7 +176,7 @@ def post_process_tokens(
def
get_score_prompt
(
renderer
_config
:
Renderer
Config
,
model
_config
:
Model
Config
,
tokenizer
:
TokenizerLike
,
tokenization_kwargs
:
dict
[
str
,
Any
],
data_1
:
str
|
ScoreContentPartParam
,
...
...
@@ -185,14 +185,11 @@ def get_score_prompt(
prompt_1
,
prompt_2
,
mm_data
=
parse_score_data
(
data_1
,
data_2
,
renderer
_config
,
model
_config
,
)
from
vllm.model_executor.model_loader
import
get_model_cls
model_config
=
renderer_config
.
model_config
model
=
get_model_cls
(
model_config
)
if
supports_score_template
(
model
):
full_prompt
=
apply_score_template
(
model_config
,
prompt_1
,
prompt_2
)
prompt_inputs
=
tokenizer
(
full_prompt
,
**
tokenization_kwargs
)
...
...
vllm/entrypoints/utils.py
View file @
e83b7e37
...
...
@@ -13,7 +13,7 @@ from fastapi import Request
from
fastapi.responses
import
JSONResponse
,
StreamingResponse
from
starlette.background
import
BackgroundTask
,
BackgroundTasks
from
vllm.config
import
Renderer
Config
from
vllm.config
import
Model
Config
from
vllm.engine.arg_utils
import
EngineArgs
from
vllm.engine.protocol
import
EngineClient
from
vllm.entrypoints.chat_utils
import
(
...
...
@@ -288,7 +288,7 @@ def process_lora_modules(
async
def
process_chat_template
(
args_chat_template
:
Path
|
str
|
None
,
engine_client
:
EngineClient
,
renderer
_config
:
Renderer
Config
,
model
_config
:
Model
Config
,
)
->
str
|
None
:
resolved_chat_template
=
load_chat_template
(
args_chat_template
)
if
resolved_chat_template
is
not
None
:
...
...
@@ -305,7 +305,7 @@ async def process_chat_template(
tokenizer
=
tokenizer
,
chat_template
=
None
,
tools
=
None
,
model_config
=
renderer_config
.
model_config
,
model_config
=
model_config
,
)
if
hf_chat_template
!=
resolved_chat_template
:
...
...
@@ -314,6 +314,6 @@ async def process_chat_template(
"It is different from official chat template '%s'. "
"This discrepancy may lead to performance degradation."
,
resolved_chat_template
,
renderer_config
.
model_config
.
model
,
model_config
.
model
,
)
return
resolved_chat_template
vllm/inputs/preprocess.py
View file @
e83b7e37
...
...
@@ -6,7 +6,7 @@ from typing import Any, cast
from
typing_extensions
import
assert_never
from
vllm.config
import
Renderer
Config
from
vllm.config
import
Model
Config
from
vllm.logger
import
init_logger
from
vllm.multimodal
import
MULTIMODAL_REGISTRY
,
MultiModalRegistry
from
vllm.multimodal.cache
import
BaseMultiModalProcessorCache
...
...
@@ -45,15 +45,14 @@ logger = init_logger(__name__)
class
InputPreprocessor
:
def
__init__
(
self
,
renderer
_config
:
Renderer
Config
,
model
_config
:
Model
Config
,
tokenizer
:
TokenizerLike
|
None
,
mm_registry
:
MultiModalRegistry
=
MULTIMODAL_REGISTRY
,
mm_processor_cache
:
BaseMultiModalProcessorCache
|
None
=
None
,
)
->
None
:
super
().
__init__
()
self
.
renderer_config
=
renderer_config
self
.
model_config
=
renderer_config
.
model_config
self
.
model_config
=
model_config
self
.
tokenizer
=
tokenizer
self
.
mm_registry
=
mm_registry
self
.
mm_processor_cache
=
mm_processor_cache
...
...
@@ -232,7 +231,7 @@ class InputPreprocessor:
def
_get_mm_processor
(
self
)
->
BaseMultiModalProcessor
:
if
not
hasattr
(
self
,
"_mm_processor"
):
self
.
_mm_processor
=
self
.
mm_registry
.
create_processor
(
self
.
renderer
_config
,
self
.
model
_config
,
tokenizer
=
self
.
tokenizer
,
cache
=
self
.
mm_processor_cache
,
)
...
...
vllm/model_executor/models/adapters.py
View file @
e83b7e37
...
...
@@ -415,7 +415,7 @@ def load_weights_using_from_2_way_softmax(
from
vllm.model_executor.layers.vocab_parallel_embedding
import
ParallelLMHead
from
vllm.model_executor.model_loader.weight_utils
import
default_weight_loader
renderer
_config
=
model
.
vllm_config
.
renderer
_config
model
_config
=
model
.
vllm_config
.
model
_config
quant_config
=
model
.
vllm_config
.
quant_config
text_config
=
model
.
config
.
get_text_config
()
...
...
@@ -447,10 +447,10 @@ def load_weights_using_from_2_way_softmax(
from
vllm.tokenizers
import
get_tokenizer
tokenizer
=
get_tokenizer
(
renderer
_config
.
tokenizer
,
revision
=
renderer
_config
.
tokenizer_revision
,
tokenizer_mode
=
renderer
_config
.
tokenizer_mode
,
trust_remote_code
=
renderer
_config
.
trust_remote_code
,
model
_config
.
tokenizer
,
revision
=
model
_config
.
tokenizer_revision
,
tokenizer_mode
=
model
_config
.
tokenizer_mode
,
trust_remote_code
=
model
_config
.
trust_remote_code
,
)
false_id
=
tokenizer
.
convert_tokens_to_ids
(
tokens
[
0
])
...
...
@@ -473,7 +473,7 @@ def load_weights_no_post_processing(model, weights: Iterable[tuple[str, torch.Te
from
vllm.model_executor.layers.vocab_parallel_embedding
import
ParallelLMHead
from
vllm.model_executor.model_loader.weight_utils
import
default_weight_loader
renderer
_config
=
model
.
vllm_config
.
renderer
_config
model
_config
=
model
.
vllm_config
.
model
_config
quant_config
=
model
.
vllm_config
.
quant_config
text_config
=
model
.
config
.
get_text_config
()
...
...
@@ -501,10 +501,10 @@ def load_weights_no_post_processing(model, weights: Iterable[tuple[str, torch.Te
from
vllm.tokenizers
import
get_tokenizer
tokenizer
=
get_tokenizer
(
renderer
_config
.
tokenizer
,
revision
=
renderer
_config
.
tokenizer_revision
,
tokenizer_mode
=
renderer
_config
.
tokenizer_mode
,
trust_remote_code
=
renderer
_config
.
trust_remote_code
,
model
_config
.
tokenizer
,
revision
=
model
_config
.
tokenizer_revision
,
tokenizer_mode
=
model
_config
.
tokenizer_mode
,
trust_remote_code
=
model
_config
.
trust_remote_code
,
)
token_ids
=
[
tokenizer
.
convert_tokens_to_ids
(
t
)
for
t
in
tokens
]
...
...
vllm/model_executor/models/deepseek_ocr.py
View file @
e83b7e37
...
...
@@ -377,8 +377,8 @@ class DeepseekOCRForCausalLM(nn.Module, SupportsMultiModal, SupportsPP):
self
.
projector_config
=
config
.
projector_config
self
.
text_config
=
config
.
text_config
renderer
_config
=
vllm_config
.
renderer
_config
tokenizer
=
cached_tokenizer_from_config
(
renderer
_config
)
model
_config
=
vllm_config
.
model
_config
tokenizer
=
cached_tokenizer_from_config
(
model
_config
)
self
.
image_token_id
=
tokenizer
.
vocab
[
_IMAGE_TOKEN
]
self
.
sam_model
=
build_sam_vit_b
()
...
...
vllm/model_executor/models/deepseek_vl2.py
View file @
e83b7e37
...
...
@@ -370,8 +370,8 @@ class DeepseekVLV2ForCausalLM(nn.Module, SupportsMultiModal, SupportsPP):
self
.
projector_config
=
config
.
projector_config
self
.
text_config
=
config
.
text_config
renderer
_config
=
vllm_config
.
renderer
_config
tokenizer
=
cached_tokenizer_from_config
(
renderer
_config
)
model
_config
=
vllm_config
.
model
_config
tokenizer
=
cached_tokenizer_from_config
(
model
_config
)
self
.
image_token_id
:
int
=
tokenizer
.
vocab
[
_IMAGE_TOKEN
]
self
.
vision
=
self
.
_init_vision_module
(
...
...
Prev
1
2
3
4
5
6
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment