Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
ebb3930d
Unverified
Commit
ebb3930d
authored
Apr 29, 2025
by
Cyrus Leung
Committed by
GitHub
Apr 29, 2025
Browse files
[Misc] Move config fields to MultiModalConfig (#17343)
Signed-off-by:
DarkLight1337
<
tlleungac@connect.ust.hk
>
parent
cde384cd
Changes
8
Show whitespace changes
Inline
Side-by-side
Showing
8 changed files
with
62 additions
and
36 deletions
+62
-36
vllm/config.py
vllm/config.py
+42
-15
vllm/engine/arg_utils.py
vllm/engine/arg_utils.py
+4
-12
vllm/inputs/registry.py
vllm/inputs/registry.py
+4
-2
vllm/model_executor/models/qwen2_vl.py
vllm/model_executor/models/qwen2_vl.py
+3
-2
vllm/multimodal/registry.py
vllm/multimodal/registry.py
+2
-1
vllm/transformers_utils/processor.py
vllm/transformers_utils/processor.py
+2
-1
vllm/v1/engine/mm_input_cache.py
vllm/v1/engine/mm_input_cache.py
+4
-1
vllm/v1/engine/processor.py
vllm/v1/engine/processor.py
+1
-2
No files found.
vllm/config.py
View file @
ebb3930d
...
...
@@ -263,6 +263,10 @@ class ModelConfig:
the model name will be the same as `model`.
limit_mm_per_prompt: Maximum number of data items per modality
per prompt. Only applicable for multimodal models.
mm_processor_kwargs: Overrides for the multi-modal processor obtained
from `AutoProcessor.from_pretrained`.
disable_mm_preprocessor_cache: If True, disable caching of the
processed multi-modal inputs.
use_async_output_proc: Whether to use async output processor.
Defaults to True.
config_format: The config format which shall be loaded.
...
...
@@ -273,10 +277,6 @@ class ModelConfig:
hf_overrides: If a dictionary, contains arguments to be forwarded to the
HuggingFace config. If a callable, it is called to update the
HuggingFace config.
mm_processor_kwargs: Arguments to be forwarded to the model's processor
for multi-modal data, e.g., image processor.
disable_mm_preprocessor_cache: If true, then disables caching of the
multi-modal preprocessor/mapper. (not recommended)
override_neuron_config: Initialize non default neuron config or
override default neuron config that are specific to Neuron devices,
this argument will be used to configure the neuron config that
...
...
@@ -320,7 +320,6 @@ class ModelConfig:
factors
.
append
(
self
.
max_logprobs
)
factors
.
append
(
self
.
disable_sliding_window
)
factors
.
append
(
self
.
trust_remote_code
)
factors
.
append
(
self
.
mm_processor_kwargs
)
factors
.
append
(
self
.
generation_config
)
factors
.
append
(
self
.
model_impl
)
factors
.
append
(
self
.
override_generation_config
)
...
...
@@ -359,12 +358,12 @@ class ModelConfig:
skip_tokenizer_init
:
bool
=
False
,
served_model_name
:
Optional
[
Union
[
str
,
list
[
str
]]]
=
None
,
limit_mm_per_prompt
:
Optional
[
dict
[
str
,
int
]]
=
None
,
mm_processor_kwargs
:
Optional
[
dict
[
str
,
Any
]]
=
None
,
disable_mm_preprocessor_cache
:
bool
=
False
,
use_async_output_proc
:
bool
=
True
,
config_format
:
ConfigFormat
=
ConfigFormat
.
AUTO
,
hf_token
:
Optional
[
Union
[
bool
,
str
]]
=
None
,
hf_overrides
:
Optional
[
HfOverrides
]
=
None
,
mm_processor_kwargs
:
Optional
[
dict
[
str
,
Any
]]
=
None
,
disable_mm_preprocessor_cache
:
bool
=
False
,
override_neuron_config
:
Optional
[
dict
[
str
,
Any
]]
=
None
,
override_pooler_config
:
Optional
[
"PoolerConfig"
]
=
None
,
logits_processor_pattern
:
Optional
[
str
]
=
None
,
...
...
@@ -469,8 +468,6 @@ class ModelConfig:
self
.
model
,
hf_token
=
hf_token
,
revision
=
revision
)
self
.
dtype
=
_get_and_verify_dtype
(
self
.
hf_config
,
dtype
)
self
.
use_async_output_proc
=
use_async_output_proc
self
.
mm_processor_kwargs
=
mm_processor_kwargs
self
.
disable_mm_preprocessor_cache
=
disable_mm_preprocessor_cache
# Set enforce_eager to False if the value is unset.
if
self
.
enforce_eager
is
None
:
...
...
@@ -515,7 +512,10 @@ class ModelConfig:
self
.
served_model_name
=
get_served_model_name
(
model
,
served_model_name
)
self
.
multimodal_config
=
self
.
_init_multimodal_config
(
limit_mm_per_prompt
)
limit_mm_per_prompt
=
limit_mm_per_prompt
,
mm_processor_kwargs
=
mm_processor_kwargs
,
disable_mm_preprocessor_cache
=
disable_mm_preprocessor_cache
,
)
if
not
self
.
skip_tokenizer_init
:
self
.
_verify_tokenizer_mode
()
...
...
@@ -581,14 +581,27 @@ class ModelConfig:
self
.
tokenizer
=
s3_tokenizer
.
dir
def
_init_multimodal_config
(
self
,
limit_mm_per_prompt
:
Optional
[
dict
[
str
,
int
]]
self
,
limit_mm_per_prompt
:
Optional
[
dict
[
str
,
int
]],
mm_processor_kwargs
:
Optional
[
dict
[
str
,
Any
]],
disable_mm_preprocessor_cache
:
bool
,
)
->
Optional
[
"MultiModalConfig"
]:
if
self
.
registry
.
is_multimodal_model
(
self
.
architectures
):
return
MultiModalConfig
(
limit_per_prompt
=
limit_mm_per_prompt
or
{})
return
MultiModalConfig
(
limit_per_prompt
=
limit_mm_per_prompt
or
{},
mm_processor_kwargs
=
mm_processor_kwargs
or
{},
disable_mm_preprocessor_cache
=
disable_mm_preprocessor_cache
,
)
if
limit_mm_per_prompt
:
raise
ValueError
(
"`limit_mm_per_prompt` is only supported for "
"multimodal models."
)
if
mm_processor_kwargs
:
raise
ValueError
(
"`mm_processor_kwargs` is only supported for "
"multimodal models."
)
if
disable_mm_preprocessor_cache
:
raise
ValueError
(
"`disable_mm_preprocessor_cache` is only "
"supported for multimodal models."
)
return
None
...
...
@@ -2776,7 +2789,23 @@ class MultiModalConfig:
Defaults to 1 (V0) or 999 (V1) for each modality.
For example, to allow up to 16 images and 2 videos per prompt:
``{"images": 16, "videos": 2}``
:code:`{"images": 16, "videos": 2}`
"""
mm_processor_kwargs
:
Optional
[
dict
[
str
,
object
]]
=
None
"""
Overrides for the multi-modal processor obtained from
:meth:`transformers.AutoProcessor.from_pretrained`.
The available overrides depend on the model that is being run.
For example, for Phi-3-Vision:
:code:`{"num_crops": 4}`.
"""
disable_mm_preprocessor_cache
:
bool
=
False
"""
If :code:`True`, disable caching of the processed multi-modal inputs.
"""
def
compute_hash
(
self
)
->
str
:
...
...
@@ -4080,8 +4109,6 @@ class VllmConfig:
f
"enable_prefix_caching=
{
self
.
cache_config
.
enable_prefix_caching
}
, "
f
"chunked_prefill_enabled=
{
self
.
scheduler_config
.
chunked_prefill_enabled
}
, "
# noqa
f
"use_async_output_proc=
{
self
.
model_config
.
use_async_output_proc
}
, "
f
"disable_mm_preprocessor_cache=
{
self
.
model_config
.
disable_mm_preprocessor_cache
!
r
}
, "
# noqa
f
"mm_processor_kwargs=
{
self
.
model_config
.
mm_processor_kwargs
}
, "
f
"pooler_config=
{
self
.
model_config
.
pooler_config
!
r
}
, "
f
"compilation_config=
{
self
.
compilation_config
!
r
}
"
)
...
...
vllm/engine/arg_utils.py
View file @
ebb3930d
...
...
@@ -672,20 +672,12 @@ class EngineArgs:
)
multimodal_group
.
add_argument
(
'--limit-mm-per-prompt'
,
**
multimodal_kwargs
[
"limit_per_prompt"
])
parser
.
add_argument
(
multimodal_group
.
add_argument
(
'--mm-processor-kwargs'
,
default
=
None
,
type
=
json
.
loads
,
help
=
(
'Overrides for the multi-modal processor obtained from '
'``AutoProcessor.from_pretrained``. The available overrides '
'depend on the model that is being run.'
'For example, for Phi-3-Vision: ``{"num_crops": 4}``.'
))
parser
.
add_argument
(
**
multimodal_kwargs
[
"mm_processor_kwargs"
])
multimodal_group
.
add_argument
(
'--disable-mm-preprocessor-cache'
,
action
=
'store_true'
,
help
=
'If True, disable caching of the processed multi-modal '
'inputs.'
)
**
multimodal_kwargs
[
"disable_mm_preprocessor_cache"
])
# LoRA related configs
lora_kwargs
=
get_kwargs
(
LoRAConfig
)
...
...
vllm/inputs/registry.py
View file @
ebb3930d
...
...
@@ -101,7 +101,8 @@ class InputContext:
Initialize a HuggingFace-like processor class, merging the
keyword arguments with those in the model's configuration.
"""
base_kwargs
=
self
.
model_config
.
mm_processor_kwargs
mm_config
=
self
.
model_config
.
get_multimodal_config
()
base_kwargs
=
mm_config
.
mm_processor_kwargs
if
base_kwargs
is
None
:
base_kwargs
=
{}
...
...
@@ -139,7 +140,8 @@ class InputProcessingContext(InputContext):
"""
assert
callable
(
hf_processor
)
base_kwargs
=
self
.
model_config
.
mm_processor_kwargs
mm_config
=
self
.
model_config
.
get_multimodal_config
()
base_kwargs
=
mm_config
.
mm_processor_kwargs
if
base_kwargs
is
None
:
base_kwargs
=
{}
...
...
vllm/model_executor/models/qwen2_vl.py
View file @
ebb3930d
...
...
@@ -774,8 +774,9 @@ class Qwen2VLProcessingInfo(BaseProcessingInfo):
size
:
Optional
[
dict
[
str
,
int
]]
=
None
,
**
kwargs
:
object
,
):
if
self
.
ctx
.
model_config
.
mm_processor_kwargs
:
kwargs
.
update
(
self
.
ctx
.
model_config
.
mm_processor_kwargs
)
mm_config
=
self
.
ctx
.
model_config
.
get_multimodal_config
()
if
mm_config
.
mm_processor_kwargs
:
kwargs
.
update
(
mm_config
.
mm_processor_kwargs
)
if
min_pixels
is
not
None
:
kwargs
[
"min_pixels"
]
=
min_pixels
...
...
vllm/multimodal/registry.py
View file @
ebb3930d
...
...
@@ -262,7 +262,8 @@ class MultiModalRegistry:
if
tokenizer
is
None
:
tokenizer
=
cached_tokenizer_from_config
(
model_config
)
if
disable_cache
is
None
:
disable_cache
=
model_config
.
disable_mm_preprocessor_cache
mm_config
=
model_config
.
get_multimodal_config
()
disable_cache
=
mm_config
.
disable_mm_preprocessor_cache
model_cls
=
self
.
_get_model_cls
(
model_config
)
factories
=
self
.
_processor_factories
[
model_cls
]
...
...
vllm/transformers_utils/processor.py
View file @
ebb3930d
...
...
@@ -33,7 +33,8 @@ class HashableList(list):
def
_merge_mm_kwargs
(
model_config
:
"ModelConfig"
,
**
kwargs
):
base_kwargs
=
model_config
.
mm_processor_kwargs
mm_config
=
model_config
.
get_multimodal_config
()
base_kwargs
=
mm_config
.
mm_processor_kwargs
if
base_kwargs
is
None
:
base_kwargs
=
{}
...
...
vllm/v1/engine/mm_input_cache.py
View file @
ebb3930d
...
...
@@ -33,7 +33,10 @@ from vllm.utils import is_list_of
class
MirroredProcessingCache
:
def
__init__
(
self
,
model_config
):
self
.
use_cache
=
not
model_config
.
disable_mm_preprocessor_cache
mm_config
=
model_config
.
multimodal_config
disable_mm_preprocessor_cache
=
mm_config
is
not
None
and
\
not
mm_config
.
disable_mm_preprocessor_cache
self
.
use_cache
=
not
disable_mm_preprocessor_cache
self
.
mm_cache
=
ProcessingCache
.
get_lru_cache
(
VLLM_MM_INPUT_CACHE_GIB
,
MultiModalKwargs
)
...
...
vllm/v1/engine/processor.py
View file @
ebb3930d
...
...
@@ -51,8 +51,7 @@ class Processor:
self
.
mm_input_cache_client
=
MirroredProcessingCache
(
self
.
model_config
)
# Multi-modal hasher (for images)
self
.
use_hash
=
(
not
self
.
model_config
.
disable_mm_preprocessor_cache
)
or
\
self
.
use_hash
=
self
.
mm_input_cache_client
.
use_cache
or
\
self
.
cache_config
.
enable_prefix_caching
def
_validate_logprobs
(
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment