Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
aec9674d
Unverified
Commit
aec9674d
authored
Apr 28, 2025
by
Cyrus Leung
Committed by
GitHub
Apr 28, 2025
Browse files
[Core] Remove legacy input mapper/processor from V0 (#15686)
Signed-off-by:
DarkLight1337
<
tlleungac@connect.ust.hk
>
parent
7fcc4223
Changes
21
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
142 additions
and
1325 deletions
+142
-1325
vllm/core/scheduler.py
vllm/core/scheduler.py
+0
-1
vllm/engine/async_llm_engine.py
vllm/engine/async_llm_engine.py
+1
-2
vllm/engine/llm_engine.py
vllm/engine/llm_engine.py
+3
-10
vllm/inputs/__init__.py
vllm/inputs/__init__.py
+3
-5
vllm/inputs/data.py
vllm/inputs/data.py
+2
-159
vllm/inputs/preprocess.py
vllm/inputs/preprocess.py
+11
-49
vllm/inputs/registry.py
vllm/inputs/registry.py
+23
-298
vllm/multimodal/__init__.py
vllm/multimodal/__init__.py
+1
-3
vllm/multimodal/audio.py
vllm/multimodal/audio.py
+1
-22
vllm/multimodal/base.py
vllm/multimodal/base.py
+17
-267
vllm/multimodal/image.py
vllm/multimodal/image.py
+1
-79
vllm/multimodal/registry.py
vllm/multimodal/registry.py
+42
-233
vllm/multimodal/video.py
vllm/multimodal/video.py
+2
-69
vllm/sequence.py
vllm/sequence.py
+17
-30
vllm/transformers_utils/processor.py
vllm/transformers_utils/processor.py
+0
-35
vllm/worker/cpu_model_runner.py
vllm/worker/cpu_model_runner.py
+5
-20
vllm/worker/enc_dec_model_runner.py
vllm/worker/enc_dec_model_runner.py
+2
-0
vllm/worker/hpu_model_runner.py
vllm/worker/hpu_model_runner.py
+3
-9
vllm/worker/model_runner.py
vllm/worker/model_runner.py
+5
-17
vllm/worker/neuron_model_runner.py
vllm/worker/neuron_model_runner.py
+3
-17
No files found.
vllm/core/scheduler.py
View file @
aec9674d
...
@@ -1596,7 +1596,6 @@ class Scheduler:
...
@@ -1596,7 +1596,6 @@ class Scheduler:
multi_modal_placeholders
=
(
multi_modal_placeholders
=
(
seq_group
.
multi_modal_placeholders
seq_group
.
multi_modal_placeholders
if
scheduler_outputs
.
num_prefill_groups
>
0
else
None
),
if
scheduler_outputs
.
num_prefill_groups
>
0
else
None
),
mm_processor_kwargs
=
seq_group
.
mm_processor_kwargs
,
prompt_adapter_request
=
seq_group
.
prompt_adapter_request
,
prompt_adapter_request
=
seq_group
.
prompt_adapter_request
,
)
)
else
:
else
:
...
...
vllm/engine/async_llm_engine.py
View file @
aec9674d
...
@@ -493,12 +493,11 @@ class _AsyncLLMEngine(LLMEngine):
...
@@ -493,12 +493,11 @@ class _AsyncLLMEngine(LLMEngine):
tokenizer
=
await
self
.
get_tokenizer_async
(
lora_request
)
tokenizer
=
await
self
.
get_tokenizer_async
(
lora_request
)
self
.
_validate_token_prompt
(
prompt
,
tokenizer
=
tokenizer
)
self
.
_validate_token_prompt
(
prompt
,
tokenizer
=
tokenizer
)
pre
processed_inputs
=
await
self
.
input_preprocessor
.
preprocess_async
(
processed_inputs
=
await
self
.
input_preprocessor
.
preprocess_async
(
prompt
,
prompt
,
lora_request
=
lora_request
,
lora_request
=
lora_request
,
prompt_adapter_request
=
prompt_adapter_request
,
prompt_adapter_request
=
prompt_adapter_request
,
)
)
processed_inputs
=
self
.
input_processor
(
preprocessed_inputs
)
if
isinstance
(
params
,
SamplingParams
)
and
\
if
isinstance
(
params
,
SamplingParams
)
and
\
params
.
guided_decoding
is
not
None
:
params
.
guided_decoding
is
not
None
:
...
...
vllm/engine/llm_engine.py
View file @
aec9674d
...
@@ -29,8 +29,7 @@ from vllm.engine.output_processor.util import create_output_by_sequence_group
...
@@ -29,8 +29,7 @@ from vllm.engine.output_processor.util import create_output_by_sequence_group
from
vllm.entrypoints.openai.logits_processors
import
(
from
vllm.entrypoints.openai.logits_processors
import
(
get_logits_processors
as
get_openai_logits_processors
)
get_logits_processors
as
get_openai_logits_processors
)
from
vllm.executor.executor_base
import
ExecutorBase
from
vllm.executor.executor_base
import
ExecutorBase
from
vllm.inputs
import
(
INPUT_REGISTRY
,
InputRegistry
,
ProcessorInputs
,
from
vllm.inputs
import
ProcessorInputs
,
PromptType
,
SingletonInputs
PromptType
,
SingletonInputs
)
from
vllm.inputs.parse
import
is_token_prompt
,
split_enc_dec_inputs
from
vllm.inputs.parse
import
is_token_prompt
,
split_enc_dec_inputs
from
vllm.inputs.preprocess
import
InputPreprocessor
from
vllm.inputs.preprocess
import
InputPreprocessor
from
vllm.logger
import
init_logger
from
vllm.logger
import
init_logger
...
@@ -213,7 +212,6 @@ class LLMEngine:
...
@@ -213,7 +212,6 @@ class LLMEngine:
log_stats
:
bool
,
log_stats
:
bool
,
usage_context
:
UsageContext
=
UsageContext
.
ENGINE_CONTEXT
,
usage_context
:
UsageContext
=
UsageContext
.
ENGINE_CONTEXT
,
stat_loggers
:
Optional
[
Dict
[
str
,
StatLoggerBase
]]
=
None
,
stat_loggers
:
Optional
[
Dict
[
str
,
StatLoggerBase
]]
=
None
,
input_registry
:
InputRegistry
=
INPUT_REGISTRY
,
mm_registry
:
MultiModalRegistry
=
MULTIMODAL_REGISTRY
,
mm_registry
:
MultiModalRegistry
=
MULTIMODAL_REGISTRY
,
use_cached_outputs
:
bool
=
False
,
use_cached_outputs
:
bool
=
False
,
)
->
None
:
)
->
None
:
...
@@ -274,11 +272,7 @@ class LLMEngine:
...
@@ -274,11 +272,7 @@ class LLMEngine:
self
.
tokenizer
,
self
.
tokenizer
,
mm_registry
)
mm_registry
)
self
.
input_registry
=
input_registry
self
.
model_executor
=
executor_class
(
vllm_config
=
vllm_config
)
self
.
input_processor
=
input_registry
.
create_input_processor
(
self
.
model_config
)
self
.
model_executor
=
executor_class
(
vllm_config
=
vllm_config
,
)
if
self
.
model_config
.
runner_type
!=
"pooling"
:
if
self
.
model_config
.
runner_type
!=
"pooling"
:
self
.
_initialize_kv_caches
()
self
.
_initialize_kv_caches
()
...
@@ -762,12 +756,11 @@ class LLMEngine:
...
@@ -762,12 +756,11 @@ class LLMEngine:
prompt
,
prompt
,
tokenizer
=
self
.
get_tokenizer
(
lora_request
=
lora_request
))
tokenizer
=
self
.
get_tokenizer
(
lora_request
=
lora_request
))
pre
processed_inputs
=
self
.
input_preprocessor
.
preprocess
(
processed_inputs
=
self
.
input_preprocessor
.
preprocess
(
prompt
,
prompt
,
lora_request
=
lora_request
,
lora_request
=
lora_request
,
prompt_adapter_request
=
prompt_adapter_request
,
prompt_adapter_request
=
prompt_adapter_request
,
)
)
processed_inputs
=
self
.
input_processor
(
preprocessed_inputs
)
self
.
_add_processed_request
(
self
.
_add_processed_request
(
request_id
=
request_id
,
request_id
=
request_id
,
...
...
vllm/inputs/__init__.py
View file @
aec9674d
...
@@ -2,10 +2,9 @@
...
@@ -2,10 +2,9 @@
from
.data
import
(
DecoderOnlyInputs
,
EncoderDecoderInputs
,
from
.data
import
(
DecoderOnlyInputs
,
EncoderDecoderInputs
,
ExplicitEncoderDecoderPrompt
,
ProcessorInputs
,
PromptType
,
ExplicitEncoderDecoderPrompt
,
ProcessorInputs
,
PromptType
,
SingletonInputs
,
SingletonInputsAdapter
,
SingletonPrompt
,
SingletonInputs
,
SingletonPrompt
,
TextPrompt
,
TokenInputs
,
TextPrompt
,
TokenInputs
,
TokensPrompt
,
TokensPrompt
,
build_explicit_enc_dec_prompt
,
build_explicit_enc_dec_prompt
,
to_enc_dec_tuple_list
,
to_enc_dec_tuple_list
,
token_inputs
,
zip_enc_dec_prompts
)
token_inputs
,
zip_enc_dec_prompts
)
from
.registry
import
(
DummyData
,
InputContext
,
InputProcessingContext
,
from
.registry
import
(
DummyData
,
InputContext
,
InputProcessingContext
,
InputRegistry
)
InputRegistry
)
...
@@ -27,7 +26,6 @@ __all__ = [
...
@@ -27,7 +26,6 @@ __all__ = [
"EncoderDecoderInputs"
,
"EncoderDecoderInputs"
,
"ProcessorInputs"
,
"ProcessorInputs"
,
"SingletonInputs"
,
"SingletonInputs"
,
"SingletonInputsAdapter"
,
"build_explicit_enc_dec_prompt"
,
"build_explicit_enc_dec_prompt"
,
"to_enc_dec_tuple_list"
,
"to_enc_dec_tuple_list"
,
"zip_enc_dec_prompts"
,
"zip_enc_dec_prompts"
,
...
...
vllm/inputs/data.py
View file @
aec9674d
# SPDX-License-Identifier: Apache-2.0
# SPDX-License-Identifier: Apache-2.0
from
collections.abc
import
Iterable
from
collections.abc
import
Iterable
from
dataclasses
import
dataclass
from
functools
import
cached_property
from
typing
import
TYPE_CHECKING
,
Any
,
Generic
,
Literal
,
Optional
,
Union
,
cast
from
typing
import
TYPE_CHECKING
,
Any
,
Generic
,
Literal
,
Optional
,
Union
,
cast
import
torch
from
typing_extensions
import
NotRequired
,
TypedDict
,
TypeVar
from
typing_extensions
import
NotRequired
,
TypedDict
,
TypeVar
,
assert_never
if
TYPE_CHECKING
:
if
TYPE_CHECKING
:
from
vllm.multimodal
import
(
MultiModalDataDict
,
MultiModalKwargs
,
from
vllm.multimodal.inputs
import
MultiModalDataDict
,
MultiModalInputs
MultiModalPlaceholderDict
)
from
vllm.multimodal.inputs
import
MultiModalInputs
class
TextPrompt
(
TypedDict
):
class
TextPrompt
(
TypedDict
):
...
@@ -147,46 +141,11 @@ class TokenInputs(TypedDict):
...
@@ -147,46 +141,11 @@ class TokenInputs(TypedDict):
The original prompt text corresponding to the token IDs, if available.
The original prompt text corresponding to the token IDs, if available.
"""
"""
multi_modal_data
:
NotRequired
[
"MultiModalDataDict"
]
"""
Optional multi-modal data to pass to the model,
if the model supports it.
"""
multi_modal_inputs
:
NotRequired
[
"MultiModalKwargs"
]
"""
Optional multi-modal inputs to pass to the model,
if the model supports it.
"""
multi_modal_placeholders
:
NotRequired
[
"MultiModalPlaceholderDict"
]
"""
Placeholder ranges for the multi-modal data.
"""
multi_modal_hashes
:
NotRequired
[
list
[
str
]]
"""
The hashes of the multi-modal data.
"""
mm_processor_kwargs
:
NotRequired
[
dict
[
str
,
Any
]]
"""
Optional multi-modal processor kwargs to be forwarded to the
multimodal input mapper & processor. Note that if multiple modalities
have registered mappers etc for the model being considered, we attempt
to pass the mm_processor_kwargs to each of them.
"""
def
token_inputs
(
def
token_inputs
(
prompt_token_ids
:
list
[
int
],
prompt_token_ids
:
list
[
int
],
token_type_ids
:
Optional
[
list
[
int
]]
=
None
,
token_type_ids
:
Optional
[
list
[
int
]]
=
None
,
prompt
:
Optional
[
str
]
=
None
,
prompt
:
Optional
[
str
]
=
None
,
multi_modal_data
:
Optional
[
"MultiModalDataDict"
]
=
None
,
multi_modal_inputs
:
Optional
[
"MultiModalKwargs"
]
=
None
,
multi_modal_hashes
:
Optional
[
list
[
str
]]
=
None
,
multi_modal_placeholders
:
Optional
[
"MultiModalPlaceholderDict"
]
=
None
,
mm_processor_kwargs
:
Optional
[
dict
[
str
,
Any
]]
=
None
,
)
->
TokenInputs
:
)
->
TokenInputs
:
"""Construct :class:`TokenInputs` from optional values."""
"""Construct :class:`TokenInputs` from optional values."""
inputs
=
TokenInputs
(
type
=
"token"
,
prompt_token_ids
=
prompt_token_ids
)
inputs
=
TokenInputs
(
type
=
"token"
,
prompt_token_ids
=
prompt_token_ids
)
...
@@ -195,16 +154,6 @@ def token_inputs(
...
@@ -195,16 +154,6 @@ def token_inputs(
inputs
[
"prompt"
]
=
prompt
inputs
[
"prompt"
]
=
prompt
if
token_type_ids
is
not
None
:
if
token_type_ids
is
not
None
:
inputs
[
"token_type_ids"
]
=
token_type_ids
inputs
[
"token_type_ids"
]
=
token_type_ids
if
multi_modal_data
is
not
None
:
inputs
[
"multi_modal_data"
]
=
multi_modal_data
if
multi_modal_inputs
is
not
None
:
inputs
[
"multi_modal_inputs"
]
=
multi_modal_inputs
if
multi_modal_hashes
is
not
None
:
inputs
[
"multi_modal_hashes"
]
=
multi_modal_hashes
if
multi_modal_placeholders
is
not
None
:
inputs
[
"multi_modal_placeholders"
]
=
multi_modal_placeholders
if
mm_processor_kwargs
is
not
None
:
inputs
[
"mm_processor_kwargs"
]
=
mm_processor_kwargs
return
inputs
return
inputs
...
@@ -237,112 +186,6 @@ A processed :class:`SingletonPrompt` which can be passed to
...
@@ -237,112 +186,6 @@ A processed :class:`SingletonPrompt` which can be passed to
:class:`vllm.sequence.Sequence`.
:class:`vllm.sequence.Sequence`.
"""
"""
@
dataclass
class
SingletonInputsAdapter
:
"""
Unified interface to access the components of :class:`SingletonInputs`.
"""
inputs
:
SingletonInputs
@
cached_property
def
prompt
(
self
)
->
Optional
[
str
]:
inputs
=
self
.
inputs
if
inputs
[
"type"
]
==
"token"
or
inputs
[
"type"
]
==
"multimodal"
:
return
inputs
.
get
(
"prompt"
)
assert_never
(
inputs
)
# type: ignore[arg-type]
@
cached_property
def
prompt_token_ids
(
self
)
->
list
[
int
]:
inputs
=
self
.
inputs
if
inputs
[
"type"
]
==
"token"
or
inputs
[
"type"
]
==
"multimodal"
:
return
inputs
.
get
(
"prompt_token_ids"
,
[])
assert_never
(
inputs
)
# type: ignore[arg-type]
@
cached_property
def
token_type_ids
(
self
)
->
list
[
int
]:
inputs
=
self
.
inputs
if
inputs
[
"type"
]
==
"token"
or
inputs
[
"type"
]
==
"multimodal"
:
return
inputs
.
get
(
"token_type_ids"
,
[])
assert_never
(
inputs
)
# type: ignore[arg-type]
@
cached_property
def
prompt_embeds
(
self
)
->
Optional
[
torch
.
Tensor
]:
inputs
=
self
.
inputs
if
inputs
[
"type"
]
==
"token"
or
inputs
[
"type"
]
==
"multimodal"
:
return
None
assert_never
(
inputs
)
# type: ignore[arg-type]
@
cached_property
def
multi_modal_data
(
self
)
->
"MultiModalDataDict"
:
inputs
=
self
.
inputs
if
inputs
[
"type"
]
==
"token"
:
return
inputs
.
get
(
"multi_modal_data"
,
{})
if
inputs
[
"type"
]
==
"multimodal"
:
return
inputs
.
get
(
"mm_kwargs"
,
{})
assert_never
(
inputs
)
# type: ignore[arg-type]
@
cached_property
def
multi_modal_inputs
(
self
)
->
Union
[
dict
,
"MultiModalKwargs"
]:
inputs
=
self
.
inputs
if
inputs
[
"type"
]
==
"token"
:
return
inputs
.
get
(
"multi_modal_inputs"
,
{})
if
inputs
[
"type"
]
==
"multimodal"
:
return
inputs
.
get
(
"mm_kwargs"
,
{})
assert_never
(
inputs
)
# type: ignore[arg-type]
@
cached_property
def
multi_modal_hashes
(
self
)
->
list
[
str
]:
inputs
=
self
.
inputs
if
inputs
[
"type"
]
==
"token"
:
return
inputs
.
get
(
"multi_modal_hashes"
,
[])
if
inputs
[
"type"
]
==
"multimodal"
:
# only the case when we use MultiModalInputs
return
inputs
.
get
(
"mm_hashes"
,
[])
# type: ignore[return-value]
assert_never
(
inputs
)
# type: ignore[arg-type]
@
cached_property
def
multi_modal_placeholders
(
self
)
->
"MultiModalPlaceholderDict"
:
inputs
=
self
.
inputs
if
inputs
[
"type"
]
==
"token"
:
return
inputs
.
get
(
"multi_modal_placeholders"
,
{})
if
inputs
[
"type"
]
==
"multimodal"
:
return
inputs
.
get
(
"mm_placeholders"
,
{})
assert_never
(
inputs
)
# type: ignore[arg-type]
@
cached_property
def
mm_processor_kwargs
(
self
)
->
dict
[
str
,
Any
]:
inputs
=
self
.
inputs
if
inputs
[
"type"
]
==
"token"
:
return
inputs
.
get
(
"mm_processor_kwargs"
,
{})
if
inputs
[
"type"
]
==
"multimodal"
:
return
{}
assert_never
(
inputs
)
# type: ignore[arg-type]
ProcessorInputs
=
Union
[
DecoderOnlyInputs
,
EncoderDecoderInputs
]
ProcessorInputs
=
Union
[
DecoderOnlyInputs
,
EncoderDecoderInputs
]
"""
"""
The inputs to :data:`vllm.inputs.InputProcessor`.
The inputs to :data:`vllm.inputs.InputProcessor`.
...
...
vllm/inputs/preprocess.py
View file @
aec9674d
...
@@ -223,28 +223,6 @@ class InputPreprocessor:
...
@@ -223,28 +223,6 @@ class InputPreprocessor:
lora_request
=
lora_request
,
lora_request
=
lora_request
,
add_special_tokens
=
add_special_tokens
)
add_special_tokens
=
add_special_tokens
)
def
_can_process_multimodal
(
self
)
->
bool
:
model_config
=
self
.
model_config
if
not
model_config
.
is_multimodal_model
:
raise
ValueError
(
"Your model does not support multi-modal inputs"
)
# Interim measure so we can handle models that have yet to be
# updated to use the new multi-modal processor
can_process_multimodal
=
self
.
mm_registry
.
has_processor
(
model_config
)
if
not
can_process_multimodal
:
from
vllm.model_executor.models.registry
import
_VLLM_MODELS
if
not
any
(
arch
in
_VLLM_MODELS
for
arch
in
model_config
.
architectures
):
logger
.
warning_once
(
"Your model uses the legacy input pipeline, which will be "
"removed in an upcoming release. "
"Please upgrade to the new multi-modal processing pipeline "
"(https://docs.vllm.ai/en/latest/design/mm_processing.html)"
)
return
can_process_multimodal
def
_process_multimodal
(
def
_process_multimodal
(
self
,
self
,
prompt
:
Union
[
str
,
list
[
int
]],
prompt
:
Union
[
str
,
list
[
int
]],
...
@@ -258,8 +236,7 @@ class InputPreprocessor:
...
@@ -258,8 +236,7 @@ class InputPreprocessor:
returning the corresponding token IDs and metadata.
returning the corresponding token IDs and metadata.
"""
"""
# At the moment on model (PrithviGeoSpatialMAE) requires to be
# At the moment on model (PrithviGeoSpatialMAE) requires to be
# initialized without a tokenizer while using also multi-modal
# initialized without a tokenizer while using also multi-modal input
# input.
if
not
self
.
tokenizer
:
if
not
self
.
tokenizer
:
tokenizer
=
object
()
# Dummy
tokenizer
=
object
()
# Dummy
else
:
else
:
...
@@ -285,8 +262,7 @@ class InputPreprocessor:
...
@@ -285,8 +262,7 @@ class InputPreprocessor:
)
->
MultiModalInputs
:
)
->
MultiModalInputs
:
"""Async version of :meth:`_process_multimodal`."""
"""Async version of :meth:`_process_multimodal`."""
# At the moment on model (PrithviGeoSpatialMAE) requires to be
# At the moment on model (PrithviGeoSpatialMAE) requires to be
# initialized without a tokenizer while using also multi-modal
# initialized without a tokenizer while using also multi-modal input
# input.
if
not
self
.
tokenizer
:
if
not
self
.
tokenizer
:
tokenizer
=
object
()
# Dummy
tokenizer
=
object
()
# Dummy
else
:
else
:
...
@@ -343,7 +319,7 @@ class InputPreprocessor:
...
@@ -343,7 +319,7 @@ class InputPreprocessor:
multi_modal_data
=
tokens_content
.
get
(
"multi_modal_data"
)
multi_modal_data
=
tokens_content
.
get
(
"multi_modal_data"
)
mm_processor_kwargs
=
tokens_content
.
get
(
"mm_processor_kwargs"
)
mm_processor_kwargs
=
tokens_content
.
get
(
"mm_processor_kwargs"
)
if
multi_modal_data
is
not
None
and
self
.
_can_process_multimodal
()
:
if
multi_modal_data
is
not
None
:
return
self
.
_process_multimodal
(
return
self
.
_process_multimodal
(
prompt_token_ids
,
prompt_token_ids
,
multi_modal_data
,
multi_modal_data
,
...
@@ -355,8 +331,6 @@ class InputPreprocessor:
...
@@ -355,8 +331,6 @@ class InputPreprocessor:
return
token_inputs
(
return
token_inputs
(
prompt_token_ids
=
prompt_token_ids
,
prompt_token_ids
=
prompt_token_ids
,
token_type_ids
=
token_type_ids
,
token_type_ids
=
token_type_ids
,
multi_modal_data
=
multi_modal_data
,
mm_processor_kwargs
=
mm_processor_kwargs
,
)
)
if
parsed
[
"type"
]
==
"text"
:
if
parsed
[
"type"
]
==
"text"
:
...
@@ -366,7 +340,7 @@ class InputPreprocessor:
...
@@ -366,7 +340,7 @@ class InputPreprocessor:
multi_modal_data
=
text_content
.
get
(
"multi_modal_data"
)
multi_modal_data
=
text_content
.
get
(
"multi_modal_data"
)
mm_processor_kwargs
=
text_content
.
get
(
"mm_processor_kwargs"
)
mm_processor_kwargs
=
text_content
.
get
(
"mm_processor_kwargs"
)
if
multi_modal_data
is
not
None
and
self
.
_can_process_multimodal
()
:
if
multi_modal_data
is
not
None
:
return
self
.
_process_multimodal
(
return
self
.
_process_multimodal
(
prompt_text
,
prompt_text
,
multi_modal_data
,
multi_modal_data
,
...
@@ -383,8 +357,6 @@ class InputPreprocessor:
...
@@ -383,8 +357,6 @@ class InputPreprocessor:
return
token_inputs
(
return
token_inputs
(
prompt
=
prompt_text
,
prompt
=
prompt_text
,
prompt_token_ids
=
prompt_token_ids
,
prompt_token_ids
=
prompt_token_ids
,
multi_modal_data
=
multi_modal_data
,
mm_processor_kwargs
=
mm_processor_kwargs
,
)
)
assert_never
(
parsed
)
assert_never
(
parsed
)
...
@@ -417,7 +389,7 @@ class InputPreprocessor:
...
@@ -417,7 +389,7 @@ class InputPreprocessor:
multi_modal_data
=
tokens_content
.
get
(
"multi_modal_data"
)
multi_modal_data
=
tokens_content
.
get
(
"multi_modal_data"
)
mm_processor_kwargs
=
tokens_content
.
get
(
"mm_processor_kwargs"
)
mm_processor_kwargs
=
tokens_content
.
get
(
"mm_processor_kwargs"
)
if
multi_modal_data
is
not
None
and
self
.
_can_process_multimodal
()
:
if
multi_modal_data
is
not
None
:
return
await
self
.
_process_multimodal_async
(
return
await
self
.
_process_multimodal_async
(
prompt_token_ids
,
prompt_token_ids
,
multi_modal_data
,
multi_modal_data
,
...
@@ -426,11 +398,7 @@ class InputPreprocessor:
...
@@ -426,11 +398,7 @@ class InputPreprocessor:
return_mm_hashes
=
return_mm_hashes
,
return_mm_hashes
=
return_mm_hashes
,
)
)
return
token_inputs
(
return
token_inputs
(
prompt_token_ids
=
prompt_token_ids
)
prompt_token_ids
=
prompt_token_ids
,
multi_modal_data
=
multi_modal_data
,
mm_processor_kwargs
=
mm_processor_kwargs
,
)
if
parsed
[
"type"
]
==
"text"
:
if
parsed
[
"type"
]
==
"text"
:
text_content
=
parsed
[
"content"
]
text_content
=
parsed
[
"content"
]
...
@@ -439,7 +407,7 @@ class InputPreprocessor:
...
@@ -439,7 +407,7 @@ class InputPreprocessor:
multi_modal_data
=
text_content
.
get
(
"multi_modal_data"
)
multi_modal_data
=
text_content
.
get
(
"multi_modal_data"
)
mm_processor_kwargs
=
text_content
.
get
(
"mm_processor_kwargs"
)
mm_processor_kwargs
=
text_content
.
get
(
"mm_processor_kwargs"
)
if
multi_modal_data
is
not
None
and
self
.
_can_process_multimodal
()
:
if
multi_modal_data
is
not
None
:
return
await
self
.
_process_multimodal_async
(
return
await
self
.
_process_multimodal_async
(
prompt_text
,
prompt_text
,
multi_modal_data
,
multi_modal_data
,
...
@@ -456,8 +424,6 @@ class InputPreprocessor:
...
@@ -456,8 +424,6 @@ class InputPreprocessor:
return
token_inputs
(
return
token_inputs
(
prompt
=
prompt_text
,
prompt
=
prompt_text
,
prompt_token_ids
=
prompt_token_ids
,
prompt_token_ids
=
prompt_token_ids
,
multi_modal_data
=
multi_modal_data
,
mm_processor_kwargs
=
mm_processor_kwargs
,
)
)
assert_never
(
parsed
)
assert_never
(
parsed
)
...
@@ -594,15 +560,13 @@ class InputPreprocessor:
...
@@ -594,15 +560,13 @@ class InputPreprocessor:
decoder_inputs
=
self
.
_prompt_to_llm_inputs
(
decoder_input
)
decoder_inputs
=
self
.
_prompt_to_llm_inputs
(
decoder_input
)
# For multimodal model, override decoder prompt from processor
# For multimodal model, override decoder prompt from processor
# with explicit decoder prompt.
# with explicit decoder prompt.
if
self
.
model_config
.
is_multimodal_model
and
(
if
self
.
model_config
.
is_multimodal_model
:
self
.
_can_process_multimodal
()):
encoder_inputs
,
decoder_inputs
=
(
encoder_inputs
,
decoder_inputs
=
(
self
.
_separate_enc_dec_inputs_from_mm_processor_outputs
(
self
.
_separate_enc_dec_inputs_from_mm_processor_outputs
(
encoder_inputs
,
decoder_inputs
))
encoder_inputs
,
decoder_inputs
))
else
:
else
:
inputs
=
self
.
_prompt_to_llm_inputs
(
prompt
)
inputs
=
self
.
_prompt_to_llm_inputs
(
prompt
)
if
self
.
model_config
.
is_multimodal_model
and
(
if
self
.
model_config
.
is_multimodal_model
:
self
.
_can_process_multimodal
()):
# Encoder-Decoder Multimodal model
# Encoder-Decoder Multimodal model
encoder_inputs
,
decoder_inputs
=
(
encoder_inputs
,
decoder_inputs
=
(
self
.
_separate_enc_dec_inputs_from_mm_processor_outputs
(
self
.
_separate_enc_dec_inputs_from_mm_processor_outputs
(
...
@@ -637,15 +601,13 @@ class InputPreprocessor:
...
@@ -637,15 +601,13 @@ class InputPreprocessor:
# For multimodal model, override decoder prompt from processor
# For multimodal model, override decoder prompt from processor
# with explicit decoder prompt.
# with explicit decoder prompt.
if
self
.
model_config
.
is_multimodal_model
and
(
if
self
.
model_config
.
is_multimodal_model
:
self
.
_can_process_multimodal
()):
encoder_inputs
,
decoder_inputs
=
(
encoder_inputs
,
decoder_inputs
=
(
self
.
_separate_enc_dec_inputs_from_mm_processor_outputs
(
self
.
_separate_enc_dec_inputs_from_mm_processor_outputs
(
encoder_inputs
,
decoder_inputs
))
encoder_inputs
,
decoder_inputs
))
else
:
else
:
inputs
=
await
self
.
_prompt_to_llm_inputs_async
(
prompt
)
inputs
=
await
self
.
_prompt_to_llm_inputs_async
(
prompt
)
if
self
.
model_config
.
is_multimodal_model
and
(
if
self
.
model_config
.
is_multimodal_model
:
self
.
_can_process_multimodal
()):
# Encoder-Decoder Multimodal model
# Encoder-Decoder Multimodal model
encoder_inputs
,
decoder_inputs
=
(
encoder_inputs
,
decoder_inputs
=
(
self
.
_separate_enc_dec_inputs_from_mm_processor_outputs
(
self
.
_separate_enc_dec_inputs_from_mm_processor_outputs
(
...
...
vllm/inputs/registry.py
View file @
aec9674d
# SPDX-License-Identifier: Apache-2.0
# SPDX-License-Identifier: Apache-2.0
import
functools
from
collections
import
UserDict
from
collections.abc
import
Mapping
from
collections.abc
import
Mapping
from
dataclasses
import
dataclass
from
dataclasses
import
dataclass
from
typing
import
(
TYPE_CHECKING
,
Any
,
Callable
,
NamedTuple
,
Optional
,
from
typing
import
TYPE_CHECKING
,
Any
,
NamedTuple
,
Optional
,
Union
Protocol
,
Union
)
from
torch
import
nn
from
transformers
import
BatchFeature
,
PretrainedConfig
,
ProcessorMixin
from
transformers
import
BatchFeature
,
PretrainedConfig
,
ProcessorMixin
from
typing_extensions
import
TypeVar
,
assert_never
from
typing_extensions
import
TypeVar
from
vllm.logger
import
init_logger
from
vllm.transformers_utils.processor
import
cached_processor_from_config
from
vllm.transformers_utils.processor
import
cached_processor_from_config
from
vllm.transformers_utils.tokenizer
import
AnyTokenizer
from
vllm.transformers_utils.tokenizer
import
AnyTokenizer
from
vllm.utils
import
(
ClassRegistry
,
get_allowed_kwarg_only_overrides
,
from
vllm.utils
import
resolve_mm_processor_kwargs
resolve_mm_processor_kwargs
)
from
.data
import
ProcessorInputs
,
SingletonInputs
from
.parse
import
split_enc_dec_inputs
if
TYPE_CHECKING
:
if
TYPE_CHECKING
:
from
vllm.config
import
ModelConfig
from
vllm.config
import
ModelConfig
...
@@ -26,8 +16,6 @@ if TYPE_CHECKING:
...
@@ -26,8 +16,6 @@ if TYPE_CHECKING:
MultiModalRegistry
)
MultiModalRegistry
)
from
vllm.sequence
import
SequenceData
from
vllm.sequence
import
SequenceData
logger
=
init_logger
(
__name__
)
_T
=
TypeVar
(
"_T"
)
_T
=
TypeVar
(
"_T"
)
_C
=
TypeVar
(
"_C"
,
bound
=
PretrainedConfig
,
default
=
PretrainedConfig
)
_C
=
TypeVar
(
"_C"
,
bound
=
PretrainedConfig
,
default
=
PretrainedConfig
)
_P
=
TypeVar
(
"_P"
,
bound
=
ProcessorMixin
,
default
=
ProcessorMixin
)
_P
=
TypeVar
(
"_P"
,
bound
=
ProcessorMixin
,
default
=
ProcessorMixin
)
...
@@ -172,142 +160,23 @@ class InputProcessingContext(InputContext):
...
@@ -172,142 +160,23 @@ class InputProcessingContext(InputContext):
raise
RuntimeError
(
msg
)
from
exc
raise
RuntimeError
(
msg
)
from
exc
N
=
TypeVar
(
"N"
,
bound
=
type
[
nn
.
Module
])
class
DummyData
(
NamedTuple
):
class
DummyData
(
NamedTuple
):
"""Dummy data used for profiling."""
"""
Dummy data used for profiling.
Note: This is only used in V0.
"""
seq_data
:
"SequenceData"
seq_data
:
"SequenceData"
multi_modal_data
:
Optional
[
"MultiModalDataDict"
]
=
None
multi_modal_data
:
Optional
[
"MultiModalDataDict"
]
=
None
multi_modal_placeholders
:
Optional
[
"MultiModalPlaceholderDict"
]
=
None
multi_modal_placeholders
:
Optional
[
"MultiModalPlaceholderDict"
]
=
None
class
DummyDataFactory
(
Protocol
):
def
__call__
(
self
,
ctx
:
InputContext
,
seq_len
:
int
,
mm_counts
:
Mapping
[
str
,
int
],
**
mm_processor_kwargs
:
Any
,
)
->
DummyData
:
"""
Create dummy data to be inputted into the model.
Note:
:data:`InputProcessor` is not applied to the dummy data.
The :code:`mm_processor_kwargs` are overrides provided at
initialization time to values in the config whose values
may affect the number of tokens per instance.
"""
...
class
_MultiModalCounts
(
UserDict
[
str
,
int
]):
"""
Wraps `mm_counts` for a more informative error message
when attempting to access a plugin that does not exist.
"""
def
__getitem__
(
self
,
key
:
str
)
->
int
:
try
:
return
super
().
__getitem__
(
key
)
except
KeyError
as
exc
:
msg
=
(
f
"There is no multi-modal plugin with the key:
{
key
}
. "
f
"Available keys:
{
set
(
self
.
keys
())
}
"
)
raise
KeyError
(
msg
)
from
exc
InputProcessor
=
Callable
[[
InputContext
,
ProcessorInputs
],
ProcessorInputs
]
"""Preprocess the inputs to the model."""
class
InputRegistry
:
class
InputRegistry
:
"""
"""
A registry to dispatch data processing
Note: This is only used in V0.
according to the target model.
"""
"""
def
__init__
(
self
)
->
None
:
self
.
_dummy_factories_by_model_type
=
\
ClassRegistry
[
nn
.
Module
,
DummyDataFactory
]()
self
.
_dummy_encoder_factories_by_model_type
=
\
ClassRegistry
[
nn
.
Module
,
DummyDataFactory
]()
self
.
_input_processors_by_model_type
=
\
ClassRegistry
[
nn
.
Module
,
InputProcessor
]()
def
_default_dummy_data_factory
(
self
,
ctx
:
InputContext
,
seq_len
:
int
,
mm_counts
:
Mapping
[
str
,
int
],
)
->
DummyData
:
"""
The default dummy data factory represents the longest possible text
that can be inputted to the model.
Note:
:data:`InputProcessor` is not applied to the dummy data.
"""
# Avoid circular import
from
vllm.sequence
import
SequenceData
return
DummyData
(
SequenceData
.
from_prompt_token_counts
((
0
,
seq_len
)))
def
register_dummy_data
(
self
,
factory
:
DummyDataFactory
):
"""
Register a dummy data factory to a model class.
During memory profiling, the provided function is invoked to create
dummy data to be inputted into the model. The resulting memory usage
should be an upper bound of what the model would use at inference time.
"""
def
wrapper
(
model_cls
:
N
)
->
N
:
if
self
.
_dummy_factories_by_model_type
.
contains
(
model_cls
,
strict
=
True
):
logger
.
warning
(
"Model class %s already has dummy data "
"registered to %s. It is overwritten by the new one."
,
model_cls
,
self
)
self
.
_dummy_factories_by_model_type
[
model_cls
]
=
factory
return
model_cls
return
wrapper
def
_get_dummy_data_factory
(
self
,
model_cls
:
type
[
nn
.
Module
]):
return
self
.
_dummy_factories_by_model_type
\
.
get
(
model_cls
,
self
.
_default_dummy_data_factory
)
def
register_dummy_encoder_data
(
self
,
factory
:
DummyDataFactory
):
"""
Register a dummy encoder data factory to a model class
This is similar to :meth:`~register_dummy_data`, but for encoder input.
"""
def
wrapper
(
model_cls
:
N
)
->
N
:
if
self
.
_dummy_encoder_factories_by_model_type
.
contains
(
model_cls
,
strict
=
True
):
logger
.
warning
(
"Model class %s already has dummy encoder data "
"registered to %s. It is overwritten by the new one."
,
model_cls
,
self
)
self
.
_dummy_encoder_factories_by_model_type
[
model_cls
]
=
factory
return
model_cls
return
wrapper
def
_get_dummy_encoder_data_factory
(
self
,
model_cls
:
type
[
nn
.
Module
]):
return
self
.
_dummy_encoder_factories_by_model_type
\
.
get
(
model_cls
,
self
.
_default_dummy_data_factory
)
def
dummy_data_for_profiling
(
def
dummy_data_for_profiling
(
self
,
self
,
model_config
:
"ModelConfig"
,
model_config
:
"ModelConfig"
,
...
@@ -319,169 +188,25 @@ class InputRegistry:
...
@@ -319,169 +188,25 @@ class InputRegistry:
Create dummy data for profiling the memory usage of a model.
Create dummy data for profiling the memory usage of a model.
The model is identified by ``model_config``.
The model is identified by ``model_config``.
Note:
This should be called after
:meth:`~MultiModalRegistry.init_mm_limits_per_prompt`.
"""
"""
# Avoid circular import
# Avoid circular import
from
vllm.model_executor.model_loader
import
get_model_architecture
from
vllm.multimodal
import
MultiModalKwargs
from
vllm.multimodal.profiling
import
MultiModalProfiler
from
vllm.sequence
import
SequenceData
from
vllm.sequence
import
SequenceData
if
mm_registry
.
has_processor
(
model_config
):
if
not
model_config
.
is_multimodal_model
:
processor
=
mm_registry
.
create_processor
(
model_config
,
seq_data
=
SequenceData
.
from_prompt_token_counts
((
0
,
seq_len
))
disable_cache
=
True
)
return
DummyData
(
seq_data
=
seq_data
)
profiler
=
MultiModalProfiler
(
processor
)
dummy_data_v1
=
(
profiler
.
get_encoder_dummy_data
(
seq_len
)
if
is_encoder_data
else
profiler
.
get_decoder_dummy_data
(
seq_len
))
_seq_data
=
SequenceData
.
from_seqs
(
dummy_data_v1
.
prompt_token_ids
)
# type: ignore[attr-defined]
dummy_data
=
DummyData
(
seq_data
=
_seq_data
,
multi_modal_data
=
getattr
(
dummy_data_v1
,
"multi_modal_data"
,
None
),
multi_modal_placeholders
=
getattr
(
dummy_data_v1
,
"multi_modal_placeholders"
,
None
),
)
else
:
model_cls
,
_
=
get_model_architecture
(
model_config
)
if
is_encoder_data
:
dummy_factory
=
self
.
_get_dummy_encoder_data_factory
(
model_cls
)
else
:
dummy_factory
=
self
.
_get_dummy_data_factory
(
model_cls
)
mm_counts
=
mm_registry
.
get_mm_limits_per_prompt
(
model_config
)
mm_processor_kwargs
=
get_allowed_kwarg_only_overrides
(
dummy_factory
,
overrides
=
model_config
.
mm_processor_kwargs
,
requires_kw_only
=
False
,
allow_var_kwargs
=
True
,
)
dummy_data
=
dummy_factory
(
InputContext
(
model_config
),
seq_len
,
_MultiModalCounts
(
mm_counts
),
**
mm_processor_kwargs
)
# Having more tokens is over-conservative but otherwise fine
num_tokens
=
dummy_data
.
seq_data
.
prompt_token_ids
if
len
(
num_tokens
)
<
seq_len
:
if
is_encoder_data
:
logger
.
warning_once
(
f
"Expected at least
{
seq_len
}
dummy encoder tokens for "
f
"profiling, but found
{
len
(
num_tokens
)
}
tokens instead."
)
else
:
raise
AssertionError
(
f
"Expected at least
{
seq_len
}
dummy tokens for profiling, "
f
"but found
{
len
(
num_tokens
)
}
tokens instead."
)
if
(
dummy_data
.
multi_modal_data
is
not
None
and
not
isinstance
(
dummy_data
.
multi_modal_data
,
MultiModalKwargs
)):
for
k
,
v
in
dummy_data
.
multi_modal_data
.
items
():
num_items
=
len
(
v
)
if
isinstance
(
v
,
list
)
else
1
num_expected
=
mm_counts
[
k
]
assert
num_items
>=
num_expected
,
(
f
"Expected at least
{
num_expected
}
dummy '
{
k
}
' instances "
f
"for profiling, but found
{
num_items
}
instances instead."
)
return
dummy_data
def
_default_input_processor
(
self
,
ctx
:
InputContext
,
inputs
:
ProcessorInputs
,
**
kwargs
:
object
,
)
->
ProcessorInputs
:
"""The default input processor is a no-op."""
return
inputs
def
register_input_processor
(
self
,
processor
:
InputProcessor
):
"""
Register an input processor to a model class.
The provided function is invoked on each input to the model. This
happens before
:meth:`~vllm.multimodal.registry.MultiModalRegistry.map_input`.
"""
def
wrapper
(
model_cls
:
N
)
->
N
:
if
self
.
_input_processors_by_model_type
.
contains
(
model_cls
,
strict
=
True
):
logger
.
warning
(
"Model class %s already has input processor "
"registered to %s. It is overwritten by the new one."
,
model_cls
,
self
)
self
.
_input_processors_by_model_type
[
model_cls
]
=
processor
return
model_cls
return
wrapper
# Encoder dummy data does not contain multi-modal data
if
is_encoder_data
:
enc_data
=
mm_registry
.
get_encoder_dummy_data
(
model_config
,
seq_len
)
seq_data
=
SequenceData
.
from_seqs
(
enc_data
.
prompt_token_ids
)
return
DummyData
(
seq_data
=
seq_data
)
def
_get_model_input_processor
(
self
,
model_cls
:
type
[
nn
.
Module
]):
dec_data
=
mm_registry
.
get_decoder_dummy_data
(
model_config
,
seq_len
)
return
self
.
_input_processors_by_model_type
\
.
get
(
model_cls
,
self
.
_default_input_processor
)
def
_ensure_mm_kwargs
(
self
,
inputs
:
SingletonInputs
,
mm_processor_kwargs
:
dict
[
str
,
Any
],
):
if
inputs
[
"type"
]
==
"token"
:
# In case the input processor for that model fails to set it
if
"mm_processor_kwargs"
not
in
inputs
:
inputs
[
"mm_processor_kwargs"
]
=
mm_processor_kwargs
elif
inputs
[
"type"
]
==
"multimodal"
:
# Be more strict in V2
assert
"mm_kwargs"
in
inputs
else
:
assert_never
(
inputs
[
"type"
])
# type: ignore[arg-type]
def
process_input
(
self
,
model_config
:
"ModelConfig"
,
inputs
:
ProcessorInputs
)
->
ProcessorInputs
:
"""
Apply an input processor to an instance of model inputs.
The model is identified by ``model_config``.
"""
# Avoid circular import
from
vllm.model_executor.model_loader
import
get_model_architecture
model_cls
,
_
=
get_model_architecture
(
model_config
)
processor
=
self
.
_get_model_input_processor
(
model_cls
)
# Handle multimodal processor kwargs with priority:
# Inference kwargs -> Init kwargs -> {}
# If it's empty, it'll fall back to the default kwarg values
mm_processor_kwargs
=
resolve_mm_processor_kwargs
(
model_config
.
mm_processor_kwargs
,
inputs
.
get
(
"mm_processor_kwargs"
,
{}),
# type: ignore
processor
,
requires_kw_only
=
False
,
allow_var_kwargs
=
True
,
)
processed_inputs
=
processor
(
return
DummyData
(
InputContext
(
model_config
),
seq_data
=
SequenceData
.
from_seqs
(
dec_data
.
prompt_token_ids
),
inputs
,
multi_modal_data
=
dec_data
.
multi_modal_data
,
**
mm_processor_kwarg
s
,
multi_modal_placeholders
=
dec_data
.
multi_modal_placeholder
s
,
)
)
encoder_inputs
,
decoder_inputs
=
split_enc_dec_inputs
(
processed_inputs
)
if
encoder_inputs
is
not
None
:
self
.
_ensure_mm_kwargs
(
encoder_inputs
,
mm_processor_kwargs
)
if
decoder_inputs
is
not
None
:
self
.
_ensure_mm_kwargs
(
decoder_inputs
,
mm_processor_kwargs
)
return
processed_inputs
def
create_input_processor
(
self
,
model_config
:
"ModelConfig"
):
"""
Create an input processor (see :meth:`_process_input`) for a
specific model.
"""
return
functools
.
partial
(
self
.
process_input
,
model_config
)
vllm/multimodal/__init__.py
View file @
aec9674d
# SPDX-License-Identifier: Apache-2.0
# SPDX-License-Identifier: Apache-2.0
from
.base
import
MultiModalPlaceholderMap
from
.base
import
MultiModalPlaceholderMap
,
MultiModalPlugin
from
.hasher
import
MultiModalHashDict
,
MultiModalHasher
from
.hasher
import
MultiModalHashDict
,
MultiModalHasher
from
.inputs
import
(
BatchedTensorInputs
,
ModalityData
,
MultiModalDataBuiltins
,
from
.inputs
import
(
BatchedTensorInputs
,
ModalityData
,
MultiModalDataBuiltins
,
MultiModalDataDict
,
MultiModalKwargs
,
MultiModalDataDict
,
MultiModalKwargs
,
...
@@ -26,7 +25,6 @@ __all__ = [
...
@@ -26,7 +25,6 @@ __all__ = [
"MultiModalKwargs"
,
"MultiModalKwargs"
,
"MultiModalPlaceholderDict"
,
"MultiModalPlaceholderDict"
,
"MultiModalPlaceholderMap"
,
"MultiModalPlaceholderMap"
,
"MultiModalPlugin"
,
"NestedTensors"
,
"NestedTensors"
,
"MULTIMODAL_REGISTRY"
,
"MULTIMODAL_REGISTRY"
,
"MultiModalRegistry"
,
"MultiModalRegistry"
,
...
...
vllm/multimodal/audio.py
View file @
aec9674d
...
@@ -7,11 +7,9 @@ from typing import Literal, Optional
...
@@ -7,11 +7,9 @@ from typing import Literal, Optional
import
numpy
as
np
import
numpy
as
np
import
numpy.typing
as
npt
import
numpy.typing
as
npt
from
vllm.inputs.registry
import
InputContext
from
vllm.utils
import
PlaceholderModule
from
vllm.utils
import
PlaceholderModule
from
.base
import
MediaIO
,
MultiModalPlugin
from
.base
import
MediaIO
from
.inputs
import
AudioItem
,
ModalityData
,
MultiModalKwargs
try
:
try
:
import
librosa
import
librosa
...
@@ -24,25 +22,6 @@ except ImportError:
...
@@ -24,25 +22,6 @@ except ImportError:
soundfile
=
PlaceholderModule
(
"soundfile"
)
# type: ignore[assignment]
soundfile
=
PlaceholderModule
(
"soundfile"
)
# type: ignore[assignment]
class
AudioPlugin
(
MultiModalPlugin
):
"""Plugin for audio data."""
def
get_data_key
(
self
)
->
str
:
return
"audio"
def
_default_input_mapper
(
self
,
ctx
:
InputContext
,
data
:
ModalityData
[
AudioItem
],
**
mm_processor_kwargs
,
)
->
MultiModalKwargs
:
raise
NotImplementedError
(
"There is no default audio input mapper"
)
def
_default_max_multimodal_tokens
(
self
,
ctx
:
InputContext
)
->
int
:
raise
NotImplementedError
(
"There is no default maximum multimodal tokens"
)
def
resample_audio_librosa
(
def
resample_audio_librosa
(
audio
:
npt
.
NDArray
[
np
.
floating
],
audio
:
npt
.
NDArray
[
np
.
floating
],
*
,
*
,
...
...
vllm/multimodal/base.py
View file @
aec9674d
# SPDX-License-Identifier: Apache-2.0
# SPDX-License-Identifier: Apache-2.0
from
abc
import
ABC
,
abstractmethod
from
abc
import
ABC
,
abstractmethod
from
collections
import
defaultdict
from
collections.abc
import
Sequence
from
collections.abc
import
Sequence
from
pathlib
import
Path
from
pathlib
import
Path
from
typing
import
(
TYPE_CHECKING
,
Any
,
Callable
,
Generic
,
NamedTuple
,
from
typing
import
TYPE_CHECKING
,
Generic
,
NamedTuple
,
TypeVar
Optional
,
TypeVar
,
Union
)
from
torch
import
nn
from
vllm.inputs
import
InputContext
from
vllm.logger
import
init_logger
from
vllm.utils
import
(
ClassRegistry
,
get_allowed_kwarg_only_overrides
,
resolve_mm_processor_kwargs
)
if
TYPE_CHECKING
:
if
TYPE_CHECKING
:
from
vllm.config
import
ModelConfig
from
vllm.sequence
import
SequenceGroupMetadata
from
vllm.sequence
import
SequenceGroupMetadata
from
.inputs
import
(
ModalityData
,
MultiModalDataDict
,
MultiModalKwargs
,
from
.inputs
import
MultiModalKwargs
,
PlaceholderRange
PlaceholderRange
)
logger
=
init_logger
(
__name__
)
MultiModalInputMapper
=
Callable
[[
InputContext
,
ModalityData
[
object
]],
MultiModalKwargs
]
"""
Return a dictionary to be passed as keyword arguments to
:meth:`~torch.nn.Module.forward`. This is similar in concept to tokenizers
and processors in HuggingFace Transformers.
If the data is not supported, throw :exc:`TypeError`.
"""
MultiModalTokensCalc
=
Union
[
int
,
Callable
[[
InputContext
],
int
]]
"""
Calculate the maximum number of multimodal tokens input to the language
model. This does not include tokens that correspond to the input text.
"""
_T
=
TypeVar
(
"_T"
)
_T
=
TypeVar
(
"_T"
)
N
=
TypeVar
(
"N"
,
bound
=
type
[
nn
.
Module
])
class
MultiModalPlugin
(
ABC
):
"""
Base class that defines data processing logic for a specific modality.
In particular, we adopt a registry pattern to dispatch data processing
according to the model being used (considering that different models may
process the same data differently). This registry is in turn used by
:class:`~MultiModalRegistry` which acts at a higher level
(i.e., the modality of the data).
"""
def
__init__
(
self
)
->
None
:
self
.
_input_mappers
=
ClassRegistry
[
nn
.
Module
,
MultiModalInputMapper
]()
self
.
_max_mm_tokens
=
ClassRegistry
[
nn
.
Module
,
MultiModalTokensCalc
]()
@
abstractmethod
def
get_data_key
(
self
)
->
str
:
"""
Get the data key corresponding to the modality.
"""
raise
NotImplementedError
@
abstractmethod
def
_default_input_mapper
(
self
,
ctx
:
InputContext
,
data
:
ModalityData
[
Any
],
**
mm_processor_kwargs
,
)
->
MultiModalKwargs
:
"""
Return a dictionary to be passed as keyword arguments to
:meth:`~torch.nn.Module.forward`. This is similar in concept to
tokenizers and processors in HuggingFace Transformers.
If the data is not supported, throw :exc:`TypeError`.
"""
raise
NotImplementedError
def
register_input_mapper
(
self
,
mapper
:
Optional
[
MultiModalInputMapper
]
=
None
,
):
"""
Register an input mapper to a model class.
When the model receives input data that matches the modality served by
this plugin (see :meth:`get_data_key`), the provided function is
invoked to transform the data into a dictionary of model inputs.
If `None` is provided, then the default input mapper is used instead.
"""
def
wrapper
(
model_cls
:
N
)
->
N
:
if
self
.
_input_mappers
.
contains
(
model_cls
,
strict
=
True
):
logger
.
warning
(
"Model class %s already has an input mapper "
"registered to %s. It is overwritten by the new one."
,
model_cls
,
self
,
)
self
.
_input_mappers
[
model_cls
]
=
(
mapper
or
self
.
_default_input_mapper
)
return
model_cls
return
wrapper
def
map_input
(
self
,
model_config
:
"ModelConfig"
,
data
:
ModalityData
[
Any
],
mm_processor_kwargs
:
Optional
[
dict
[
str
,
Any
]],
)
->
MultiModalKwargs
:
"""
Transform the data into a dictionary of model inputs using the
input mapper registered for that model.
The model is identified by ``model_config``.
Raises:
TypeError: If the data type is not supported.
"""
# Avoid circular import
from
vllm.model_executor.model_loader
import
get_model_architecture
model_cls
,
_
=
get_model_architecture
(
model_config
)
mapper
=
self
.
_input_mappers
.
get
(
model_cls
)
if
mapper
is
None
:
raise
KeyError
(
f
"No input mapper in
{
self
}
is registered for "
f
"model class
{
model_cls
.
__name__
}
."
)
if
mm_processor_kwargs
is
None
:
mm_processor_kwargs
=
{}
# In the case of the default mapper, we have to get resource
# processor through its HuggingFace autoclass; since this goes
# through **kwargs, we can't inspect it the same way, so we allow
# drop mm_processor_kwargs based on signature inspection
# if we're using the default mapper.
#
# This should be safe in general due to the sanitation, since the
# transformers resource should filter unused kwargs anyway.
uses_default_mapper
=
mapper
==
self
.
_default_input_mapper
mm_processor_kwargs
=
resolve_mm_processor_kwargs
(
model_config
.
mm_processor_kwargs
,
mm_processor_kwargs
,
callable
=
mapper
,
allow_var_kwargs
=
uses_default_mapper
,
)
return
mapper
(
InputContext
(
model_config
),
data
,
**
mm_processor_kwargs
)
@
abstractmethod
def
_default_max_multimodal_tokens
(
self
,
ctx
:
InputContext
)
->
int
:
"""
Calculate the maximum number of tokens, corresponding to a single
instance of multimodal data, that are passed to the language model.
"""
raise
NotImplementedError
def
_validate_max_multimodal_tokens
(
self
,
max_mm_tokens
:
int
):
if
max_mm_tokens
<
1
:
raise
ValueError
(
"You should set the number of tokens to a "
f
"positive integer. Found:
{
max_mm_tokens
}
"
)
def
register_max_multimodal_tokens
(
self
,
max_mm_tokens
:
Optional
[
MultiModalTokensCalc
]
=
None
,
):
"""
Register the maximum number of tokens, corresponding to a single
instance of multimodal data, that are passed to the language model
for a model class.
If `None` is provided, then the default calculation is used instead.
"""
def
wrapper
(
model_cls
:
N
)
->
N
:
if
self
.
_max_mm_tokens
.
contains
(
model_cls
,
strict
=
True
):
logger
.
warning
(
"Model class %s already calculates maximum number of "
"tokens in %s. It is overwritten by the new one."
,
model_cls
,
self
,
)
if
isinstance
(
max_mm_tokens
,
int
):
self
.
_validate_max_multimodal_tokens
(
max_mm_tokens
)
self
.
_max_mm_tokens
[
model_cls
]
=
(
max_mm_tokens
or
self
.
_default_max_multimodal_tokens
)
return
model_cls
return
wrapper
def
get_max_multimodal_tokens
(
self
,
model_config
:
"ModelConfig"
)
->
int
:
"""
Get the maximum number of multi-modal tokens
for profiling the memory usage of a model.
If this registry is not applicable to the model, `0` is returned.
The model is identified by ``model_config``.
"""
# Avoid circular import
from
vllm.model_executor.model_loader
import
get_model_architecture
from
vllm.model_executor.models
import
supports_multimodal
model_cls
,
_
=
get_model_architecture
(
model_config
)
if
not
supports_multimodal
(
model_cls
):
return
0
max_mm_tokens
=
self
.
_max_mm_tokens
.
get
(
model_cls
)
if
max_mm_tokens
is
None
:
return
0
if
callable
(
max_mm_tokens
):
mm_processor_kwargs
=
get_allowed_kwarg_only_overrides
(
max_mm_tokens
,
overrides
=
model_config
.
mm_processor_kwargs
,
requires_kw_only
=
False
,
allow_var_kwargs
=
True
,
)
max_mm_tokens
=
max_mm_tokens
(
InputContext
(
model_config
),
**
mm_processor_kwargs
)
self
.
_validate_max_multimodal_tokens
(
max_mm_tokens
)
return
max_mm_tokens
class
MultiModalPlaceholderMap
:
class
MultiModalPlaceholderMap
:
"""
"""
Relates multi-modal embeddings to their corresponding placeholders.
Relates multi-modal embeddings to their corresponding placeholders.
Note: This is only used in V0.
"""
"""
class
IndexMap
(
NamedTuple
):
class
IndexMap
(
NamedTuple
):
...
@@ -279,8 +55,7 @@ class MultiModalPlaceholderMap:
...
@@ -279,8 +55,7 @@ class MultiModalPlaceholderMap:
@
classmethod
@
classmethod
def
from_seq_group
(
def
from_seq_group
(
cls
,
seq_group
:
"SequenceGroupMetadata"
,
positions
:
range
cls
,
seq_group
:
"SequenceGroupMetadata"
,
positions
:
range
)
->
tuple
[
Optional
[
MultiModalDataDict
],
dict
[
str
,
)
->
tuple
[
MultiModalKwargs
,
dict
[
str
,
"MultiModalPlaceholderMap"
]]:
"MultiModalPlaceholderMap"
]]:
"""
"""
Returns the multi-modal items that intersect with the portion of a
Returns the multi-modal items that intersect with the portion of a
prompt (``seq_group``) represented by ``positions``, as well as a
prompt (``seq_group``) represented by ``positions``, as well as a
...
@@ -323,48 +98,24 @@ class MultiModalPlaceholderMap:
...
@@ -323,48 +98,24 @@ class MultiModalPlaceholderMap:
seq_mm_placeholders
=
seq_group
.
multi_modal_placeholders
seq_mm_placeholders
=
seq_group
.
multi_modal_placeholders
if
not
seq_mm_data
or
not
seq_mm_placeholders
:
if
not
seq_mm_data
or
not
seq_mm_placeholders
:
return
seq_mm_data
,
{}
return
MultiModalKwargs
({}),
{}
# For merged processor, we directly use mm_kwargs as mm_data
if
isinstance
(
seq_mm_data
,
MultiModalKwargs
):
placeholder_maps
=
dict
[
str
,
MultiModalPlaceholderMap
]()
for
modality
,
placeholders
in
seq_mm_placeholders
.
items
():
placeholder_map
=
MultiModalPlaceholderMap
()
if
positions
:
placeholder_maps
=
dict
[
str
,
MultiModalPlaceholderMap
]()
placeholder_map
.
append_items_from_seq_group
(
positions
,
# Dummy, since we don't care about intersecting items
[
None
]
*
len
(
placeholders
),
placeholders
,
)
placeholder_maps
[
modality
]
=
placeholder_map
return
seq_mm_data
,
placeholder_maps
mm_data
=
{
**
seq_mm_data
}
placeholder_maps
=
defaultdict
[
str
,
MultiModalPlaceholderMap
](
MultiModalPlaceholderMap
)
for
modality
,
placeholders
in
seq_mm_placeholders
.
items
():
for
modality
,
placeholders
in
seq_mm_placeholders
.
items
():
mm_items
=
mm_data
.
pop
(
modality
)
placeholder_map
=
MultiModalPlaceholderMap
()
if
not
isinstance
(
mm_items
,
list
):
mm_items
=
[
mm_items
]
if
positions
:
if
positions
:
intersecting_items
=
placeholder_maps
[
modality
]
\
placeholder_map
.
append_items_from_seq_group
(
.
append_items_from_seq_group
(
positions
,
positions
,
# Dummy, since we don't care about intersecting items
mm_items
,
[
None
]
*
len
(
placeholders
)
,
placeholders
,
placeholders
,
)
)
if
intersecting_items
:
placeholder_maps
[
modality
]
=
placeholder_map
mm_data
[
modality
]
=
intersecting_items
return
mm_data
,
placeholder_maps
return
seq_
mm_data
,
placeholder_maps
def
append_items_from_seq_group
(
def
append_items_from_seq_group
(
self
,
self
,
...
@@ -445,8 +196,7 @@ class MultiModalPlaceholderMap:
...
@@ -445,8 +196,7 @@ class MultiModalPlaceholderMap:
f
"The number of source (
{
len
(
src_indices
)
}
) and destination "
f
"The number of source (
{
len
(
src_indices
)
}
) and destination "
f
"indices (
{
len
(
dest_indices
)
}
) must be the same."
)
f
"indices (
{
len
(
dest_indices
)
}
) must be the same."
)
return
MultiModalPlaceholderMap
.
IndexMap
(
src
=
src_indices
,
return
self
.
IndexMap
(
src
=
src_indices
,
dest
=
dest_indices
)
dest
=
dest_indices
)
class
MediaIO
(
ABC
,
Generic
[
_T
]):
class
MediaIO
(
ABC
,
Generic
[
_T
]):
...
...
vllm/multimodal/image.py
View file @
aec9674d
...
@@ -3,89 +3,11 @@
...
@@ -3,89 +3,11 @@
import
base64
import
base64
from
io
import
BytesIO
from
io
import
BytesIO
from
pathlib
import
Path
from
pathlib
import
Path
from
typing
import
TYPE_CHECKING
,
Any
,
Optional
import
torch
import
torch
from
PIL
import
Image
from
PIL
import
Image
from
vllm.inputs.registry
import
InputContext
from
.base
import
MediaIO
from
vllm.logger
import
init_logger
from
vllm.transformers_utils.processor
import
cached_get_image_processor
from
vllm.utils
import
is_list_of
from
.base
import
MediaIO
,
MultiModalPlugin
from
.inputs
import
ImageItem
,
ModalityData
,
MultiModalKwargs
if
TYPE_CHECKING
:
from
vllm.config
import
ModelConfig
logger
=
init_logger
(
__name__
)
class
ImagePlugin
(
MultiModalPlugin
):
"""Plugin for image data."""
def
get_data_key
(
self
)
->
str
:
return
"image"
def
_get_hf_image_processor
(
self
,
model_config
:
"ModelConfig"
,
mm_processor_kwargs
:
Optional
[
dict
[
str
,
Any
]]
=
None
,
):
if
mm_processor_kwargs
is
None
:
mm_processor_kwargs
=
{}
return
cached_get_image_processor
(
model_config
.
model
,
trust_remote_code
=
model_config
.
trust_remote_code
,
**
mm_processor_kwargs
)
def
_default_input_mapper
(
self
,
ctx
:
InputContext
,
data
:
ModalityData
[
ImageItem
],
**
mm_processor_kwargs
,
)
->
MultiModalKwargs
:
model_config
=
ctx
.
model_config
# PIL image
if
isinstance
(
data
,
Image
.
Image
)
or
is_list_of
(
data
,
Image
.
Image
):
image_processor
=
self
.
_get_hf_image_processor
(
model_config
,
mm_processor_kwargs
,
)
if
image_processor
is
None
:
raise
RuntimeError
(
"No HuggingFace processor is available "
"to process the image object"
)
try
:
# NOTE: It may make sense to forward the mm_processor_kwargs
# here too. For now, to keep it simple, we only allow it be
# used for the initialization call though, just in case the
# signatures of the preprocessor initializer don't match
# preprocess()
batch_data
=
image_processor
\
.
preprocess
(
data
,
return_tensors
=
"pt"
)
\
.
data
except
Exception
:
logger
.
error
(
"Failed to process image (%s) with the default mapper. "
"This is most likely an edge-case with this model's image "
"processor in transformers (type: %s), and not vLLM."
,
data
,
type
(
image_processor
).
__name__
)
raise
return
MultiModalKwargs
(
batch_data
)
# Image embedding
elif
isinstance
(
data
,
torch
.
Tensor
)
or
is_list_of
(
data
,
torch
.
Tensor
):
return
MultiModalKwargs
({
"image_embeds"
:
data
})
raise
TypeError
(
f
"Invalid image type:
{
type
(
data
)
}
"
)
def
_default_max_multimodal_tokens
(
self
,
ctx
:
InputContext
)
->
int
:
return
3000
def
rescale_image_size
(
image
:
Image
.
Image
,
def
rescale_image_size
(
image
:
Image
.
Image
,
...
...
vllm/multimodal/registry.py
View file @
aec9674d
# SPDX-License-Identifier: Apache-2.0
# SPDX-License-Identifier: Apache-2.0
from
collections.abc
import
Mapping
import
functools
import
json
from
collections
import
UserDict
from
collections.abc
import
Mapping
,
Sequence
from
dataclasses
import
dataclass
from
dataclasses
import
dataclass
from
typing
import
TYPE_CHECKING
,
Any
,
Generic
,
Optional
,
Protocol
,
TypeVar
from
typing
import
TYPE_CHECKING
,
Generic
,
Optional
,
Protocol
,
TypeVar
import
torch.nn
as
nn
import
torch.nn
as
nn
from
typing_extensions
import
deprecated
from
vllm.envs
import
VLLM_MM_INPUT_CACHE_GIB
from
vllm.envs
import
VLLM_MM_INPUT_CACHE_GIB
from
vllm.inputs
import
InputProcessingContext
from
vllm.inputs
import
InputProcessingContext
...
@@ -16,15 +13,10 @@ from vllm.transformers_utils.tokenizer import (AnyTokenizer,
...
@@ -16,15 +13,10 @@ from vllm.transformers_utils.tokenizer import (AnyTokenizer,
cached_tokenizer_from_config
)
cached_tokenizer_from_config
)
from
vllm.utils
import
ClassRegistry
from
vllm.utils
import
ClassRegistry
from
.audio
import
AudioPlugin
from
.base
import
MultiModalInputMapper
,
MultiModalPlugin
,
MultiModalTokensCalc
from
.image
import
ImagePlugin
from
.inputs
import
MultiModalDataDict
,
MultiModalKwargs
,
NestedTensors
from
.processing
import
(
BaseMultiModalProcessor
,
BaseProcessingInfo
,
from
.processing
import
(
BaseMultiModalProcessor
,
BaseProcessingInfo
,
ProcessingCache
)
ProcessingCache
)
from
.profiling
import
(
BaseDummyInputsBuilder
,
DummyDecoderData
,
from
.profiling
import
(
BaseDummyInputsBuilder
,
DummyDecoderData
,
DummyEncoderData
,
MultiModalProfiler
)
DummyEncoderData
,
MultiModalProfiler
)
from
.video
import
VideoPlugin
if
TYPE_CHECKING
:
if
TYPE_CHECKING
:
from
vllm.config
import
ModelConfig
from
vllm.config
import
ModelConfig
...
@@ -85,169 +77,23 @@ class _ProcessorFactories(Generic[_I]):
...
@@ -85,169 +77,23 @@ class _ProcessorFactories(Generic[_I]):
return
self
.
processor
(
info
,
dummy_inputs_builder
,
cache
=
cache
)
return
self
.
processor
(
info
,
dummy_inputs_builder
,
cache
=
cache
)
class
_MultiModalLimits
(
UserDict
[
"ModelConfig"
,
dict
[
str
,
int
]]):
"""
Wraps `_limits_by_model` for a more informative error message
when attempting to access a model that does not exist.
"""
def
__getitem__
(
self
,
key
:
"ModelConfig"
)
->
dict
[
str
,
int
]:
try
:
return
super
().
__getitem__
(
key
)
except
KeyError
as
exc
:
msg
=
(
f
"Cannot find `mm_limits` for model=
{
key
.
model
}
. Did you "
"forget to call `init_mm_limits_per_prompt`?"
)
raise
KeyError
(
msg
)
from
exc
class
MultiModalRegistry
:
class
MultiModalRegistry
:
"""
"""
A registry that dispatches data processing according to the model.
A registry that dispatches data processing according to the model.
"""
"""
DEFAULT_PLUGINS
=
(
ImagePlugin
(),
AudioPlugin
(),
VideoPlugin
())
def
__init__
(
self
)
->
None
:
def
__init__
(
self
,
*
,
plugins
:
Sequence
[
MultiModalPlugin
]
=
DEFAULT_PLUGINS
)
->
None
:
self
.
_plugins
=
{
p
.
get_data_key
():
p
for
p
in
plugins
}
self
.
_processor_factories
=
ClassRegistry
[
nn
.
Module
,
self
.
_processor_factories
=
ClassRegistry
[
nn
.
Module
,
_ProcessorFactories
]()
_ProcessorFactories
]()
# This is used for non-multimodal models
self
.
_disabled_limits_per_plugin
=
{
k
:
0
for
k
in
self
.
_plugins
}
self
.
_limits_by_model
=
_MultiModalLimits
()
self
.
_processing_cache
=
ProcessingCache
(
VLLM_MM_INPUT_CACHE_GIB
)
self
.
_processing_cache
=
ProcessingCache
(
VLLM_MM_INPUT_CACHE_GIB
)
def
register_plugin
(
self
,
plugin
:
MultiModalPlugin
)
->
None
:
@
deprecated
(
"Legacy input processor/mapper pipeline has been removed. "
"""
"Please update your model runner to use "
Register a multi-modal plugin so it can be recognized by vLLM.
"`seq_group_metadata.multi_modal_data` directly without "
"""
"further processing."
)
data_type_key
=
plugin
.
get_data_key
()
if
data_type_key
in
self
.
_plugins
:
logger
.
warning
(
"A plugin is already registered for data type %s, "
"and will be overwritten by the new plugin %s."
,
data_type_key
,
plugin
)
self
.
_plugins
[
data_type_key
]
=
plugin
def
_get_plugin
(
self
,
data_type_key
:
str
):
plugin
=
self
.
_plugins
.
get
(
data_type_key
)
if
plugin
is
not
None
:
return
plugin
msg
=
f
"Unknown multi-modal data type:
{
data_type_key
}
"
raise
NotImplementedError
(
msg
)
def
register_input_mapper
(
self
,
data_type_key
:
str
,
mapper
:
Optional
[
MultiModalInputMapper
]
=
None
,
):
"""
Register an input mapper for a specific modality to a model class.
See :meth:`MultiModalPlugin.register_input_mapper` for more details.
"""
return
self
.
_get_plugin
(
data_type_key
).
register_input_mapper
(
mapper
)
def
register_image_input_mapper
(
self
,
mapper
:
Optional
[
MultiModalInputMapper
]
=
None
,
):
"""
Register an input mapper for image data to a model class.
See :meth:`MultiModalPlugin.register_input_mapper` for more details.
"""
return
self
.
register_input_mapper
(
"image"
,
mapper
)
def
map_input
(
self
,
model_config
:
"ModelConfig"
,
data
:
MultiModalDataDict
,
mm_processor_kwargs
:
Optional
[
dict
[
str
,
Any
]]
=
None
,
)
->
MultiModalKwargs
:
"""
Apply an input mapper to the data passed to the model.
The data belonging to each modality is passed to the corresponding
plugin which in turn converts the data into into keyword arguments
via the input mapper registered for that model.
See :meth:`MultiModalPlugin.map_input` for more details.
Note:
This should be called after :meth:`init_mm_limits_per_prompt`.
"""
merged_dict
=
dict
[
str
,
NestedTensors
]()
for
data_key
,
data_value
in
data
.
items
():
plugin
=
self
.
_get_plugin
(
data_key
)
num_items
=
len
(
data_value
)
if
isinstance
(
data_value
,
list
)
else
1
max_items
=
self
.
_limits_by_model
[
model_config
][
data_key
]
if
num_items
>
max_items
:
raise
ValueError
(
f
"You set '
{
json
.
dumps
(
{
data_key
:
max_items
}
)
}
' (or "
"defaulted to 1) in `--limit-mm-per-prompt`, but found "
f
"
{
num_items
}
items in the same prompt."
)
input_dict
=
plugin
.
map_input
(
model_config
,
data_value
,
mm_processor_kwargs
)
for
input_key
,
input_tensor
in
input_dict
.
items
():
if
input_key
in
merged_dict
:
raise
ValueError
(
f
"The input mappers (keys=
{
set
(
data
)
}
) "
f
"resulted in a conflicting keyword "
f
"argument to `forward()`:
{
input_key
}
"
)
merged_dict
[
input_key
]
=
input_tensor
return
MultiModalKwargs
(
merged_dict
)
def
create_input_mapper
(
self
,
model_config
:
"ModelConfig"
):
def
create_input_mapper
(
self
,
model_config
:
"ModelConfig"
):
"""
return
lambda
data
,
mm_processor_kwargs
:
data
Create an input mapper (see :meth:`map_input`) for a specific model.
"""
# NOTE - we currently make the assumption that if a model has multiple
# supported modalities, they take the same kwargs. For the default,
# this could be an issue in the future if it falls back to two HF
# resources and we can't inspect the signature easily since it's
# getting initialized through the autoclass.
#
# If this is a problem in the future, we should revisit it, but since
# it potentially introduces a lot of complexity for a currently
# uncommon case, we do not for simplicity of both use & implementation
return
functools
.
partial
(
self
.
map_input
,
model_config
)
def
register_max_multimodal_tokens
(
self
,
data_type_key
:
str
,
max_mm_tokens
:
Optional
[
MultiModalTokensCalc
]
=
None
,
):
"""
Register the maximum number of tokens, corresponding to a single
instance of multimodal data belonging to a specific modality, that are
passed to the language model for a model class.
"""
return
self
.
_get_plugin
(
data_type_key
)
\
.
register_max_multimodal_tokens
(
max_mm_tokens
)
def
register_max_image_tokens
(
self
,
max_mm_tokens
:
Optional
[
MultiModalTokensCalc
]
=
None
,
):
"""
Register the maximum number of image tokens, corresponding to a single
image, that are passed to the language model for a model class.
"""
return
self
.
register_max_multimodal_tokens
(
"image"
,
max_mm_tokens
)
def
get_max_tokens_per_item_by_modality
(
def
get_max_tokens_per_item_by_modality
(
self
,
self
,
...
@@ -257,25 +103,22 @@ class MultiModalRegistry:
...
@@ -257,25 +103,22 @@ class MultiModalRegistry:
Get the maximum number of tokens per data item from each modality based
Get the maximum number of tokens per data item from each modality based
on underlying model configuration.
on underlying model configuration.
"""
"""
if
self
.
has_processor
(
model_config
):
if
not
model_config
.
is_multimodal_model
:
processor
=
self
.
create_processor
(
model_config
,
disable_cache
=
True
)
return
{}
profiler
=
MultiModalProfiler
(
processor
)
seq_len
=
model_config
.
max_model_len
mm_limits
=
self
.
get_mm_limits_per_prompt
(
model_config
)
return
profiler
.
get_mm_max_tokens
(
seq_len
,
{
modality
:
1
for
modality
,
limit
in
mm_limits
.
items
()
if
limit
>
0
},
)
return
{
processor
=
self
.
create_processor
(
model_config
,
disable_cache
=
True
)
key
:
plugin
.
get_max_multimodal_tokens
(
model_config
)
profiler
=
MultiModalProfiler
(
processor
)
for
key
,
plugin
in
self
.
_plugins
.
items
()
}
seq_len
=
model_config
.
max_model_len
mm_limits
=
self
.
get_mm_limits_per_prompt
(
model_config
)
return
profiler
.
get_mm_max_tokens
(
seq_len
,
{
modality
:
1
for
modality
,
limit
in
mm_limits
.
items
()
if
limit
>
0
},
)
def
get_max_tokens_per_item_by_nonzero_modality
(
def
get_max_tokens_per_item_by_nonzero_modality
(
self
,
self
,
...
@@ -308,9 +151,6 @@ class MultiModalRegistry:
...
@@ -308,9 +151,6 @@ class MultiModalRegistry:
for profiling the memory usage of a model.
for profiling the memory usage of a model.
See :meth:`MultiModalPlugin.get_max_multimodal_tokens` for more details.
See :meth:`MultiModalPlugin.get_max_multimodal_tokens` for more details.
Note:
This should be called after :meth:`init_mm_limits_per_prompt`.
"""
"""
mm_limits
=
self
.
get_mm_limits_per_prompt
(
model_config
)
mm_limits
=
self
.
get_mm_limits_per_prompt
(
model_config
)
...
@@ -326,47 +166,18 @@ class MultiModalRegistry:
...
@@ -326,47 +166,18 @@ class MultiModalRegistry:
for profiling the memory usage of a model.
for profiling the memory usage of a model.
See :meth:`MultiModalPlugin.get_max_multimodal_tokens` for more details.
See :meth:`MultiModalPlugin.get_max_multimodal_tokens` for more details.
Note:
This should be called after :meth:`init_mm_limits_per_prompt`.
"""
"""
return
sum
(
self
.
get_max_tokens_by_modality
(
model_config
).
values
())
return
sum
(
self
.
get_max_tokens_by_modality
(
model_config
).
values
())
@
deprecated
(
"Legacy input processor/mapper pipeline has been removed. "
"Please update your model runner to use "
"`seq_group_metadata.multi_modal_data` directly without "
"further processing."
)
def
init_mm_limits_per_prompt
(
def
init_mm_limits_per_prompt
(
self
,
self
,
model_config
:
"ModelConfig"
,
model_config
:
"ModelConfig"
,
)
->
None
:
)
->
None
:
"""
pass
Initialize the maximum number of multi-modal input instances for each
modality that are allowed per prompt for a model class.
"""
if
model_config
in
self
.
_limits_by_model
:
logger
.
warning
(
"`mm_limits` has already been set for model=%s, and will "
"be overwritten by the new values."
,
model_config
.
model
)
multimodal_config
=
model_config
.
multimodal_config
if
multimodal_config
is
None
:
limits_per_plugin
=
self
.
_disabled_limits_per_plugin
else
:
config_limits_per_plugin
=
multimodal_config
.
limit_per_prompt
extra_keys
=
config_limits_per_plugin
.
keys
()
-
self
.
_plugins
.
keys
()
if
extra_keys
:
logger
.
warning
(
"Detected extra keys in `--limit-mm-per-prompt` which "
"are not registered as multi-modal plugins: %s. "
"They will be ignored."
,
extra_keys
)
# NOTE: Currently the default is set to 1 for each plugin
# TODO: Automatically determine the limits based on budget
# once more models support multi-image inputs
limits_per_plugin
=
{
key
:
multimodal_config
.
get_limit_per_prompt
(
key
)
for
key
in
self
.
_plugins
}
self
.
_limits_by_model
[
model_config
]
=
limits_per_plugin
def
get_mm_limits_per_prompt
(
def
get_mm_limits_per_prompt
(
self
,
self
,
...
@@ -375,16 +186,13 @@ class MultiModalRegistry:
...
@@ -375,16 +186,13 @@ class MultiModalRegistry:
"""
"""
Get the maximum number of multi-modal input instances for each modality
Get the maximum number of multi-modal input instances for each modality
that are allowed per prompt for a model class.
that are allowed per prompt for a model class.
Note:
This should be called after :meth:`init_mm_limits_per_prompt`.
"""
"""
if
self
.
has_processor
(
model_config
):
if
not
model_config
.
is_multimodal_model
:
processor
=
self
.
create_processor
(
model_config
,
disable_cache
=
True
)
return
{}
profiler
=
MultiModalProfiler
(
processor
)
return
profiler
.
get_mm_limits
()
return
self
.
_limits_by_model
[
model_config
]
processor
=
self
.
create_processor
(
model_config
,
disable_cache
=
True
)
profiler
=
MultiModalProfiler
(
processor
)
return
profiler
.
get_mm_limits
()
def
register_processor
(
def
register_processor
(
self
,
self
,
...
@@ -428,14 +236,12 @@ class MultiModalRegistry:
...
@@ -428,14 +236,12 @@ class MultiModalRegistry:
model_cls
,
_
=
get_model_architecture
(
model_config
)
model_cls
,
_
=
get_model_architecture
(
model_config
)
return
model_cls
return
model_cls
@
deprecated
(
"Legacy input processor/mapper pipeline has been removed. "
"Please update your model runner to use "
"`seq_group_metadata.multi_modal_data` directly without "
"further processing."
)
def
has_processor
(
self
,
model_config
:
"ModelConfig"
)
->
bool
:
def
has_processor
(
self
,
model_config
:
"ModelConfig"
)
->
bool
:
"""
return
True
Test whether a multi-modal processor is defined for a specific model.
See also:
:ref:`mm-processing`
"""
return
self
.
_get_model_cls
(
model_config
)
in
self
.
_processor_factories
def
create_processor
(
def
create_processor
(
self
,
self
,
...
@@ -450,6 +256,9 @@ class MultiModalRegistry:
...
@@ -450,6 +256,9 @@ class MultiModalRegistry:
See also:
See also:
:ref:`mm-processing`
:ref:`mm-processing`
"""
"""
if
not
model_config
.
is_multimodal_model
:
raise
ValueError
(
f
"
{
model_config
.
model
}
is not a multimodal model"
)
if
tokenizer
is
None
:
if
tokenizer
is
None
:
tokenizer
=
cached_tokenizer_from_config
(
model_config
)
tokenizer
=
cached_tokenizer_from_config
(
model_config
)
if
disable_cache
is
None
:
if
disable_cache
is
None
:
...
...
vllm/multimodal/video.py
View file @
aec9674d
...
@@ -4,80 +4,13 @@ import base64
...
@@ -4,80 +4,13 @@ import base64
from
functools
import
partial
from
functools
import
partial
from
io
import
BytesIO
from
io
import
BytesIO
from
pathlib
import
Path
from
pathlib
import
Path
from
typing
import
TYPE_CHECKING
,
Any
,
Optional
import
numpy
as
np
import
numpy
as
np
import
numpy.typing
as
npt
import
numpy.typing
as
npt
from
PIL
import
Image
from
PIL
import
Image
from
vllm.inputs.registry
import
InputContext
from
.base
import
MediaIO
from
vllm.logger
import
init_logger
from
.image
import
ImageMediaIO
from
vllm.transformers_utils.processor
import
cached_get_video_processor
from
vllm.utils
import
is_list_of
from
.base
import
MediaIO
,
ModalityData
from
.image
import
ImageMediaIO
,
ImagePlugin
from
.inputs
import
MultiModalKwargs
,
VideoItem
if
TYPE_CHECKING
:
from
vllm.config
import
ModelConfig
logger
=
init_logger
(
__name__
)
class
VideoPlugin
(
ImagePlugin
):
"""Plugin for video data."""
def
get_data_key
(
self
)
->
str
:
return
"video"
def
_get_hf_video_processor
(
self
,
model_config
:
"ModelConfig"
,
mm_processor_kwargs
:
Optional
[
dict
[
str
,
Any
]]
=
None
,
):
if
mm_processor_kwargs
is
None
:
mm_processor_kwargs
=
{}
return
cached_get_video_processor
(
model_config
.
model
,
trust_remote_code
=
model_config
.
trust_remote_code
,
**
mm_processor_kwargs
)
def
_default_input_mapper
(
self
,
ctx
:
InputContext
,
data
:
ModalityData
[
VideoItem
],
**
mm_processor_kwargs
,
)
->
MultiModalKwargs
:
model_config
=
ctx
.
model_config
if
isinstance
(
data
,
list
)
and
len
(
data
)
==
1
:
data
=
data
[
0
]
# type: ignore
if
isinstance
(
data
,
np
.
ndarray
)
or
is_list_of
(
data
,
np
.
ndarray
):
video_processor
=
self
.
_get_hf_video_processor
(
model_config
,
mm_processor_kwargs
,
)
if
video_processor
is
None
:
raise
RuntimeError
(
"No HuggingFace processor is available "
"to process the video object"
)
try
:
# NOTE: Similar to image; it may be a good idea to filter and
# pass mm_processor_kwargs here too, but for now we don't to
# avoid extra complexity if the initializer and preprocess
# signatures of the processor don't align
batch_data
=
video_processor
(
data
,
return_tensors
=
"pt"
).
data
except
Exception
:
logger
.
error
(
"Failed to process video (%s)"
,
data
)
raise
return
MultiModalKwargs
(
batch_data
)
raise
TypeError
(
f
"Invalid video type:
{
type
(
data
)
}
"
)
def
_default_max_multimodal_tokens
(
self
,
ctx
:
InputContext
)
->
int
:
return
4096
def
resize_video
(
frames
:
npt
.
NDArray
,
size
:
tuple
[
int
,
int
])
->
npt
.
NDArray
:
def
resize_video
(
frames
:
npt
.
NDArray
,
size
:
tuple
[
int
,
int
])
->
npt
.
NDArray
:
...
...
vllm/sequence.py
View file @
aec9674d
...
@@ -14,9 +14,9 @@ from typing import Any, Callable, Optional, Union
...
@@ -14,9 +14,9 @@ from typing import Any, Callable, Optional, Union
import
msgspec
import
msgspec
import
torch
import
torch
from
vllm.inputs
import
SingletonInputs
,
SingletonInputsAdapter
from
vllm.inputs
import
SingletonInputs
from
vllm.lora.request
import
LoRARequest
from
vllm.lora.request
import
LoRARequest
from
vllm.multimodal
import
MultiModal
DataDict
,
MultiModalPlaceholderDict
from
vllm.multimodal
import
MultiModal
Kwargs
,
MultiModalPlaceholderDict
from
vllm.pooling_params
import
PoolingParams
from
vllm.pooling_params
import
PoolingParams
from
vllm.prompt_adapter.request
import
PromptAdapterRequest
from
vllm.prompt_adapter.request
import
PromptAdapterRequest
from
vllm.sampling_params
import
RequestOutputKind
,
SamplingParams
from
vllm.sampling_params
import
RequestOutputKind
,
SamplingParams
...
@@ -419,7 +419,7 @@ class Sequence:
...
@@ -419,7 +419,7 @@ class Sequence:
prompt_adapter_request
:
Optional
[
PromptAdapterRequest
]
=
None
,
prompt_adapter_request
:
Optional
[
PromptAdapterRequest
]
=
None
,
)
->
None
:
)
->
None
:
self
.
seq_id
=
seq_id
self
.
seq_id
=
seq_id
self
.
inputs
=
SingletonInputsAdapter
(
inputs
)
self
.
inputs
=
inputs
self
.
block_size
=
block_size
self
.
block_size
=
block_size
self
.
eos_token_id
=
eos_token_id
self
.
eos_token_id
=
eos_token_id
self
.
lora_request
=
lora_request
self
.
lora_request
=
lora_request
...
@@ -448,31 +448,29 @@ class Sequence:
...
@@ -448,31 +448,29 @@ class Sequence:
@
property
@
property
def
prompt
(
self
)
->
Optional
[
str
]:
def
prompt
(
self
)
->
Optional
[
str
]:
return
self
.
inputs
.
prompt
return
self
.
inputs
.
get
(
"
prompt
"
)
@
property
@
property
def
prompt_token_ids
(
self
)
->
list
[
int
]:
def
prompt_token_ids
(
self
)
->
list
[
int
]:
return
self
.
inputs
.
prompt_token_ids
return
self
.
inputs
[
"prompt_token_ids"
]
@
property
def
prompt_embeds
(
self
)
->
Optional
[
torch
.
Tensor
]:
return
self
.
inputs
.
prompt_embeds
@
property
@
property
def
token_type_ids
(
self
)
->
list
[
int
]:
def
token_type_ids
(
self
)
->
list
[
int
]:
return
self
.
inputs
.
token_type_ids
return
self
.
inputs
.
get
(
"
token_type_ids
"
,
[])
@
property
@
property
def
multi_modal_data
(
self
)
->
"MultiModalDataDict"
:
def
multi_modal_data
(
self
)
->
MultiModalKwargs
:
return
self
.
inputs
.
multi_modal_data
if
self
.
inputs
[
"type"
]
==
"multimodal"
:
return
self
.
inputs
[
"mm_kwargs"
]
return
MultiModalKwargs
({})
@
property
@
property
def
multi_modal_placeholders
(
self
)
->
MultiModalPlaceholderDict
:
def
multi_modal_placeholders
(
self
)
->
MultiModalPlaceholderDict
:
return
self
.
inputs
.
multi_modal_placeholders
if
self
.
inputs
[
"type"
]
==
"multimodal"
:
return
self
.
inputs
[
"mm_placeholders"
]
@
property
return
{}
def
mm_processor_kwargs
(
self
)
->
dict
[
str
,
Any
]:
return
self
.
inputs
.
mm_processor_kwargs
@
property
@
property
def
lora_int_id
(
self
)
->
int
:
def
lora_int_id
(
self
)
->
int
:
...
@@ -723,12 +721,12 @@ class SequenceGroup:
...
@@ -723,12 +721,12 @@ class SequenceGroup:
return
self
.
first_seq
.
token_type_ids
return
self
.
first_seq
.
token_type_ids
@
property
@
property
def
multi_modal_data
(
self
)
->
MultiModal
DataDict
:
def
multi_modal_data
(
self
)
->
MultiModal
Kwargs
:
if
self
.
first_seq
.
multi_modal_data
:
if
self
.
first_seq
.
multi_modal_data
:
return
self
.
first_seq
.
multi_modal_data
return
self
.
first_seq
.
multi_modal_data
elif
self
.
encoder_seq
is
not
None
:
elif
self
.
encoder_seq
is
not
None
:
return
self
.
encoder_seq
.
multi_modal_data
return
self
.
encoder_seq
.
multi_modal_data
return
{}
return
MultiModalKwargs
(
{}
)
@
property
@
property
def
multi_modal_placeholders
(
self
)
->
MultiModalPlaceholderDict
:
def
multi_modal_placeholders
(
self
)
->
MultiModalPlaceholderDict
:
...
@@ -738,14 +736,6 @@ class SequenceGroup:
...
@@ -738,14 +736,6 @@ class SequenceGroup:
return
self
.
encoder_seq
.
multi_modal_placeholders
return
self
.
encoder_seq
.
multi_modal_placeholders
return
{}
return
{}
@
property
def
mm_processor_kwargs
(
self
)
->
dict
[
str
,
Any
]:
if
self
.
first_seq
.
multi_modal_data
:
return
self
.
first_seq
.
mm_processor_kwargs
elif
self
.
encoder_seq
is
not
None
:
return
self
.
encoder_seq
.
mm_processor_kwargs
return
{}
@
property
@
property
def
lora_int_id
(
self
)
->
int
:
def
lora_int_id
(
self
)
->
int
:
return
self
.
lora_request
.
lora_int_id
if
self
.
lora_request
else
0
return
self
.
lora_request
.
lora_int_id
if
self
.
lora_request
else
0
...
@@ -969,12 +959,9 @@ class SequenceGroupMetadata(
...
@@ -969,12 +959,9 @@ class SequenceGroupMetadata(
computed_block_nums
:
Optional
[
list
[
int
]]
=
None
computed_block_nums
:
Optional
[
list
[
int
]]
=
None
state
:
Optional
[
SequenceGroupState
]
=
msgspec
.
field
(
state
:
Optional
[
SequenceGroupState
]
=
msgspec
.
field
(
default_factory
=
lambda
:
SequenceGroupState
())
default_factory
=
lambda
:
SequenceGroupState
())
# "MultiModalDataDict" types. We have to use Any due to msgspec
# doesn't allow to have union of 2 different dicts.
token_type_ids
:
Optional
[
list
[
int
]]
=
None
token_type_ids
:
Optional
[
list
[
int
]]
=
None
multi_modal_data
:
Optional
[
Any
]
=
None
multi_modal_data
:
Optional
[
MultiModalKwargs
]
=
None
multi_modal_placeholders
:
Optional
[
MultiModalPlaceholderDict
]
=
None
multi_modal_placeholders
:
Optional
[
MultiModalPlaceholderDict
]
=
None
mm_processor_kwargs
:
Optional
[
dict
[
str
,
Any
]]
=
None
encoder_seq_data
:
Optional
[
SequenceData
]
=
None
encoder_seq_data
:
Optional
[
SequenceData
]
=
None
cross_block_table
:
Optional
[
list
[
int
]]
=
None
cross_block_table
:
Optional
[
list
[
int
]]
=
None
prompt_adapter_request
:
Optional
[
PromptAdapterRequest
]
=
None
prompt_adapter_request
:
Optional
[
PromptAdapterRequest
]
=
None
...
...
vllm/transformers_utils/processor.py
View file @
aec9674d
...
@@ -208,38 +208,3 @@ def cached_image_processor_from_config(
...
@@ -208,38 +208,3 @@ def cached_image_processor_from_config(
trust_remote_code
=
model_config
.
trust_remote_code
,
trust_remote_code
=
model_config
.
trust_remote_code
,
**
_merge_mm_kwargs
(
model_config
,
**
kwargs
),
**
_merge_mm_kwargs
(
model_config
,
**
kwargs
),
)
)
def
get_video_processor
(
processor_name
:
str
,
*
args
:
Any
,
trust_remote_code
:
bool
=
False
,
**
kwargs
:
Any
,
):
"""Load a video processor for the given model name via HuggingFace."""
# don't put this import at the top level
# it will call torch.cuda.device_count()
from
transformers.image_processing_utils
import
BaseImageProcessor
processor
=
get_processor
(
processor_name
,
*
args
,
trust_remote_code
=
trust_remote_code
,
**
kwargs
,
)
return
cast
(
BaseImageProcessor
,
processor
.
video_processor
)
cached_get_video_processor
=
lru_cache
(
get_video_processor
)
def
cached_video_processor_from_config
(
model_config
:
"ModelConfig"
,
**
kwargs
:
Any
,
):
return
cached_get_video_processor
(
model_config
.
model
,
trust_remote_code
=
model_config
.
trust_remote_code
,
**
_merge_mm_kwargs
(
model_config
,
**
kwargs
),
)
vllm/worker/cpu_model_runner.py
View file @
aec9674d
...
@@ -22,8 +22,8 @@ from vllm.model_executor.layers.rotary_embedding import MRotaryEmbedding
...
@@ -22,8 +22,8 @@ from vllm.model_executor.layers.rotary_embedding import MRotaryEmbedding
from
vllm.model_executor.layers.sampler
import
SamplerOutput
,
get_sampler
from
vllm.model_executor.layers.sampler
import
SamplerOutput
,
get_sampler
from
vllm.model_executor.model_loader
import
get_model
from
vllm.model_executor.model_loader
import
get_model
from
vllm.model_executor.models
import
supports_lora
,
supports_multimodal
from
vllm.model_executor.models
import
supports_lora
,
supports_multimodal
from
vllm.multimodal
import
(
MULTIMODAL_REGISTRY
,
BatchedTensorInputs
,
from
vllm.multimodal
import
(
BatchedTensorInputs
,
MultiModalKwargs
,
MultiModalKwargs
,
MultiModalPlaceholderMap
)
MultiModalPlaceholderMap
)
from
vllm.sequence
import
(
IntermediateTensors
,
SequenceData
,
from
vllm.sequence
import
(
IntermediateTensors
,
SequenceData
,
SequenceGroupMetadata
)
SequenceGroupMetadata
)
from
vllm.worker.model_runner_base
import
(
from
vllm.worker.model_runner_base
import
(
...
@@ -154,7 +154,6 @@ class ModelInputForCPUBuilder(ModelRunnerInputBuilderBase[ModelInputForCPU]):
...
@@ -154,7 +154,6 @@ class ModelInputForCPUBuilder(ModelRunnerInputBuilderBase[ModelInputForCPU]):
self
.
sliding_window
=
self
.
runner
.
sliding_window
self
.
sliding_window
=
self
.
runner
.
sliding_window
self
.
block_size
=
self
.
runner
.
block_size
self
.
block_size
=
self
.
runner
.
block_size
self
.
device
=
self
.
runner
.
device
self
.
device
=
self
.
runner
.
device
self
.
multi_modal_input_mapper
=
self
.
runner
.
multi_modal_input_mapper
self
.
enable_lora
=
self
.
runner
.
lora_config
is
not
None
self
.
enable_lora
=
self
.
runner
.
lora_config
is
not
None
if
self
.
runner
.
attn_backend
is
not
None
:
if
self
.
runner
.
attn_backend
is
not
None
:
# spec decode (e.g. Medusa) does not have atten backend
# spec decode (e.g. Medusa) does not have atten backend
...
@@ -359,22 +358,14 @@ class ModelInputForCPUBuilder(ModelRunnerInputBuilderBase[ModelInputForCPU]):
...
@@ -359,22 +358,14 @@ class ModelInputForCPUBuilder(ModelRunnerInputBuilderBase[ModelInputForCPU]):
computed_len
=
seq_data
.
get_num_computed_tokens
()
computed_len
=
seq_data
.
get_num_computed_tokens
()
seq_len
=
self
.
input_data
.
seq_lens
[
-
1
]
seq_len
=
self
.
input_data
.
seq_lens
[
-
1
]
# NOTE: mm_
data
only includes the subset of multi-modal items that
# NOTE: mm_
kwargs
only includes the subset of multi-modal items that
# intersect with the current prefill positions.
# intersect with the current prefill positions.
mm_
data
,
placeholder_maps
=
MultiModalPlaceholderMap
.
from_seq_group
(
mm_
kwargs
,
placeholder_maps
=
MultiModalPlaceholderMap
.
from_seq_group
(
seq_group_metadata
,
range
(
computed_len
,
seq_len
))
seq_group_metadata
,
range
(
computed_len
,
seq_len
))
if
not
mm_
data
:
if
not
mm_
kwargs
:
return
return
if
self
.
runner
.
mm_registry
.
has_processor
(
self
.
runner
.
model_config
):
mm_kwargs
=
mm_data
else
:
mm_kwargs
=
self
.
multi_modal_input_mapper
(
mm_data
,
seq_group_metadata
.
mm_processor_kwargs
,
)
# special processing for mrope position deltas.
# special processing for mrope position deltas.
if
self
.
runner
.
model_config
.
uses_mrope
:
if
self
.
runner
.
model_config
.
uses_mrope
:
assert
not
self
.
chunked_prefill
,
\
assert
not
self
.
chunked_prefill
,
\
...
@@ -480,12 +471,6 @@ class CPUModelRunnerBase(ModelRunnerBase[TModelInputForCPU]):
...
@@ -480,12 +471,6 @@ class CPUModelRunnerBase(ModelRunnerBase[TModelInputForCPU]):
use_mla
=
self
.
model_config
.
use_mla
,
use_mla
=
self
.
model_config
.
use_mla
,
)
if
needs_attn_backend
else
None
)
if
needs_attn_backend
else
None
# Multi-modal data support
self
.
mm_registry
=
MULTIMODAL_REGISTRY
self
.
multi_modal_input_mapper
=
self
.
mm_registry
\
.
create_input_mapper
(
self
.
model_config
)
self
.
mm_registry
.
init_mm_limits_per_prompt
(
self
.
model_config
)
# Lazy initialization.
# Lazy initialization.
self
.
model
:
nn
.
Module
# Set after init_Model
self
.
model
:
nn
.
Module
# Set after init_Model
# Set after load_model.
# Set after load_model.
...
...
vllm/worker/enc_dec_model_runner.py
View file @
aec9674d
...
@@ -100,6 +100,8 @@ class EncoderDecoderModelRunner(GPUModelRunnerBase[EncoderDecoderModelInput]):
...
@@ -100,6 +100,8 @@ class EncoderDecoderModelRunner(GPUModelRunnerBase[EncoderDecoderModelInput]):
vllm_config
=
vllm_config
,
vllm_config
=
vllm_config
,
kv_cache_dtype
=
kv_cache_dtype
,
kv_cache_dtype
=
kv_cache_dtype
,
is_driver_worker
=
is_driver_worker
,
is_driver_worker
=
is_driver_worker
,
input_registry
=
input_registry
,
mm_registry
=
mm_registry
,
)
)
# Crash for unsupported encoder/scenarios
# Crash for unsupported encoder/scenarios
...
...
vllm/worker/hpu_model_runner.py
View file @
aec9674d
...
@@ -45,8 +45,7 @@ from vllm.model_executor.layers.vocab_parallel_embedding import (
...
@@ -45,8 +45,7 @@ from vllm.model_executor.layers.vocab_parallel_embedding import (
VocabParallelEmbedding
)
VocabParallelEmbedding
)
from
vllm.model_executor.model_loader
import
get_model
from
vllm.model_executor.model_loader
import
get_model
from
vllm.model_executor.sampling_metadata
import
SequenceGroupToSample
from
vllm.model_executor.sampling_metadata
import
SequenceGroupToSample
from
vllm.multimodal
import
(
MULTIMODAL_REGISTRY
,
BatchedTensorInputs
,
from
vllm.multimodal
import
BatchedTensorInputs
,
MultiModalKwargs
MultiModalKwargs
)
from
vllm.sampling_params
import
SamplingParams
from
vllm.sampling_params
import
SamplingParams
from
vllm.sequence
import
(
CompletionSequenceGroupOutput
,
IntermediateTensors
,
from
vllm.sequence
import
(
CompletionSequenceGroupOutput
,
IntermediateTensors
,
Logprob
,
SequenceData
,
SequenceGroupMetadata
,
Logprob
,
SequenceData
,
SequenceGroupMetadata
,
...
@@ -545,10 +544,6 @@ class HPUModelRunnerBase(ModelRunnerBase[TModelInputForHPU]):
...
@@ -545,10 +544,6 @@ class HPUModelRunnerBase(ModelRunnerBase[TModelInputForHPU]):
]
]
gc
.
set_threshold
(
*
requested_gc_thrs
)
gc
.
set_threshold
(
*
requested_gc_thrs
)
# Multi-modal data support
self
.
multi_modal_input_mapper
=
MULTIMODAL_REGISTRY
\
.
create_input_mapper
(
self
.
model_config
)
self
.
skip_warmup
=
os
.
environ
.
get
(
'VLLM_SKIP_WARMUP'
,
self
.
skip_warmup
=
os
.
environ
.
get
(
'VLLM_SKIP_WARMUP'
,
'false'
).
lower
()
==
'true'
'false'
).
lower
()
==
'true'
...
@@ -731,9 +726,8 @@ class HPUModelRunnerBase(ModelRunnerBase[TModelInputForHPU]):
...
@@ -731,9 +726,8 @@ class HPUModelRunnerBase(ModelRunnerBase[TModelInputForHPU]):
# is always the first token in the sequence.
# is always the first token in the sequence.
input_positions
.
append
(
list
(
range
(
context_len
,
seq_len
)))
input_positions
.
append
(
list
(
range
(
context_len
,
seq_len
)))
mm_data
=
seq_group_metadata
.
multi_modal_data
mm_kwargs
=
seq_group_metadata
.
multi_modal_data
if
mm_data
:
if
mm_kwargs
:
mm_kwargs
=
self
.
multi_modal_input_mapper
(
mm_data
)
multi_modal_kwargs_list
.
append
(
mm_kwargs
)
multi_modal_kwargs_list
.
append
(
mm_kwargs
)
if
seq_group_metadata
.
block_tables
is
None
:
if
seq_group_metadata
.
block_tables
is
None
:
...
...
vllm/worker/model_runner.py
View file @
aec9674d
...
@@ -457,7 +457,6 @@ class ModelInputForGPUBuilder(ModelRunnerInputBuilderBase[ModelInputForGPU]):
...
@@ -457,7 +457,6 @@ class ModelInputForGPUBuilder(ModelRunnerInputBuilderBase[ModelInputForGPU]):
self
.
enable_lora
=
self
.
runner
.
lora_config
is
not
None
self
.
enable_lora
=
self
.
runner
.
lora_config
is
not
None
self
.
enable_prompt_adapter
=
(
self
.
runner
.
prompt_adapter_config
self
.
enable_prompt_adapter
=
(
self
.
runner
.
prompt_adapter_config
is
not
None
)
is
not
None
)
self
.
multi_modal_input_mapper
=
self
.
runner
.
multi_modal_input_mapper
# Attention metadata inputs.
# Attention metadata inputs.
if
self
.
attn_backend
is
not
None
:
if
self
.
attn_backend
is
not
None
:
...
@@ -675,23 +674,15 @@ class ModelInputForGPUBuilder(ModelRunnerInputBuilderBase[ModelInputForGPU]):
...
@@ -675,23 +674,15 @@ class ModelInputForGPUBuilder(ModelRunnerInputBuilderBase[ModelInputForGPU]):
def
_compute_multi_modal_input
(
self
,
inter_data
:
InterDataForSeqGroup
,
def
_compute_multi_modal_input
(
self
,
inter_data
:
InterDataForSeqGroup
,
seq_group_metadata
:
SequenceGroupMetadata
):
seq_group_metadata
:
SequenceGroupMetadata
):
"""If multi-modal data is given, add it to the input."""
"""If multi-modal data is given, add it to the input."""
# NOTE: mm_
data
only includes the subset of multi-modal items that
# NOTE: mm_
kwargs
only includes the subset of multi-modal items that
# intersect with the current prefill positions.
# intersect with the current prefill positions.
positions
=
inter_data
.
input_positions
[
0
]
positions
=
inter_data
.
input_positions
[
0
]
mm_
data
,
placeholder_maps
=
MultiModalPlaceholderMap
.
from_seq_group
(
mm_
kwargs
,
placeholder_maps
=
MultiModalPlaceholderMap
.
from_seq_group
(
seq_group_metadata
,
seq_group_metadata
,
range
(
positions
[
0
],
positions
[
0
]
+
len
(
positions
)))
range
(
positions
[
0
],
positions
[
0
]
+
len
(
positions
)))
if
not
mm_
data
:
if
not
mm_
kwargs
:
return
return
if
self
.
runner
.
mm_registry
.
has_processor
(
self
.
runner
.
model_config
):
mm_kwargs
=
mm_data
else
:
mm_kwargs
=
self
.
multi_modal_input_mapper
(
mm_data
,
seq_group_metadata
.
mm_processor_kwargs
,
)
inter_data
.
multi_modal_kwargs
=
mm_kwargs
inter_data
.
multi_modal_kwargs
=
mm_kwargs
inter_data
.
multi_modal_placeholder_maps
=
placeholder_maps
inter_data
.
multi_modal_placeholder_maps
=
placeholder_maps
...
@@ -1085,9 +1076,6 @@ class GPUModelRunnerBase(ModelRunnerBase[TModelInputForGPU]):
...
@@ -1085,9 +1076,6 @@ class GPUModelRunnerBase(ModelRunnerBase[TModelInputForGPU]):
# Multi-modal data support
# Multi-modal data support
self
.
input_registry
=
input_registry
self
.
input_registry
=
input_registry
self
.
mm_registry
=
mm_registry
self
.
mm_registry
=
mm_registry
self
.
multi_modal_input_mapper
=
mm_registry
\
.
create_input_mapper
(
model_config
)
self
.
mm_registry
.
init_mm_limits_per_prompt
(
self
.
model_config
)
# Lazy initialization
# Lazy initialization
self
.
model
:
nn
.
Module
# Set after load_model
self
.
model
:
nn
.
Module
# Set after load_model
...
@@ -1327,8 +1315,8 @@ class GPUModelRunnerBase(ModelRunnerBase[TModelInputForGPU]):
...
@@ -1327,8 +1315,8 @@ class GPUModelRunnerBase(ModelRunnerBase[TModelInputForGPU]):
dummy_data
=
self
.
input_registry
\
dummy_data
=
self
.
input_registry
\
.
dummy_data_for_profiling
(
self
.
model_config
,
.
dummy_data_for_profiling
(
self
.
model_config
,
seq_len
,
seq_len
,
self
.
mm_registry
)
self
.
mm_registry
)
seq
=
SequenceGroupMetadata
(
seq
=
SequenceGroupMetadata
(
request_id
=
str
(
group_id
),
request_id
=
str
(
group_id
),
...
...
vllm/worker/neuron_model_runner.py
View file @
aec9674d
...
@@ -15,8 +15,7 @@ from vllm.logger import init_logger
...
@@ -15,8 +15,7 @@ from vllm.logger import init_logger
from
vllm.model_executor
import
SamplingMetadata
from
vllm.model_executor
import
SamplingMetadata
from
vllm.model_executor.layers.sampler
import
SamplerOutput
from
vllm.model_executor.layers.sampler
import
SamplerOutput
from
vllm.model_executor.model_loader.neuron
import
get_neuron_model
from
vllm.model_executor.model_loader.neuron
import
get_neuron_model
from
vllm.multimodal
import
(
MULTIMODAL_REGISTRY
,
BatchedTensorInputs
,
from
vllm.multimodal
import
BatchedTensorInputs
,
MultiModalKwargs
MultiModalKwargs
)
from
vllm.sequence
import
IntermediateTensors
,
SequenceGroupMetadata
from
vllm.sequence
import
IntermediateTensors
,
SequenceGroupMetadata
from
vllm.utils
import
is_pin_memory_available
,
make_tensor_with_pad
from
vllm.utils
import
is_pin_memory_available
,
make_tensor_with_pad
from
vllm.worker.model_runner_base
import
ModelRunnerBase
,
ModelRunnerInputBase
from
vllm.worker.model_runner_base
import
ModelRunnerBase
,
ModelRunnerInputBase
...
@@ -69,11 +68,6 @@ class NeuronModelRunner(ModelRunnerBase[ModelInputForNeuron]):
...
@@ -69,11 +68,6 @@ class NeuronModelRunner(ModelRunnerBase[ModelInputForNeuron]):
self
.
device
=
self
.
device_config
.
device
self
.
device
=
self
.
device_config
.
device
self
.
pin_memory
=
is_pin_memory_available
()
self
.
pin_memory
=
is_pin_memory_available
()
# Multi-modal data support
self
.
mm_registry
=
MULTIMODAL_REGISTRY
self
.
multi_modal_input_mapper
=
self
.
mm_registry
\
.
create_input_mapper
(
self
.
model_config
)
# Lazy initialization.
# Lazy initialization.
self
.
model
:
nn
.
Module
# initialize after load_model.
self
.
model
:
nn
.
Module
# initialize after load_model.
...
@@ -149,16 +143,8 @@ class NeuronModelRunner(ModelRunnerBase[ModelInputForNeuron]):
...
@@ -149,16 +143,8 @@ class NeuronModelRunner(ModelRunnerBase[ModelInputForNeuron]):
assert
len
(
block_table
)
==
1
assert
len
(
block_table
)
==
1
input_block_ids
.
append
(
block_table
[
0
])
input_block_ids
.
append
(
block_table
[
0
])
mm_data
=
seq_group_metadata
.
multi_modal_data
mm_kwargs
=
seq_group_metadata
.
multi_modal_data
if
mm_data
:
if
mm_kwargs
:
if
self
.
mm_registry
.
has_processor
(
self
.
model_config
):
mm_kwargs
=
mm_data
else
:
mm_kwargs
=
self
.
multi_modal_input_mapper
(
mm_data
,
seq_group_metadata
.
mm_processor_kwargs
,
)
multi_modal_kwargs_list
.
append
(
mm_kwargs
)
multi_modal_kwargs_list
.
append
(
mm_kwargs
)
max_seq_len
=
max
(
seq_lens
)
max_seq_len
=
max
(
seq_lens
)
...
...
Prev
1
2
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment