Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
0b8bb86b
Unverified
Commit
0b8bb86b
authored
Nov 13, 2024
by
Cyrus Leung
Committed by
GitHub
Nov 13, 2024
Browse files
[1/N] Initial prototype for multi-modal processor (#10044)
Signed-off-by:
DarkLight1337
<
tlleungac@connect.ust.hk
>
parent
bb7991aa
Changes
48
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
329 additions
and
87 deletions
+329
-87
docs/source/models/enabling_multimodal_inputs.rst
docs/source/models/enabling_multimodal_inputs.rst
+1
-1
tests/models/decoder_only/vision_language/mm_processor_kwargs/test_qwen.py
...der_only/vision_language/mm_processor_kwargs/test_qwen.py
+1
-1
tests/multimodal/test_inputs.py
tests/multimodal/test_inputs.py
+1
-1
tests/multimodal/test_processor_kwargs.py
tests/multimodal/test_processor_kwargs.py
+23
-14
tests/v1/core/test_prefix_caching.py
tests/v1/core/test_prefix_caching.py
+2
-2
vllm/config.py
vllm/config.py
+1
-1
vllm/engine/async_llm_engine.py
vllm/engine/async_llm_engine.py
+4
-0
vllm/engine/llm_engine.py
vllm/engine/llm_engine.py
+6
-10
vllm/engine/multiprocessing/client.py
vllm/engine/multiprocessing/client.py
+6
-0
vllm/engine/protocol.py
vllm/engine/protocol.py
+11
-5
vllm/entrypoints/openai/serving_chat.py
vllm/entrypoints/openai/serving_chat.py
+0
-1
vllm/entrypoints/openai/serving_completion.py
vllm/entrypoints/openai/serving_completion.py
+0
-1
vllm/inputs/__init__.py
vllm/inputs/__init__.py
+8
-4
vllm/inputs/data.py
vllm/inputs/data.py
+88
-11
vllm/inputs/preprocess.py
vllm/inputs/preprocess.py
+122
-21
vllm/inputs/registry.py
vllm/inputs/registry.py
+50
-6
vllm/model_executor/models/chatglm.py
vllm/model_executor/models/chatglm.py
+2
-2
vllm/model_executor/models/fuyu.py
vllm/model_executor/models/fuyu.py
+1
-2
vllm/model_executor/models/h2ovl.py
vllm/model_executor/models/h2ovl.py
+1
-2
vllm/model_executor/models/internvl.py
vllm/model_executor/models/internvl.py
+1
-2
No files found.
docs/source/models/enabling_multimodal_inputs.rst
View file @
0b8bb86b
...
...
@@ -66,7 +66,7 @@ A default mapper is available for each modality in the core vLLM library. This i
3. Register maximum number of multi-modal tokens
------------------------------------------------
For each modality type that the model accepts as input, calculate the maximum possible number of tokens per data instance
For each modality type that the model accepts as input, calculate the maximum possible number of tokens per data item
and register it via :meth:`INPUT_REGISTRY.register_dummy_data <vllm.inputs.registry.InputRegistry.register_max_multimodal_tokens>`.
.. code-block:: diff
...
...
tests/models/decoder_only/vision_language/mm_processor_kwargs/test_qwen.py
View file @
0b8bb86b
...
...
@@ -6,7 +6,7 @@ import torch
from
PIL.Image
import
Image
from
vllm.inputs
import
InputContext
,
token_inputs
from
vllm.multimodal
.base
import
MultiModalKwargs
from
vllm.multimodal
import
MultiModalKwargs
from
vllm.multimodal.utils
import
cached_get_tokenizer
from
.....conftest
import
IMAGE_ASSETS
...
...
tests/multimodal/test_
base
.py
→
tests/multimodal/test_
inputs
.py
View file @
0b8bb86b
import
torch
from
vllm.multimodal.
base
import
MultiModalKwargs
,
NestedTensors
from
vllm.multimodal.
inputs
import
MultiModalKwargs
,
NestedTensors
def
assert_nested_tensors_equal
(
expected
:
NestedTensors
,
...
...
tests/multimodal/test_processor_kwargs.py
View file @
0b8bb86b
from
array
import
array
from
typing
import
Mapping
from
typing
import
Callable
,
Dict
,
Mapping
,
Optional
from
unittest.mock
import
patch
import
pytest
import
torch
from
vllm.inputs
import
(
DecoderOnlyInputs
,
DummyData
,
InputContext
,
InputRegistry
,
token_inputs
)
InputRegistry
,
ProcessorInputs
,
token_inputs
)
from
vllm.multimodal
import
MultiModalRegistry
from
vllm.sequence
import
VLLM_TOKEN_ID_ARRAY_TYPE
,
SequenceData
...
...
@@ -34,10 +34,9 @@ def use_processor_mock():
inputs
:
DecoderOnlyInputs
,
*
,
num_crops
=
DEFAULT_NUM_CROPS
):
# For testing purposes, we don't worry about the llm inputs / return
# type validation, and just return the value of the kwarg that we
# clobber.
return
num_crops
# For testing purposes, we don't worry about the prompt
return
token_inputs
(
prompt_token_ids
=
[],
mm_processor_kwargs
=
{
"num_crops"
:
num_crops
})
with
patch
(
"vllm.inputs.registry.InputRegistry._get_model_input_processor"
,
return_value
=
custom_processor
):
...
...
@@ -109,6 +108,21 @@ def _get_num_crops_info(init_num_crops: int, inference_num_crops: int):
return
init_kwargs
,
inference_kwargs
,
expected_seq_count
def
_get_processed_num_crops
(
processor
:
Callable
[[
ProcessorInputs
],
ProcessorInputs
],
inference_kwargs
:
Optional
[
Dict
[
str
,
int
]],
)
->
int
:
processed_inputs
=
processor
(
token_inputs
(
prompt_token_ids
=
[],
prompt
=
""
,
mm_processor_kwargs
=
inference_kwargs
))
assert
"type"
in
processed_inputs
assert
processed_inputs
[
"type"
]
==
"token"
assert
"mm_processor_kwargs"
in
processed_inputs
return
processed_inputs
[
"mm_processor_kwargs"
][
"num_crops"
]
@
pytest
.
mark
.
parametrize
(
"init_num_crops,inference_num_crops"
,
[
(
None
,
None
),
(
NUM_CROPS_OVERRIDE
,
None
),
...
...
@@ -124,10 +138,8 @@ def test_input_processor_kwargs(use_processor_mock, init_num_crops,
ctx
=
build_model_context
(
DUMMY_MODEL_ID
,
mm_processor_kwargs
=
init_kwargs
)
processor
=
dummy_registry
.
create_input_processor
(
ctx
.
model_config
)
num_crops_val
=
processor
(
token_inputs
(
prompt_token_ids
=
[],
prompt
=
""
,
mm_processor_kwargs
=
inference_kwargs
))
num_crops_val
=
_get_processed_num_crops
(
processor
,
inference_kwargs
)
assert
num_crops_val
==
expected_seq_count
...
...
@@ -153,10 +165,7 @@ def test_processor_with_sad_kwarg_overrides(use_processor_mock,
processor
=
dummy_registry
.
create_input_processor
(
ctx
.
model_config
)
# Should filter out the inference time kwargs
num_crops_val
=
processor
(
token_inputs
(
prompt_token_ids
=
[],
prompt
=
""
,
mm_processor_kwargs
=
mm_processor_kwargs
))
num_crops_val
=
_get_processed_num_crops
(
processor
,
mm_processor_kwargs
)
assert
num_crops_val
==
DEFAULT_NUM_CROPS
...
...
tests/v1/core/test_prefix_caching.py
View file @
0b8bb86b
"""Compare the with and without prefix caching."""
from
vllm.inputs
import
DecoderOnlyI
nputs
from
vllm.inputs
import
token_i
nputs
from
vllm.sampling_params
import
SamplingParams
from
vllm.v1.core.kv_cache_manager
import
KVCacheManager
,
Request
from
vllm.v1.core.kv_cache_utils
import
hash_block_tokens
...
...
@@ -8,7 +8,7 @@ from vllm.v1.core.kv_cache_utils import hash_block_tokens
def
make_request
(
request_id
,
prompt_token_ids
):
return
Request
(
request_id
=
request_id
,
inputs
=
DecoderOnlyI
nputs
(
prompt_token_ids
=
prompt_token_ids
),
inputs
=
token_i
nputs
(
prompt_token_ids
=
prompt_token_ids
),
sampling_params
=
SamplingParams
(
max_tokens
=
17
),
eos_token_id
=
100
,
arrival_time
=
0
,
...
...
vllm/config.py
View file @
0b8bb86b
...
...
@@ -107,7 +107,7 @@ class ModelConfig:
matches the model name exposed via the APIs. If multiple model
names provided, the first name will be used. If not specified,
the model name will be the same as `model`.
limit_mm_per_prompt: Maximum number of data instances per modality
limit_mm_per_prompt: Maximum number of data items per modality
per prompt. Only applicable for multimodal models.
override_neuron_config: Initialize non default neuron config or
override default neuron config that are specific to Neuron devices,
...
...
vllm/engine/async_llm_engine.py
View file @
0b8bb86b
...
...
@@ -19,6 +19,7 @@ from vllm.executor.executor_base import ExecutorAsyncBase
from
vllm.executor.gpu_executor
import
GPUExecutorAsync
from
vllm.executor.ray_utils
import
initialize_ray_cluster
from
vllm.inputs
import
PromptType
from
vllm.inputs.preprocess
import
InputPreprocessor
from
vllm.logger
import
init_logger
from
vllm.lora.request
import
LoRARequest
from
vllm.model_executor.guided_decoding
import
(
...
...
@@ -729,6 +730,9 @@ class AsyncLLMEngine(EngineClient):
self
.
set_errored
(
exc
)
self
.
_request_tracker
.
propagate_exception
(
exc
)
async
def
get_input_preprocessor
(
self
)
->
InputPreprocessor
:
return
self
.
engine
.
input_preprocessor
async
def
get_tokenizer
(
self
,
lora_request
:
Optional
[
LoRARequest
]
=
None
,
...
...
vllm/engine/llm_engine.py
View file @
0b8bb86b
...
...
@@ -30,7 +30,7 @@ from vllm.executor.executor_base import ExecutorBase
from
vllm.executor.gpu_executor
import
GPUExecutor
from
vllm.executor.ray_utils
import
initialize_ray_cluster
from
vllm.inputs
import
(
INPUT_REGISTRY
,
InputRegistry
,
ProcessorInputs
,
PromptType
)
PromptType
,
SingletonInputsAdapter
)
from
vllm.inputs.parse
import
is_encoder_decoder_inputs
,
is_token_prompt
from
vllm.inputs.preprocess
import
InputPreprocessor
from
vllm.logger
import
init_logger
...
...
@@ -39,6 +39,7 @@ from vllm.lora.request import LoRARequest
from
vllm.model_executor.guided_decoding
import
(
get_local_guided_decoding_logits_processor
)
from
vllm.model_executor.layers.sampler
import
SamplerOutput
from
vllm.multimodal
import
MULTIMODAL_REGISTRY
,
MultiModalRegistry
from
vllm.outputs
import
(
EmbeddingRequestOutput
,
RequestOutput
,
RequestOutputFactory
)
from
vllm.pooling_params
import
PoolingParams
...
...
@@ -226,6 +227,7 @@ class LLMEngine:
usage_context
:
UsageContext
=
UsageContext
.
ENGINE_CONTEXT
,
stat_loggers
:
Optional
[
Dict
[
str
,
StatLoggerBase
]]
=
None
,
input_registry
:
InputRegistry
=
INPUT_REGISTRY
,
mm_registry
:
MultiModalRegistry
=
MULTIMODAL_REGISTRY
,
use_cached_outputs
:
bool
=
False
,
)
->
None
:
...
...
@@ -335,7 +337,8 @@ class LLMEngine:
model_config
)
self
.
input_preprocessor
=
InputPreprocessor
(
model_config
,
self
.
tokenizer
)
self
.
tokenizer
,
mm_registry
)
self
.
input_registry
=
input_registry
self
.
input_processor
=
input_registry
.
create_input_processor
(
...
...
@@ -851,13 +854,6 @@ class LLMEngine:
)
processed_inputs
=
self
.
input_processor
(
preprocessed_inputs
)
# This is a bit of a hack - copy the mm_processor_kwargs that were
# used in the input processor to the processed output, since these
# kwargs are presumed to be immutable and the values should be aligned
# between the input processor (here) and the input mapper.
processed_inputs
[
"mm_processor_kwargs"
]
=
preprocessed_inputs
.
get
(
"mm_processor_kwargs"
)
self
.
_add_processed_request
(
request_id
=
request_id
,
processed_inputs
=
processed_inputs
,
...
...
@@ -2019,7 +2015,7 @@ class LLMEngine:
else
:
prompt_inputs
=
inputs
prompt_ids
=
prompt_inputs
.
get
(
"
prompt_token_ids
"
)
prompt_ids
=
SingletonInputsAdapter
(
prompt_inputs
)
.
prompt_token_ids
if
prompt_ids
is
None
or
len
(
prompt_ids
)
==
0
:
raise
ValueError
(
"Prompt cannot be empty"
)
...
...
vllm/engine/multiprocessing/client.py
View file @
0b8bb86b
...
...
@@ -31,6 +31,7 @@ from vllm.engine.protocol import EngineClient
# yapf: enable
from
vllm.envs
import
VLLM_RPC_TIMEOUT
from
vllm.inputs
import
PromptType
from
vllm.inputs.preprocess
import
InputPreprocessor
from
vllm.logger
import
init_logger
from
vllm.lora.request
import
LoRARequest
from
vllm.model_executor.layers.sampler
import
SamplerOutput
...
...
@@ -94,6 +95,8 @@ class MQLLMEngineClient(EngineClient):
parallel_config
=
engine_config
.
parallel_config
,
enable_lora
=
bool
(
engine_config
.
lora_config
),
)
self
.
input_preprocessor
=
InputPreprocessor
(
self
.
model_config
,
self
.
tokenizer
)
# Send RPCGenerateRequest to the MQLLMEngine.
self
.
input_socket
:
Socket
=
self
.
context
.
socket
(
zmq
.
constants
.
PUSH
)
...
...
@@ -345,6 +348,9 @@ class MQLLMEngineClient(EngineClient):
or
response
!=
VLLM_RPC_SUCCESS_STR
):
raise
ValueError
(
error_message
)
async
def
get_input_preprocessor
(
self
)
->
InputPreprocessor
:
return
self
.
input_preprocessor
async
def
get_tokenizer
(
self
,
lora_request
:
Optional
[
LoRARequest
]
=
None
):
return
await
self
.
tokenizer
.
get_lora_tokenizer_async
(
lora_request
)
...
...
vllm/engine/protocol.py
View file @
0b8bb86b
...
...
@@ -62,7 +62,6 @@ class EngineClient(ABC):
async
def
beam_search
(
self
,
prompt
:
PromptType
,
model_config
:
ModelConfig
,
request_id
:
str
,
params
:
BeamSearchParams
,
)
->
AsyncGenerator
[
RequestOutput
,
None
]:
...
...
@@ -74,13 +73,14 @@ class EngineClient(ABC):
length_penalty
=
params
.
length_penalty
include_stop_str_in_output
=
params
.
include_stop_str_in_output
tokenizer
=
await
self
.
get_tokenizer
()
input_preprocessor
=
InputPreprocessor
(
model_config
,
tokenizer
)
preprocessor
=
await
self
.
get_input_preprocessor
()
tokenizer_group
=
preprocessor
.
get_tokenizer_group
()
tokenizer
=
await
tokenizer_group
.
get_lora_tokenizer_async
()
if
is_explicit_encoder_decoder_prompt
(
prompt
):
raise
NotImplementedError
else
:
processed_inputs
=
input_
preprocessor
.
_prompt_to_llm_inputs
(
processed_inputs
=
preprocessor
.
_prompt_to_llm_inputs
(
prompt
,
request_id
=
request_id
,
)
...
...
@@ -220,6 +220,7 @@ class EngineClient(ABC):
Args:
request_id: The unique id of the request.
"""
...
@
abstractmethod
async
def
get_model_config
(
self
)
->
ModelConfig
:
...
...
@@ -228,8 +229,13 @@ class EngineClient(ABC):
@
abstractmethod
async
def
get_decoding_config
(
self
)
->
DecodingConfig
:
...
"""Get the decoding configuration of the vLLM engine."""
...
@
abstractmethod
async
def
get_input_preprocessor
(
self
)
->
InputPreprocessor
:
"""Get the input processor of the vLLM engine."""
...
@
abstractmethod
async
def
get_tokenizer
(
...
...
vllm/entrypoints/openai/serving_chat.py
View file @
0b8bb86b
...
...
@@ -190,7 +190,6 @@ class OpenAIServingChat(OpenAIServing):
if
isinstance
(
sampling_params
,
BeamSearchParams
):
generator
=
self
.
engine_client
.
beam_search
(
prompt
=
engine_prompt
,
model_config
=
self
.
model_config
,
request_id
=
request_id
,
params
=
sampling_params
,
)
...
...
vllm/entrypoints/openai/serving_completion.py
View file @
0b8bb86b
...
...
@@ -140,7 +140,6 @@ class OpenAIServingCompletion(OpenAIServing):
if
isinstance
(
sampling_params
,
BeamSearchParams
):
generator
=
self
.
engine_client
.
beam_search
(
prompt
=
engine_prompt
,
model_config
=
self
.
model_config
,
request_id
=
request_id
,
params
=
sampling_params
,
)
...
...
vllm/inputs/__init__.py
View file @
0b8bb86b
from
.data
import
(
DecoderOnlyInputs
,
EncoderDecoderInputs
,
ExplicitEncoderDecoderPrompt
,
ProcessorInputs
,
PromptType
,
SingletonInputs
,
SingletonPrompt
,
TextPrompt
,
TokenInputs
,
TokensPrompt
,
build_explicit_enc_dec_prompt
,
to_enc_dec_tuple_list
,
token_inputs
,
zip_enc_dec_prompts
)
from
.registry
import
DummyData
,
InputContext
,
InputRegistry
SingletonInputs
,
SingletonInputsAdapter
,
SingletonPrompt
,
TextPrompt
,
TokenInputs
,
TokensPrompt
,
build_explicit_enc_dec_prompt
,
to_enc_dec_tuple_list
,
token_inputs
,
zip_enc_dec_prompts
)
from
.registry
import
(
DummyData
,
InputContext
,
InputProcessingContext
,
InputRegistry
)
INPUT_REGISTRY
=
InputRegistry
()
"""
...
...
@@ -26,12 +28,14 @@ __all__ = [
"EncoderDecoderInputs"
,
"ProcessorInputs"
,
"SingletonInputs"
,
"SingletonInputsAdapter"
,
"build_explicit_enc_dec_prompt"
,
"to_enc_dec_tuple_list"
,
"zip_enc_dec_prompts"
,
"INPUT_REGISTRY"
,
"DummyData"
,
"InputContext"
,
"InputProcessingContext"
,
"InputRegistry"
,
]
...
...
vllm/inputs/data.py
View file @
0b8bb86b
from
dataclasses
import
dataclass
from
functools
import
cached_property
from
typing
import
(
TYPE_CHECKING
,
Any
,
Dict
,
Generic
,
Iterable
,
List
,
Literal
,
Optional
,
Tuple
,
Union
,
cast
)
from
typing_extensions
import
NotRequired
,
TypedDict
,
TypeVar
import
torch
from
typing_extensions
import
NotRequired
,
TypedDict
,
TypeVar
,
assert_never
if
TYPE_CHECKING
:
from
vllm.multimodal
import
MultiModalDataDict
,
MultiModalPlaceholderDict
from
vllm.multimodal.inputs
import
MultiModalInputsV2
class
TextPrompt
(
TypedDict
):
...
...
@@ -36,13 +40,13 @@ class TokensPrompt(TypedDict):
multi_modal_data
:
NotRequired
[
"MultiModalDataDict"
]
"""
Optional multi-modal data to pass to the model,
DEPRECATED:
Optional multi-modal data to pass to the model,
if the model supports it.
"""
mm_processor_kwargs
:
NotRequired
[
Dict
[
str
,
Any
]]
"""
Optional multi-modal processor kwargs to be forwarded to the
DEPRECATED:
Optional multi-modal processor kwargs to be forwarded to the
multimodal input mapper & processor. Note that if multiple modalities
have registered mappers etc for the model being considered, we attempt
to pass the mm_processor_kwargs to each of them.
...
...
@@ -176,7 +180,7 @@ def token_inputs(
return
inputs
DecoderOnlyInputs
=
TokenInputs
DecoderOnlyInputs
=
Union
[
TokenInputs
,
"MultiModalInputsV2"
]
"""
The inputs in :class:`~vllm.LLMEngine` before they are
passed to the model executor.
...
...
@@ -191,19 +195,91 @@ class EncoderDecoderInputs(TypedDict):
This specifies the required data for encoder-decoder models.
"""
encoder
:
TokenInputs
encoder
:
Union
[
TokenInputs
,
"MultiModalInputsV2"
]
"""The inputs for the encoder portion."""
decoder
:
TokenInputs
decoder
:
Union
[
TokenInputs
,
"MultiModalInputsV2"
]
"""The inputs for the decoder portion."""
SingletonInputs
=
TokenInputs
SingletonInputs
=
Union
[
TokenInputs
,
"MultiModalInputsV2"
]
"""
A processed :class:`SingletonPrompt` which can be passed to
:class:`vllm.sequence.Sequence`.
"""
@
dataclass
class
SingletonInputsAdapter
:
"""
Unified interface to access the components of :class:`SingletonInputs`.
"""
inputs
:
SingletonInputs
@
cached_property
def
prompt
(
self
)
->
Optional
[
str
]:
inputs
=
self
.
inputs
if
inputs
[
"type"
]
==
"token"
or
inputs
[
"type"
]
==
"multimodal"
:
return
inputs
.
get
(
"prompt"
)
assert_never
(
inputs
)
@
cached_property
def
prompt_token_ids
(
self
)
->
List
[
int
]:
inputs
=
self
.
inputs
if
inputs
[
"type"
]
==
"token"
or
inputs
[
"type"
]
==
"multimodal"
:
return
inputs
.
get
(
"prompt_token_ids"
,
[])
assert_never
(
inputs
)
@
cached_property
def
prompt_embeds
(
self
)
->
Optional
[
torch
.
Tensor
]:
inputs
=
self
.
inputs
if
inputs
[
"type"
]
==
"token"
or
inputs
[
"type"
]
==
"multimodal"
:
return
None
assert_never
(
inputs
)
@
cached_property
def
multi_modal_data
(
self
)
->
"MultiModalDataDict"
:
inputs
=
self
.
inputs
if
inputs
[
"type"
]
==
"token"
:
return
inputs
.
get
(
"multi_modal_data"
,
{})
if
inputs
[
"type"
]
==
"multimodal"
:
return
inputs
.
get
(
"mm_kwargs"
,
{})
assert_never
(
inputs
)
@
cached_property
def
multi_modal_placeholders
(
self
)
->
"MultiModalPlaceholderDict"
:
inputs
=
self
.
inputs
if
inputs
[
"type"
]
==
"token"
:
return
inputs
.
get
(
"multi_modal_placeholders"
,
{})
if
inputs
[
"type"
]
==
"multimodal"
:
return
inputs
.
get
(
"mm_placeholders"
,
{})
assert_never
(
inputs
)
@
cached_property
def
mm_processor_kwargs
(
self
)
->
Dict
[
str
,
Any
]:
inputs
=
self
.
inputs
if
inputs
[
"type"
]
==
"token"
:
return
inputs
.
get
(
"mm_processor_kwargs"
,
{})
if
inputs
[
"type"
]
==
"multimodal"
:
return
{}
assert_never
(
inputs
)
ProcessorInputs
=
Union
[
DecoderOnlyInputs
,
EncoderDecoderInputs
]
"""
The inputs to :data:`vllm.inputs.InputProcessor`.
...
...
@@ -234,10 +310,11 @@ def zip_enc_dec_prompts(
)
->
List
[
ExplicitEncoderDecoderPrompt
[
_T1
,
_T2
]]:
"""
Zip encoder and decoder prompts together into a list of
:class:`ExplicitEncoderDecoderPrompt` instances. mm_processor_kwargs
may also be provided; if a dict is passed, the same dictionary will be
used for every encoder/decoder prompt. If an iterable is provided, it will
be zipped with the encoder/decoder prompts.
:class:`ExplicitEncoderDecoderPrompt` instances.
``mm_processor_kwargs`` may also be provided; if a dict is passed, the same
dictionary will be used for every encoder/decoder prompt. If an iterable is
provided, it will be zipped with the encoder/decoder prompts.
"""
if
mm_processor_kwargs
is
None
:
mm_processor_kwargs
=
cast
(
Dict
[
str
,
Any
],
{})
...
...
vllm/inputs/preprocess.py
View file @
0b8bb86b
import
asyncio
from
typing
import
List
,
Optional
from
typing
import
List
,
Mapping
,
Optional
,
Union
from
typing_extensions
import
assert_never
from
vllm.config
import
ModelConfig
from
vllm.logger
import
init_logger
from
vllm.lora.request
import
LoRARequest
from
vllm.multimodal
import
MULTIMODAL_REGISTRY
,
MultiModalRegistry
from
vllm.multimodal.processing
import
MultiModalDataDict
,
MultiModalInputsV2
from
vllm.prompt_adapter.request
import
PromptAdapterRequest
from
vllm.transformers_utils.tokenizer_group
import
BaseTokenizerGroup
from
vllm.utils
import
print_warning_once
...
...
@@ -23,11 +25,13 @@ class InputPreprocessor:
self
,
model_config
:
ModelConfig
,
tokenizer
:
Optional
[
BaseTokenizerGroup
],
mm_registry
:
MultiModalRegistry
=
MULTIMODAL_REGISTRY
,
)
->
None
:
super
().
__init__
()
self
.
model_config
=
model_config
self
.
tokenizer
=
tokenizer
self
.
mm_registry
=
mm_registry
def
get_tokenizer_group
(
self
)
->
BaseTokenizerGroup
:
if
self
.
tokenizer
is
None
:
...
...
@@ -198,14 +202,79 @@ class InputPreprocessor:
prompt
=
prompt
,
lora_request
=
lora_request
)
def
_can_process_multimodal
(
self
)
->
bool
:
model_config
=
self
.
model_config
if
not
model_config
.
is_multimodal_model
:
raise
ValueError
(
"Your model does not support multi-modal inputs"
)
# Interim measure so we can handle models that have yet to be
# updated to use the new multi-modal processor
can_process_multimodal
=
self
.
mm_registry
.
has_processor
(
model_config
)
if
not
can_process_multimodal
:
logger
.
info
(
"Your model uses the legacy input pipeline instead of the new "
"multi-modal processor. Please note that the legacy pipeline "
"will be removed in a future release. For more details, see: "
"https://github.com/vllm-project/vllm/issues/10114"
)
return
can_process_multimodal
def
_process_multimodal
(
self
,
prompt
:
Union
[
str
,
List
[
int
]],
mm_data
:
MultiModalDataDict
,
mm_processor_kwargs
:
Optional
[
Mapping
[
str
,
object
]],
lora_request
:
Optional
[
LoRARequest
],
)
->
MultiModalInputsV2
:
"""
Apply the model's multi-modal processor to a multi-modal prompt,
returning the corresponding token IDs and metadata.
"""
tokenizer_group
=
self
.
get_tokenizer_group
()
tokenizer
=
tokenizer_group
.
get_lora_tokenizer
(
lora_request
)
mm_processor
=
self
.
mm_registry
.
create_processor
(
self
.
model_config
,
tokenizer
)
if
isinstance
(
prompt
,
list
):
prompt
=
tokenizer
.
decode
(
prompt
)
if
mm_processor_kwargs
is
None
:
mm_processor_kwargs
=
{}
return
mm_processor
.
apply
(
prompt
,
mm_data
,
mm_processor_kwargs
)
async
def
_process_multimodal_async
(
self
,
prompt
:
Union
[
str
,
List
[
int
]],
mm_data
:
MultiModalDataDict
,
mm_processor_kwargs
:
Optional
[
Mapping
[
str
,
object
]],
lora_request
:
Optional
[
LoRARequest
],
)
->
MultiModalInputsV2
:
"""Async version of :meth:`_process_multimodal`."""
tokenizer_group
=
self
.
get_tokenizer_group
()
tokenizer
=
await
tokenizer_group
.
get_lora_tokenizer_async
(
lora_request
)
mm_processor
=
self
.
mm_registry
.
create_processor
(
self
.
model_config
,
tokenizer
)
if
isinstance
(
prompt
,
list
):
logger
.
warning
(
"Passing `multi_modal_data` in TokensPrompt is"
"deprecated and will be removed in a future update"
)
prompt
=
tokenizer
.
decode
(
prompt
)
if
mm_processor_kwargs
is
None
:
mm_processor_kwargs
=
{}
return
mm_processor
.
apply
(
prompt
,
mm_data
,
mm_processor_kwargs
)
def
_prompt_to_llm_inputs
(
self
,
prompt
:
SingletonPrompt
,
request_id
:
str
,
lora_request
:
Optional
[
LoRARequest
]
=
None
,
)
->
SingletonInputs
:
'''
Extract the
components of any single encoder or decoder input
prompt.
"""
Extract the
singleton inputs from a
prompt.
Arguments:
...
...
@@ -215,12 +284,8 @@ class InputPreprocessor:
Returns:
* prompt
* prompt_token_ids
* multi_modal_data
* mm_processor_kwargs (request-level input processor/mapper overrides)
'''
* :class:`SingletonInputs` instance
"""
parsed
=
parse_singleton_prompt
(
prompt
)
if
parsed
[
"type"
]
==
"str"
:
...
...
@@ -243,6 +308,14 @@ class InputPreprocessor:
multi_modal_data
=
tokens_content
.
get
(
"multi_modal_data"
)
mm_processor_kwargs
=
tokens_content
.
get
(
"mm_processor_kwargs"
)
if
multi_modal_data
is
not
None
and
self
.
_can_process_multimodal
():
return
self
.
_process_multimodal
(
prompt_token_ids
,
multi_modal_data
,
mm_processor_kwargs
,
lora_request
=
lora_request
,
)
return
token_inputs
(
prompt_token_ids
=
prompt_token_ids
,
multi_modal_data
=
multi_modal_data
,
...
...
@@ -253,13 +326,22 @@ class InputPreprocessor:
text_content
=
parsed
[
"content"
]
prompt_text
=
text_content
[
"prompt"
]
multi_modal_data
=
text_content
.
get
(
"multi_modal_data"
)
mm_processor_kwargs
=
text_content
.
get
(
"mm_processor_kwargs"
)
if
multi_modal_data
is
not
None
and
self
.
_can_process_multimodal
():
return
self
.
_process_multimodal
(
prompt_text
,
multi_modal_data
,
mm_processor_kwargs
,
lora_request
=
lora_request
,
)
prompt_token_ids
=
self
.
_tokenize_prompt
(
prompt_text
,
request_id
=
request_id
,
lora_request
=
lora_request
,
)
multi_modal_data
=
text_content
.
get
(
"multi_modal_data"
)
mm_processor_kwargs
=
text_content
.
get
(
"mm_processor_kwargs"
)
return
token_inputs
(
prompt
=
prompt_text
,
...
...
@@ -299,6 +381,14 @@ class InputPreprocessor:
multi_modal_data
=
tokens_content
.
get
(
"multi_modal_data"
)
mm_processor_kwargs
=
tokens_content
.
get
(
"mm_processor_kwargs"
)
if
multi_modal_data
is
not
None
and
self
.
_can_process_multimodal
():
return
await
self
.
_process_multimodal_async
(
prompt_token_ids
,
multi_modal_data
,
mm_processor_kwargs
,
lora_request
=
lora_request
,
)
return
token_inputs
(
prompt_token_ids
=
prompt_token_ids
,
multi_modal_data
=
multi_modal_data
,
...
...
@@ -309,13 +399,22 @@ class InputPreprocessor:
text_content
=
parsed
[
"content"
]
prompt_text
=
text_content
[
"prompt"
]
multi_modal_data
=
text_content
.
get
(
"multi_modal_data"
)
mm_processor_kwargs
=
text_content
.
get
(
"mm_processor_kwargs"
)
if
multi_modal_data
is
not
None
and
self
.
_can_process_multimodal
():
return
await
self
.
_process_multimodal_async
(
prompt_text
,
multi_modal_data
,
mm_processor_kwargs
,
lora_request
=
lora_request
,
)
prompt_token_ids
=
await
self
.
_tokenize_prompt_async
(
prompt_text
,
request_id
=
request_id
,
lora_request
=
lora_request
,
)
multi_modal_data
=
text_content
.
get
(
"multi_modal_data"
)
mm_processor_kwargs
=
text_content
.
get
(
"mm_processor_kwargs"
)
return
token_inputs
(
prompt
=
prompt_text
,
...
...
@@ -331,7 +430,8 @@ class InputPreprocessor:
encoder_inputs
:
SingletonInputs
,
decoder_inputs
:
Optional
[
SingletonInputs
],
)
->
EncoderDecoderInputs
:
if
encoder_inputs
[
"type"
]
==
"token"
:
if
(
encoder_inputs
[
"type"
]
==
"token"
or
encoder_inputs
[
"type"
]
==
"multimodal"
):
pass
else
:
assert_never
(
encoder_inputs
)
...
...
@@ -340,7 +440,8 @@ class InputPreprocessor:
dec_token_ids
=
self
.
_prepare_decoder_input_ids_for_generation
(
None
)
decoder_inputs
=
token_inputs
(
dec_token_ids
)
elif
decoder_inputs
[
"type"
]
==
"token"
:
elif
(
decoder_inputs
[
"type"
]
==
"token"
or
decoder_inputs
[
"type"
]
==
"multimodal"
):
dec_token_ids
=
self
.
_prepare_decoder_input_ids_for_generation
(
decoder_inputs
[
"prompt_token_ids"
])
decoder_inputs
[
"prompt_token_ids"
]
=
dec_token_ids
...
...
@@ -361,7 +462,7 @@ class InputPreprocessor:
prompt
:
PromptType
,
request_id
:
str
,
)
->
EncoderDecoderInputs
:
'''
"""
For encoder/decoder models only:
Process an input prompt into an :class:`EncoderDecoderInputs` instance.
...
...
@@ -391,8 +492,7 @@ class InputPreprocessor:
Returns:
* :class:`EncoderDecoderInputs` instance
'''
"""
encoder_inputs
:
SingletonInputs
decoder_inputs
:
Optional
[
SingletonInputs
]
...
...
@@ -460,7 +560,8 @@ class InputPreprocessor:
prompt_inputs
:
DecoderOnlyInputs
,
prompt_adapter_request
:
Optional
[
PromptAdapterRequest
],
)
->
DecoderOnlyInputs
:
if
prompt_inputs
[
"type"
]
==
"token"
:
if
(
prompt_inputs
[
"type"
]
==
"token"
or
prompt_inputs
[
"type"
]
==
"multimodal"
):
prompt_inputs
[
"prompt_token_ids"
]
=
self
.
_apply_prompt_adapter
(
prompt_inputs
[
"prompt_token_ids"
],
prompt_adapter_request
=
prompt_adapter_request
,
...
...
@@ -477,7 +578,7 @@ class InputPreprocessor:
lora_request
:
Optional
[
LoRARequest
]
=
None
,
prompt_adapter_request
:
Optional
[
PromptAdapterRequest
]
=
None
,
)
->
DecoderOnlyInputs
:
'''
"""
For decoder-only models:
Process an input prompt into an :class:`DecoderOnlyInputs` instance.
...
...
@@ -491,7 +592,7 @@ class InputPreprocessor:
Returns:
* :class:`DecoderOnlyInputs` instance
'''
"""
prompt_comps
=
self
.
_prompt_to_llm_inputs
(
prompt
,
...
...
vllm/inputs/registry.py
View file @
0b8bb86b
...
...
@@ -5,14 +5,17 @@ from typing import (TYPE_CHECKING, Any, Callable, Dict, Mapping, NamedTuple,
Optional
,
Protocol
,
Type
,
cast
)
from
torch
import
nn
from
transformers
import
PretrainedConfig
from
typing_extensions
import
TypeVar
from
transformers
import
PretrainedConfig
,
ProcessorMixin
from
typing_extensions
import
TypeVar
,
assert_never
from
vllm.logger
import
init_logger
from
vllm.transformers_utils.processor
import
cached_get_processor
from
vllm.transformers_utils.tokenizer
import
AnyTokenizer
from
vllm.utils
import
(
get_allowed_kwarg_only_overrides
,
print_warning_once
,
resolve_mm_processor_kwargs
)
from
.data
import
ProcessorInputs
from
.data
import
ProcessorInputs
,
SingletonInputs
from
.parse
import
is_encoder_decoder_inputs
if
TYPE_CHECKING
:
from
vllm.config
import
ModelConfig
...
...
@@ -61,6 +64,19 @@ class InputContext:
return
self
.
model_config
.
hf_image_processor_config
@
dataclass
(
frozen
=
True
)
class
InputProcessingContext
(
InputContext
):
tokenizer
:
AnyTokenizer
"""The tokenizer used to tokenize the inputs."""
def
get_hf_processor
(
self
)
->
ProcessorMixin
:
return
cached_get_processor
(
self
.
model_config
.
tokenizer
,
tokenizer
=
self
.
tokenizer
,
# Override the tokenizer with ours
trust_remote_code
=
self
.
model_config
.
trust_remote_code
,
)
N
=
TypeVar
(
"N"
,
bound
=
Type
[
nn
.
Module
])
...
...
@@ -94,7 +110,7 @@ class DummyDataFactory(Protocol):
...
class
_MultiModalCounts
(
UserDict
):
class
_MultiModalCounts
(
UserDict
[
str
,
int
]
):
"""
Wraps `mm_counts` for a more informative error message
when attempting to access a plugin that does not exist.
...
...
@@ -287,6 +303,21 @@ class InputRegistry:
return
self
.
_input_processors_by_model_type
\
.
get
(
model_cls
,
self
.
_default_input_processor
)
def
_ensure_mm_kwargs
(
self
,
inputs
:
SingletonInputs
,
mm_processor_kwargs
:
Dict
[
str
,
Any
],
):
if
inputs
[
"type"
]
==
"token"
:
# In case the input processor for that model fails to set it
if
"mm_processor_kwargs"
not
in
inputs
:
inputs
[
"mm_processor_kwargs"
]
=
mm_processor_kwargs
elif
inputs
[
"type"
]
==
"multimodal"
:
# Be more strict in V2
assert
"mm_kwargs"
in
inputs
else
:
assert_never
(
inputs
[
"type"
])
def
process_input
(
self
,
model_config
:
"ModelConfig"
,
inputs
:
ProcessorInputs
)
->
ProcessorInputs
:
"""
...
...
@@ -312,8 +343,21 @@ class InputRegistry:
processor
,
)
return
processor
(
InputContext
(
model_config
),
inputs
,
**
mm_processor_kwargs
)
processed_inputs
=
processor
(
InputContext
(
model_config
),
inputs
,
**
mm_processor_kwargs
,
)
if
is_encoder_decoder_inputs
(
processed_inputs
):
self
.
_ensure_mm_kwargs
(
processed_inputs
[
"encoder"
],
mm_processor_kwargs
)
self
.
_ensure_mm_kwargs
(
processed_inputs
[
"decoder"
],
mm_processor_kwargs
)
else
:
self
.
_ensure_mm_kwargs
(
processed_inputs
,
mm_processor_kwargs
)
return
processed_inputs
def
create_input_processor
(
self
,
model_config
:
"ModelConfig"
):
"""
...
...
vllm/model_executor/models/chatglm.py
View file @
0b8bb86b
...
...
@@ -30,8 +30,8 @@ from vllm.model_executor.layers.vocab_parallel_embedding import (
from
vllm.model_executor.model_loader.weight_utils
import
default_weight_loader
from
vllm.model_executor.models.glm4_vision_encoder
import
EVA2CLIPModel
from
vllm.model_executor.sampling_metadata
import
SamplingMetadata
from
vllm.multimodal
import
MULTIMODAL_REGISTRY
,
MultiModalKwargs
from
vllm.multimodal.
base
import
MultiModalData
from
vllm.multimodal
import
MULTIMODAL_REGISTRY
from
vllm.multimodal.
inputs
import
MultiModalData
,
MultiModalKwargs
from
vllm.multimodal.utils
import
cached_get_tokenizer
from
vllm.sequence
import
(
VLLM_TOKEN_ID_ARRAY_TYPE
,
IntermediateTensors
,
SequenceData
)
...
...
vllm/model_executor/models/fuyu.py
View file @
0b8bb86b
...
...
@@ -32,8 +32,7 @@ from vllm.model_executor.layers.linear import ColumnParallelLinear
from
vllm.model_executor.layers.sampler
import
SamplerOutput
from
vllm.model_executor.models.persimmon
import
PersimmonForCausalLM
from
vllm.model_executor.sampling_metadata
import
SamplingMetadata
from
vllm.multimodal
import
MULTIMODAL_REGISTRY
from
vllm.multimodal.base
import
MultiModalKwargs
from
vllm.multimodal
import
MULTIMODAL_REGISTRY
,
MultiModalKwargs
from
vllm.multimodal.image
import
cached_get_image_processor
from
vllm.multimodal.utils
import
(
cached_get_tokenizer
,
consecutive_placeholder_ranges
)
...
...
vllm/model_executor/models/h2ovl.py
View file @
0b8bb86b
...
...
@@ -15,8 +15,7 @@ from transformers import PretrainedConfig
from
vllm.inputs
import
(
INPUT_REGISTRY
,
DecoderOnlyInputs
,
InputContext
,
token_inputs
)
from
vllm.model_executor.layers.quantization
import
QuantizationConfig
from
vllm.multimodal
import
MULTIMODAL_REGISTRY
from
vllm.multimodal.base
import
MultiModalKwargs
from
vllm.multimodal
import
MULTIMODAL_REGISTRY
,
MultiModalKwargs
from
vllm.multimodal.utils
import
cached_get_tokenizer
from
vllm.utils
import
is_list_of
...
...
vllm/model_executor/models/internvl.py
View file @
0b8bb86b
...
...
@@ -25,8 +25,7 @@ from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler
from
vllm.model_executor.models.intern_vit
import
(
InternVisionModel
,
InternVisionPatchModel
)
from
vllm.model_executor.sampling_metadata
import
SamplingMetadata
from
vllm.multimodal
import
MULTIMODAL_REGISTRY
from
vllm.multimodal.base
import
MultiModalKwargs
from
vllm.multimodal
import
MULTIMODAL_REGISTRY
,
MultiModalKwargs
from
vllm.multimodal.utils
import
cached_get_tokenizer
from
vllm.sequence
import
IntermediateTensors
from
vllm.utils
import
is_list_of
...
...
Prev
1
2
3
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment