Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
e0191a95
Unverified
Commit
e0191a95
authored
Nov 09, 2024
by
Cyrus Leung
Committed by
GitHub
Nov 09, 2024
Browse files
[0/N] Rename `MultiModalInputs` to `MultiModalKwargs` (#10040)
Signed-off-by:
DarkLight1337
<
tlleungac@connect.ust.hk
>
parent
d7edca1d
Changes
32
Show whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
104 additions
and
74 deletions
+104
-74
docs/source/design/multimodal/multimodal_index.rst
docs/source/design/multimodal/multimodal_index.rst
+1
-1
tests/models/decoder_only/vision_language/mm_processor_kwargs/test_qwen.py
...der_only/vision_language/mm_processor_kwargs/test_qwen.py
+2
-2
tests/multimodal/test_base.py
tests/multimodal/test_base.py
+11
-11
vllm/model_executor/models/chatglm.py
vllm/model_executor/models/chatglm.py
+2
-2
vllm/model_executor/models/fuyu.py
vllm/model_executor/models/fuyu.py
+2
-2
vllm/model_executor/models/h2ovl.py
vllm/model_executor/models/h2ovl.py
+5
-5
vllm/model_executor/models/idefics3.py
vllm/model_executor/models/idefics3.py
+2
-2
vllm/model_executor/models/internvl.py
vllm/model_executor/models/internvl.py
+3
-3
vllm/model_executor/models/minicpmv.py
vllm/model_executor/models/minicpmv.py
+2
-2
vllm/model_executor/models/mllama.py
vllm/model_executor/models/mllama.py
+1
-1
vllm/model_executor/models/molmo.py
vllm/model_executor/models/molmo.py
+2
-2
vllm/model_executor/models/pixtral.py
vllm/model_executor/models/pixtral.py
+5
-5
vllm/model_executor/models/qwen.py
vllm/model_executor/models/qwen.py
+6
-6
vllm/model_executor/models/qwen2_audio.py
vllm/model_executor/models/qwen2_audio.py
+4
-4
vllm/model_executor/models/qwen2_vl.py
vllm/model_executor/models/qwen2_vl.py
+4
-4
vllm/model_executor/models/ultravox.py
vllm/model_executor/models/ultravox.py
+4
-4
vllm/multimodal/__init__.py
vllm/multimodal/__init__.py
+17
-2
vllm/multimodal/audio.py
vllm/multimodal/audio.py
+2
-2
vllm/multimodal/base.py
vllm/multimodal/base.py
+24
-9
vllm/multimodal/image.py
vllm/multimodal/image.py
+5
-5
No files found.
docs/source/design/multimodal/multimodal_index.rst
View file @
e0191a95
...
...
@@ -53,7 +53,7 @@ Base Classes
.. autodata:: vllm.multimodal.MultiModalDataDict
.. autoclass:: vllm.multimodal.MultiModal
Input
s
.. autoclass:: vllm.multimodal.MultiModal
Kwarg
s
:members:
:show-inheritance:
...
...
tests/models/decoder_only/vision_language/mm_processor_kwargs/test_qwen.py
View file @
e0191a95
...
...
@@ -6,7 +6,7 @@ import torch
from
PIL.Image
import
Image
from
vllm.inputs
import
InputContext
,
token_inputs
from
vllm.multimodal.base
import
MultiModal
Input
s
from
vllm.multimodal.base
import
MultiModal
Kwarg
s
from
vllm.multimodal.utils
import
cached_get_tokenizer
from
.....conftest
import
IMAGE_ASSETS
...
...
@@ -96,7 +96,7 @@ def test_input_mapper_valid_mm_data(input_mapper_for_qwen,
mapped_img_data
=
input_mapper_for_qwen
(
qwen_vl_context
,
img_data
)
# Ensure that we get the appropriately shaped pixel_values
# for images and image embeddings, respectively.
assert
isinstance
(
mapped_img_data
,
MultiModal
Input
s
)
assert
isinstance
(
mapped_img_data
,
MultiModal
Kwarg
s
)
assert
"pixel_values"
in
mapped_img_data
assert
mapped_img_data
[
"pixel_values"
].
shape
==
expected_shape
...
...
tests/multimodal/test_base.py
View file @
e0191a95
import
torch
from
vllm.multimodal.base
import
MultiModal
Input
s
,
NestedTensors
from
vllm.multimodal.base
import
MultiModal
Kwarg
s
,
NestedTensors
def
assert_nested_tensors_equal
(
expected
:
NestedTensors
,
...
...
@@ -13,8 +13,8 @@ def assert_nested_tensors_equal(expected: NestedTensors,
assert_nested_tensors_equal
(
expected_item
,
actual_item
)
def
assert_multimodal_inputs_equal
(
expected
:
MultiModal
Input
s
,
actual
:
MultiModal
Input
s
):
def
assert_multimodal_inputs_equal
(
expected
:
MultiModal
Kwarg
s
,
actual
:
MultiModal
Kwarg
s
):
assert
set
(
expected
.
keys
())
==
set
(
actual
.
keys
())
for
key
in
expected
:
assert_nested_tensors_equal
(
expected
[
key
],
actual
[
key
])
...
...
@@ -22,7 +22,7 @@ def assert_multimodal_inputs_equal(expected: MultiModalInputs,
def
test_multimodal_input_batch_single_tensor
():
t
=
torch
.
rand
([
1
,
2
])
result
=
MultiModal
Input
s
.
batch
([{
"image"
:
t
}])
result
=
MultiModal
Kwarg
s
.
batch
([{
"image"
:
t
}])
assert_multimodal_inputs_equal
(
result
,
{
"image"
:
t
.
unsqueeze
(
0
)})
...
...
@@ -30,7 +30,7 @@ def test_multimodal_input_batch_multiple_tensors():
a
=
torch
.
rand
([
1
,
1
,
2
])
b
=
torch
.
rand
([
1
,
1
,
2
])
c
=
torch
.
rand
([
1
,
1
,
2
])
result
=
MultiModal
Input
s
.
batch
([{
"image"
:
a
},
{
"image"
:
b
},
{
"image"
:
c
}])
result
=
MultiModal
Kwarg
s
.
batch
([{
"image"
:
a
},
{
"image"
:
b
},
{
"image"
:
c
}])
assert_multimodal_inputs_equal
(
result
,
{
"image"
:
torch
.
stack
([
a
,
b
,
c
])})
...
...
@@ -38,7 +38,7 @@ def test_multimodal_input_batch_multiple_heterogeneous_tensors():
a
=
torch
.
rand
([
1
,
2
,
2
])
b
=
torch
.
rand
([
1
,
3
,
2
])
c
=
torch
.
rand
([
1
,
4
,
2
])
result
=
MultiModal
Input
s
.
batch
([{
"image"
:
a
},
{
"image"
:
b
},
{
"image"
:
c
}])
result
=
MultiModal
Kwarg
s
.
batch
([{
"image"
:
a
},
{
"image"
:
b
},
{
"image"
:
c
}])
assert_multimodal_inputs_equal
(
result
,
{
"image"
:
[
a
,
b
,
c
]})
...
...
@@ -46,7 +46,7 @@ def test_multimodal_input_batch_nested_tensors():
a
=
torch
.
rand
([
2
,
3
])
b
=
torch
.
rand
([
2
,
3
])
c
=
torch
.
rand
([
2
,
3
])
result
=
MultiModal
Input
s
.
batch
([{
result
=
MultiModal
Kwarg
s
.
batch
([{
"image"
:
[
a
]
},
{
"image"
:
[
b
]
...
...
@@ -65,7 +65,7 @@ def test_multimodal_input_batch_heterogeneous_lists():
a
=
torch
.
rand
([
1
,
2
,
3
])
b
=
torch
.
rand
([
1
,
2
,
3
])
c
=
torch
.
rand
([
1
,
2
,
3
])
result
=
MultiModal
Input
s
.
batch
([{
"image"
:
[
a
,
b
]},
{
"image"
:
[
c
]}])
result
=
MultiModal
Kwarg
s
.
batch
([{
"image"
:
[
a
,
b
]},
{
"image"
:
[
c
]}])
assert_multimodal_inputs_equal
(
result
,
{
"image"
:
[
torch
.
stack
([
a
,
b
]),
c
.
unsqueeze
(
0
)]})
...
...
@@ -76,7 +76,7 @@ def test_multimodal_input_batch_multiple_batchable_lists():
b
=
torch
.
rand
([
1
,
2
,
3
])
c
=
torch
.
rand
([
1
,
2
,
3
])
d
=
torch
.
rand
([
1
,
2
,
3
])
result
=
MultiModal
Input
s
.
batch
([{
"image"
:
[
a
,
b
]},
{
"image"
:
[
c
,
d
]}])
result
=
MultiModal
Kwarg
s
.
batch
([{
"image"
:
[
a
,
b
]},
{
"image"
:
[
c
,
d
]}])
assert_multimodal_inputs_equal
(
result
,
{
"image"
:
torch
.
stack
([
torch
.
stack
([
a
,
b
]),
...
...
@@ -88,8 +88,8 @@ def test_multimodal_input_batch_mixed_stacking_depths():
b
=
torch
.
rand
([
1
,
3
,
3
])
c
=
torch
.
rand
([
1
,
4
,
3
])
result
=
MultiModal
Input
s
.
batch
([{
"image"
:
[
a
,
b
]},
{
"image"
:
[
c
]}])
result
=
MultiModal
Kwarg
s
.
batch
([{
"image"
:
[
a
,
b
]},
{
"image"
:
[
c
]}])
assert_multimodal_inputs_equal
(
result
,
{
"image"
:
[[
a
,
b
],
c
.
unsqueeze
(
0
)]})
result
=
MultiModal
Input
s
.
batch
([{
"image"
:
[
a
]},
{
"image"
:
[
b
,
c
]}])
result
=
MultiModal
Kwarg
s
.
batch
([{
"image"
:
[
a
]},
{
"image"
:
[
b
,
c
]}])
assert_multimodal_inputs_equal
(
result
,
{
"image"
:
[
a
.
unsqueeze
(
0
),
[
b
,
c
]]})
vllm/model_executor/models/chatglm.py
View file @
e0191a95
...
...
@@ -30,7 +30,7 @@ from vllm.model_executor.layers.vocab_parallel_embedding import (
from
vllm.model_executor.model_loader.weight_utils
import
default_weight_loader
from
vllm.model_executor.models.glm4_vision_encoder
import
EVA2CLIPModel
from
vllm.model_executor.sampling_metadata
import
SamplingMetadata
from
vllm.multimodal
import
MULTIMODAL_REGISTRY
,
MultiModal
Input
s
from
vllm.multimodal
import
MULTIMODAL_REGISTRY
,
MultiModal
Kwarg
s
from
vllm.multimodal.base
import
MultiModalData
from
vllm.multimodal.utils
import
cached_get_tokenizer
from
vllm.sequence
import
(
VLLM_TOKEN_ID_ARRAY_TYPE
,
IntermediateTensors
,
...
...
@@ -74,7 +74,7 @@ def mm_input_mapper_for_glmv(
raise
pixel_values
=
raw_batch_data
[
'images'
]
return
MultiModal
Input
s
({
'pixel_values'
:
pixel_values
})
return
MultiModal
Kwarg
s
({
'pixel_values'
:
pixel_values
})
def
merge_glm_vision_embeddings
(
...
...
vllm/model_executor/models/fuyu.py
View file @
e0191a95
...
...
@@ -34,7 +34,7 @@ from vllm.model_executor.layers.sampler import SamplerOutput
from
vllm.model_executor.models.persimmon
import
PersimmonForCausalLM
from
vllm.model_executor.sampling_metadata
import
SamplingMetadata
from
vllm.multimodal
import
MULTIMODAL_REGISTRY
from
vllm.multimodal.base
import
MultiModal
Input
s
from
vllm.multimodal.base
import
MultiModal
Kwarg
s
from
vllm.multimodal.image
import
cached_get_image_processor
from
vllm.multimodal.utils
import
(
cached_get_tokenizer
,
consecutive_placeholder_ranges
)
...
...
@@ -218,7 +218,7 @@ def input_mapper_for_fuyu(ctx: InputContext, data: object):
])
# image has been processed with prompt in input processor
return
MultiModal
Input
s
({
"pixel_values"
:
data
})
return
MultiModal
Kwarg
s
({
"pixel_values"
:
data
})
@
MULTIMODAL_REGISTRY
.
register_image_input_mapper
(
input_mapper_for_fuyu
)
...
...
vllm/model_executor/models/h2ovl.py
View file @
e0191a95
...
...
@@ -16,7 +16,7 @@ from vllm.inputs import (INPUT_REGISTRY, DecoderOnlyInputs, InputContext,
token_inputs
)
from
vllm.model_executor.layers.quantization
import
QuantizationConfig
from
vllm.multimodal
import
MULTIMODAL_REGISTRY
from
vllm.multimodal.base
import
MultiModal
Input
s
from
vllm.multimodal.base
import
MultiModal
Kwarg
s
from
vllm.multimodal.utils
import
cached_get_tokenizer
from
vllm.utils
import
is_list_of
...
...
@@ -324,12 +324,12 @@ class H2OVLInputPipeline(InternVLInputPipeline):
data
:
object
,
*
,
max_dynamic_patch
:
Optional
[
int
]
=
None
,
)
->
MultiModal
Input
s
:
)
->
MultiModal
Kwarg
s
:
# NOTE: Preprocessing for the image data is done in the
# 'input_processor' function during actual inference.
if
isinstance
(
data
,
dict
):
return
MultiModal
Input
s
(
data
)
return
MultiModal
Kwarg
s
(
data
)
# The section below is only used with dummy data during
# memory profiling.
...
...
@@ -347,7 +347,7 @@ class H2OVLInputPipeline(InternVLInputPipeline):
pixel_values
=
[
image_pixel_values_mapper
(
img
)
for
img
in
data
]
else
:
return
MultiModal
Input
s
({
"image_embeds"
:
data
})
return
MultiModal
Kwarg
s
({
"image_embeds"
:
data
})
model_config
=
ctx
.
model_config
tokenizer
=
cached_get_tokenizer
(
model_config
.
tokenizer
,
...
...
@@ -359,7 +359,7 @@ class H2OVLInputPipeline(InternVLInputPipeline):
return_tensors
=
"pt"
,
)[
0
]
return
MultiModal
Input
s
({
return
MultiModal
Kwarg
s
({
"pixel_values"
:
pixel_values
,
"image_token_id"
:
image_token_id
})
...
...
vllm/model_executor/models/idefics3.py
View file @
e0191a95
...
...
@@ -36,7 +36,7 @@ from vllm.model_executor.layers.quantization import QuantizationConfig
from
vllm.model_executor.layers.sampler
import
Sampler
,
SamplerOutput
from
vllm.model_executor.layers.vocab_parallel_embedding
import
ParallelLMHead
from
vllm.model_executor.sampling_metadata
import
SamplingMetadata
from
vllm.multimodal
import
MULTIMODAL_REGISTRY
,
MultiModal
Input
s
from
vllm.multimodal
import
MULTIMODAL_REGISTRY
,
MultiModal
Kwarg
s
from
vllm.multimodal.image
import
cached_get_image_processor
from
vllm.sequence
import
IntermediateTensors
,
SequenceData
from
vllm.transformers_utils.processor
import
cached_get_processor
...
...
@@ -127,7 +127,7 @@ def input_mapper_for_idefics3(
logger
.
error
(
"Failed to process image (%s)"
,
data
)
raise
return
MultiModal
Input
s
(
batch_data
)
return
MultiModal
Kwarg
s
(
batch_data
)
def
_resize_output_size
(
height
:
int
,
...
...
vllm/model_executor/models/internvl.py
View file @
e0191a95
...
...
@@ -26,7 +26,7 @@ from vllm.model_executor.models.intern_vit import (InternVisionModel,
InternVisionPatchModel
)
from
vllm.model_executor.sampling_metadata
import
SamplingMetadata
from
vllm.multimodal
import
MULTIMODAL_REGISTRY
from
vllm.multimodal.base
import
MultiModal
Input
s
from
vllm.multimodal.base
import
MultiModal
Kwarg
s
from
vllm.multimodal.utils
import
cached_get_tokenizer
from
vllm.sequence
import
IntermediateTensors
from
vllm.utils
import
is_list_of
...
...
@@ -346,7 +346,7 @@ class InternVLInputPipeline:
# we can't stack here because images may have different num_patches
data
=
[
image_pixel_values_mapper
(
img
)
for
img
in
data
]
else
:
return
MultiModal
Input
s
({
"image_embeds"
:
data
})
return
MultiModal
Kwarg
s
({
"image_embeds"
:
data
})
model_config
=
ctx
.
model_config
tokenizer
=
cached_get_tokenizer
(
model_config
.
tokenizer
,
...
...
@@ -355,7 +355,7 @@ class InternVLInputPipeline:
add_special_tokens
=
False
,
return_tensors
=
"pt"
)[
0
]
return
MultiModal
Input
s
({
return
MultiModal
Kwarg
s
({
"pixel_values"
:
data
,
"image_token_id"
:
image_token_id
})
...
...
vllm/model_executor/models/minicpmv.py
View file @
e0191a95
...
...
@@ -52,7 +52,7 @@ from vllm.model_executor.models.qwen2 import Qwen2Model
from
vllm.model_executor.models.utils
import
LLMWrapper
from
vllm.model_executor.sampling_metadata
import
SamplingMetadata
from
vllm.multimodal
import
MULTIMODAL_REGISTRY
from
vllm.multimodal.base
import
MultiModal
Input
s
from
vllm.multimodal.base
import
MultiModal
Kwarg
s
from
vllm.multimodal.image
import
cached_get_image_processor
from
vllm.multimodal.utils
import
cached_get_tokenizer
from
vllm.sequence
import
IntermediateTensors
,
SequenceData
...
...
@@ -374,7 +374,7 @@ def input_mapper_for_minicpmv(ctx: InputContext, data: object):
batch_data
[
"slice_start_id"
]
=
data
[
0
][
"slice_start_id"
]
batch_data
[
"slice_end_id"
]
=
data
[
0
][
"slice_end_id"
]
return
MultiModal
Input
s
(
batch_data
)
return
MultiModal
Kwarg
s
(
batch_data
)
class
MiniCPMVBaseModel
(
nn
.
Module
,
SupportsMultiModal
,
SupportsPP
):
...
...
vllm/model_executor/models/mllama.py
View file @
e0191a95
...
...
@@ -1162,7 +1162,7 @@ class MllamaForConditionalGeneration(nn.Module, SupportsMultiModal):
def
_parse_and_validate_image_input
(
self
,
**
kwargs
:
object
):
# tensor with the same shape will be batched together by
# MultiModal
Input
s.batch, so pixel_values here can be:
# MultiModal
Kwarg
s.batch, so pixel_values here can be:
# - List[List[torch.Tensor]]:
# with shape (num_tiles, 3, image_res, image_res)
# - List[torch.Tensor]:
...
...
vllm/model_executor/models/molmo.py
View file @
e0191a95
...
...
@@ -37,7 +37,7 @@ from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler
from
vllm.model_executor.layers.vocab_parallel_embedding
import
(
ParallelLMHead
,
VocabParallelEmbedding
)
from
vllm.model_executor.model_loader.weight_utils
import
default_weight_loader
from
vllm.multimodal
import
MULTIMODAL_REGISTRY
,
MultiModal
Input
s
from
vllm.multimodal
import
MULTIMODAL_REGISTRY
,
MultiModal
Kwarg
s
from
vllm.multimodal.utils
import
cached_get_tokenizer
from
vllm.sequence
import
(
VLLM_TOKEN_ID_ARRAY_TYPE
,
IntermediateTensors
,
SequenceData
)
...
...
@@ -866,7 +866,7 @@ def image_input_mapper_for_molmo(
ctx
:
InputContext
,
data
:
object
,
):
return
MultiModal
Input
s
(
data
)
return
MultiModal
Kwarg
s
(
data
)
def
dummy_data_for_molmo
(
ctx
:
InputContext
,
seq_len
:
int
,
...
...
vllm/model_executor/models/pixtral.py
View file @
e0191a95
...
...
@@ -30,7 +30,7 @@ from vllm.model_executor.model_loader.weight_utils import default_weight_loader
from
vllm.model_executor.models.utils
import
merge_multimodal_embeddings
from
vllm.model_executor.sampling_metadata
import
SamplingMetadata
from
vllm.multimodal
import
MULTIMODAL_REGISTRY
from
vllm.multimodal.base
import
MultiModal
Input
s
from
vllm.multimodal.base
import
MultiModal
Kwarg
s
from
vllm.multimodal.utils
import
(
cached_get_tokenizer
,
consecutive_placeholder_ranges
)
from
vllm.sequence
import
IntermediateTensors
,
SequenceData
...
...
@@ -94,8 +94,8 @@ def dummy_data_for_pixtral(ctx: InputContext, seq_len: int,
def
input_mapper_for_pixtral
(
ctx
:
InputContext
,
data
:
object
)
->
MultiModal
Input
s
:
"""Maps the input data to its MultiModal
Input
s (if any).
data
:
object
)
->
MultiModal
Kwarg
s
:
"""Maps the input data to its MultiModal
Kwarg
s (if any).
Args:
ctx: Context of the loaded model.
...
...
@@ -103,7 +103,7 @@ def input_mapper_for_pixtral(ctx: InputContext,
to pixel_values in .forward() for a visual QWenLMHeadModel model.
Returns:
MultiModal
Input
s containing the stacked normalized images tensor or
MultiModal
Kwarg
s containing the stacked normalized images tensor or
image embeddings.
"""
# Early exit if we have provided an image to a language only Qwen model
...
...
@@ -121,7 +121,7 @@ def input_mapper_for_pixtral(ctx: InputContext,
dtype
=
torch
.
float16
)
images
.
append
(
image
)
return
MultiModal
Input
s
({
"images"
:
images
})
return
MultiModal
Kwarg
s
({
"images"
:
images
})
def
input_processor_for_pixtral
(
ctx
:
InputContext
,
inputs
:
DecoderOnlyInputs
):
...
...
vllm/model_executor/models/qwen.py
View file @
e0191a95
...
...
@@ -43,7 +43,7 @@ from vllm.model_executor.model_loader.weight_utils import default_weight_loader
from
vllm.model_executor.models.module_mapping
import
MultiModelKeys
from
vllm.model_executor.sampling_metadata
import
SamplingMetadata
from
vllm.multimodal
import
MULTIMODAL_REGISTRY
from
vllm.multimodal.base
import
MultiModal
Input
s
from
vllm.multimodal.base
import
MultiModal
Kwarg
s
from
vllm.multimodal.utils
import
cached_get_tokenizer
from
vllm.sequence
import
IntermediateTensors
,
SequenceData
from
vllm.utils
import
is_list_of
...
...
@@ -722,8 +722,8 @@ def input_processor_for_qwen(ctx: InputContext,
multi_modal_data
=
multi_modal_data
)
def
input_mapper_for_qwen
(
ctx
:
InputContext
,
data
:
object
)
->
MultiModal
Input
s
:
"""Maps the input data to its MultiModal
Input
s (if any).
def
input_mapper_for_qwen
(
ctx
:
InputContext
,
data
:
object
)
->
MultiModal
Kwarg
s
:
"""Maps the input data to its MultiModal
Kwarg
s (if any).
Args:
ctx: Context of the loaded model.
...
...
@@ -731,7 +731,7 @@ def input_mapper_for_qwen(ctx: InputContext, data: object) -> MultiModalInputs:
to pixel_values in .forward() for a visual QWenLMHeadModel model.
Returns:
MultiModal
Input
s containing the stacked normalized images tensor or
MultiModal
Kwarg
s containing the stacked normalized images tensor or
image embeddings.
"""
# Early exit if we have provided an image to a language only Qwen model
...
...
@@ -740,7 +740,7 @@ def input_mapper_for_qwen(ctx: InputContext, data: object) -> MultiModalInputs:
logger
.
warning
(
"Images were provided but this model has no visual config; "
"multimodal inputs will not be forwarded to the model."
)
return
MultiModal
Input
s
()
return
MultiModal
Kwarg
s
()
model_config
=
ctx
.
model_config
tokenizer
=
cached_get_tokenizer
(
...
...
@@ -784,7 +784,7 @@ def input_mapper_for_qwen(ctx: InputContext, data: object) -> MultiModalInputs:
data
=
[
data
]
transformed_images
=
[
transform
(
datum
)
for
datum
in
data
]
pixel_values
=
torch
.
stack
(
transformed_images
,
dim
=
0
)
return
MultiModal
Input
s
({
"pixel_values"
:
pixel_values
})
return
MultiModal
Kwarg
s
({
"pixel_values"
:
pixel_values
})
def
build_normalization_transform
(
image_size
:
int
)
->
transforms
.
Compose
:
...
...
vllm/model_executor/models/qwen2_audio.py
View file @
e0191a95
...
...
@@ -42,7 +42,7 @@ from vllm.model_executor.model_loader.weight_utils import (
default_weight_loader
,
maybe_remap_kv_scale_name
)
from
vllm.model_executor.models.qwen2
import
Qwen2Model
from
vllm.model_executor.sampling_metadata
import
SamplingMetadata
from
vllm.multimodal
import
MULTIMODAL_REGISTRY
,
MultiModal
Input
s
from
vllm.multimodal
import
MULTIMODAL_REGISTRY
,
MultiModal
Kwarg
s
from
vllm.multimodal.utils
import
consecutive_placeholder_ranges
from
vllm.sequence
import
IntermediateTensors
,
SequenceData
...
...
@@ -221,13 +221,13 @@ def input_processor_for_qwen2_audio(
def
input_mapper_for_qwen2_audio
(
ctx
:
InputContext
,
multi_modal_data
:
Union
[
np
.
ndarray
,
List
[
np
.
ndarray
]],
)
->
MultiModal
Input
s
:
)
->
MultiModal
Kwarg
s
:
"""Input mapper for Qwen2-Audio."""
if
not
isinstance
(
multi_modal_data
,
list
):
multi_modal_data
=
[
multi_modal_data
]
if
len
(
multi_modal_data
)
==
0
:
return
MultiModal
Input
s
()
return
MultiModal
Kwarg
s
()
processor
=
cached_get_processor
(
ctx
.
model_config
.
model
)
audio_feature_extractor
=
processor
.
feature_extractor
...
...
@@ -254,7 +254,7 @@ def input_mapper_for_qwen2_audio(
logger
.
error
(
"Failed to process audio (%s)"
,
multi_modal_data
)
raise
return
MultiModal
Input
s
(
batch_data
)
return
MultiModal
Kwarg
s
(
batch_data
)
@
INPUT_REGISTRY
.
register_dummy_data
(
dummy_data_for_qwen2_audio
)
...
...
vllm/model_executor/models/qwen2_vl.py
View file @
e0191a95
...
...
@@ -57,7 +57,7 @@ from vllm.model_executor.layers.vocab_parallel_embedding import ParallelLMHead
from
vllm.model_executor.model_loader.weight_utils
import
default_weight_loader
from
vllm.model_executor.models.qwen2
import
Qwen2Model
from
vllm.multimodal
import
(
MULTIMODAL_REGISTRY
,
MultiModalDataDict
,
MultiModal
Input
s
)
MultiModal
Kwarg
s
)
from
vllm.multimodal.base
import
MultiModalData
from
vllm.multimodal.image
import
cached_get_image_processor
from
vllm.multimodal.utils
import
cached_get_tokenizer
...
...
@@ -576,10 +576,10 @@ def mm_input_mapper_for_qwen2_vl(
*
,
min_pixels
:
Optional
[
int
]
=
None
,
max_pixels
:
Optional
[
int
]
=
None
,
)
->
MultiModal
Input
s
:
)
->
MultiModal
Kwarg
s
:
"""Input mapper for Qwen2-VL."""
if
data_type_key
==
"image"
and
isinstance
(
data
,
dict
):
return
MultiModal
Input
s
({
return
MultiModal
Kwarg
s
({
"image_embeds"
:
data
.
get
(
"image_embeds"
),
"image_grid_thw"
:
data
.
get
(
"image_grid_thw"
),
})
...
...
@@ -613,7 +613,7 @@ def mm_input_mapper_for_qwen2_vl(
logger
.
error
(
"Failed to process image (%s)"
,
data
)
raise
return
MultiModal
Input
s
(
batch_data
)
return
MultiModal
Kwarg
s
(
batch_data
)
image_input_mapper_for_qwen2_vl
=
partial
(
mm_input_mapper_for_qwen2_vl
,
...
...
vllm/model_executor/models/ultravox.py
View file @
e0191a95
...
...
@@ -24,7 +24,7 @@ from vllm.model_executor.layers.quantization import QuantizationConfig
from
vllm.model_executor.layers.sampler
import
SamplerOutput
,
get_sampler
from
vllm.model_executor.model_loader.loader
import
DefaultModelLoader
from
vllm.model_executor.sampling_metadata
import
SamplingMetadata
from
vllm.multimodal
import
(
MULTIMODAL_REGISTRY
,
MultiModal
Input
s
,
from
vllm.multimodal
import
(
MULTIMODAL_REGISTRY
,
MultiModal
Kwarg
s
,
NestedTensors
)
from
vllm.multimodal.utils
import
(
cached_get_tokenizer
,
consecutive_placeholder_ranges
,
...
...
@@ -116,11 +116,11 @@ def input_mapper_for_ultravox(ctx: InputContext, data: object):
data
=
[
data
]
if
len
(
data
)
==
0
:
return
MultiModal
Input
s
()
return
MultiModal
Kwarg
s
()
# If the audio inputs are embeddings, no need for preprocessing
if
is_list_of
(
data
,
torch
.
Tensor
,
check
=
"all"
):
return
MultiModal
Input
s
({
"audio_embeds"
:
data
})
return
MultiModal
Kwarg
s
({
"audio_embeds"
:
data
})
audio_features
=
[]
for
audio_input
in
data
:
...
...
@@ -154,7 +154,7 @@ def input_mapper_for_ultravox(ctx: InputContext, data: object):
# Remove the batch dimension because we're wrapping it in a list.
audio_features
.
append
(
single_audio_features
.
squeeze
(
0
))
return
MultiModal
Input
s
({
"audio_features"
:
audio_features
})
return
MultiModal
Kwarg
s
({
"audio_features"
:
audio_features
})
def
input_processor_for_ultravox
(
ctx
:
InputContext
,
inputs
:
DecoderOnlyInputs
):
...
...
vllm/multimodal/__init__.py
View file @
e0191a95
from
.base
import
(
BatchedTensorInputs
,
MultiModalDataBuiltins
,
MultiModalDataDict
,
MultiModal
Input
s
,
MultiModalDataDict
,
MultiModal
Kwarg
s
,
MultiModalPlaceholderDict
,
MultiModalPlaceholderMap
,
MultiModalPlugin
,
NestedTensors
)
from
.registry
import
MultiModalRegistry
...
...
@@ -17,7 +17,7 @@ __all__ = [
"BatchedTensorInputs"
,
"MultiModalDataBuiltins"
,
"MultiModalDataDict"
,
"MultiModal
Input
s"
,
"MultiModal
Kwarg
s"
,
"MultiModalPlaceholderDict"
,
"MultiModalPlaceholderMap"
,
"MultiModalPlugin"
,
...
...
@@ -25,3 +25,18 @@ __all__ = [
"MULTIMODAL_REGISTRY"
,
"MultiModalRegistry"
,
]
def
__getattr__
(
name
:
str
):
import
warnings
if
name
==
"MultiModalInputs"
:
msg
=
(
"MultiModalInputs has been renamed to MultiModalKwargs. "
"The original name will take another meaning in an upcoming "
"version."
)
warnings
.
warn
(
DeprecationWarning
(
msg
),
stacklevel
=
2
)
return
MultiModalKwargs
raise
AttributeError
(
f
"module
{
__name__
!
r
}
has no attribute
{
name
!
r
}
"
)
vllm/multimodal/audio.py
View file @
e0191a95
from
vllm.inputs.registry
import
InputContext
from
vllm.multimodal.base
import
MultiModal
Input
s
,
MultiModalPlugin
from
vllm.multimodal.base
import
MultiModal
Kwarg
s
,
MultiModalPlugin
class
AudioPlugin
(
MultiModalPlugin
):
...
...
@@ -9,7 +9,7 @@ class AudioPlugin(MultiModalPlugin):
return
"audio"
def
_default_input_mapper
(
self
,
ctx
:
InputContext
,
data
:
object
,
**
mm_processor_kwargs
)
->
MultiModal
Input
s
:
**
mm_processor_kwargs
)
->
MultiModal
Kwarg
s
:
raise
NotImplementedError
(
"There is no default audio input mapper"
)
def
_default_max_multimodal_tokens
(
self
,
ctx
:
InputContext
)
->
int
:
...
...
vllm/multimodal/base.py
View file @
e0191a95
...
...
@@ -30,15 +30,15 @@ Uses a list instead of a tensor if the dimensions of each element do not match.
BatchedTensorInputs
:
TypeAlias
=
Dict
[
str
,
NestedTensors
]
"""
A dictionary containing nested tensors which have been batched via
:meth:`MultiModal
Input
s.batch`.
:meth:`MultiModal
Kwarg
s.batch`.
"""
class
_MultiModal
Input
sBase
(
UserDict
[
str
,
NestedTensors
]):
class
_MultiModal
Kwarg
sBase
(
UserDict
[
str
,
NestedTensors
]):
pass
class
MultiModal
Input
s
(
_MultiModal
Input
sBase
):
class
MultiModal
Kwarg
s
(
_MultiModal
Kwarg
sBase
):
"""
A dictionary that represents the keyword arguments to
:meth:`~torch.nn.Module.forward`.
...
...
@@ -58,7 +58,7 @@ class MultiModalInputs(_MultiModalInputsBase):
if
isinstance
(
nested_tensors
,
(
int
,
float
)):
return
torch
.
tensor
(
nested_tensors
)
stacked
=
[
MultiModal
Input
s
.
_try_stack
(
t
)
for
t
in
nested_tensors
]
stacked
=
[
MultiModal
Kwarg
s
.
_try_stack
(
t
)
for
t
in
nested_tensors
]
if
not
is_list_of
(
stacked
,
torch
.
Tensor
,
check
=
"all"
):
# Only tensors (not lists) can be stacked.
return
stacked
...
...
@@ -71,7 +71,7 @@ class MultiModalInputs(_MultiModalInputsBase):
return
torch
.
stack
(
tensors_
)
@
staticmethod
def
batch
(
inputs_list
:
List
[
"MultiModal
Input
s"
])
->
BatchedTensorInputs
:
def
batch
(
inputs_list
:
List
[
"MultiModal
Kwarg
s"
])
->
BatchedTensorInputs
:
"""
Batch multiple inputs together into a dictionary.
...
...
@@ -95,7 +95,7 @@ class MultiModalInputs(_MultiModalInputsBase):
item_lists
[
k
].
append
(
v
)
return
{
k
:
MultiModal
Input
s
.
_try_stack
(
item_list
)
k
:
MultiModal
Kwarg
s
.
_try_stack
(
item_list
)
for
k
,
item_list
in
item_lists
.
items
()
}
...
...
@@ -177,7 +177,7 @@ A dictionary containing placeholder ranges.
"""
MultiModalInputMapper
=
Callable
[[
InputContext
,
MultiModalData
[
object
]],
MultiModal
Input
s
]
MultiModal
Kwarg
s
]
"""
Return a dictionary to be passed as keyword arguments to
:meth:`~torch.nn.Module.forward`. This is similar in concept to tokenizers
...
...
@@ -226,7 +226,7 @@ class MultiModalPlugin(ABC):
ctx
:
InputContext
,
data
:
MultiModalData
[
object
],
**
mm_processor_kwargs
,
)
->
MultiModal
Input
s
:
)
->
MultiModal
Kwarg
s
:
"""
Return a dictionary to be passed as keyword arguments to
:meth:`~torch.nn.Module.forward`. This is similar in concept to
...
...
@@ -275,7 +275,7 @@ class MultiModalPlugin(ABC):
model_config
:
"ModelConfig"
,
data
:
MultiModalData
[
object
],
mm_processor_kwargs
:
Dict
[
str
,
Any
],
)
->
MultiModal
Input
s
:
)
->
MultiModal
Kwarg
s
:
"""
Transform the data into a dictionary of model inputs using the
input mapper registered for that model.
...
...
@@ -585,3 +585,18 @@ class MultiModalPlaceholderMap:
return
MultiModalPlaceholderMap
.
IndexMap
(
src
=
src_indices
,
dest
=
dest_indices
)
def
__getattr__
(
name
:
str
):
import
warnings
if
name
==
"MultiModalInputs"
:
msg
=
(
"MultiModalInputs has been renamed to MultiModalKwargs. "
"The original name will take another meaning in an upcoming "
"version."
)
warnings
.
warn
(
DeprecationWarning
(
msg
),
stacklevel
=
2
)
return
MultiModalKwargs
raise
AttributeError
(
f
"module
{
__name__
!
r
}
has no attribute
{
name
!
r
}
"
)
vllm/multimodal/image.py
View file @
e0191a95
...
...
@@ -10,7 +10,7 @@ from vllm.logger import init_logger
from
vllm.transformers_utils.processor
import
get_image_processor
from
vllm.utils
import
is_list_of
from
.base
import
MultiModalData
,
MultiModal
Input
s
,
MultiModalPlugin
from
.base
import
MultiModalData
,
MultiModal
Kwarg
s
,
MultiModalPlugin
if
TYPE_CHECKING
:
from
vllm.config
import
ModelConfig
...
...
@@ -43,12 +43,12 @@ class ImagePlugin(MultiModalPlugin):
ctx
:
InputContext
,
data
:
MultiModalData
[
object
],
**
mm_processor_kwargs
,
)
->
MultiModal
Input
s
:
)
->
MultiModal
Kwarg
s
:
model_config
=
ctx
.
model_config
# Processed by input processor
if
isinstance
(
data
,
BatchFeature
):
return
MultiModal
Input
s
(
data
.
data
)
return
MultiModal
Kwarg
s
(
data
.
data
)
# PIL image
if
isinstance
(
data
,
Image
.
Image
)
or
is_list_of
(
data
,
Image
.
Image
):
...
...
@@ -78,11 +78,11 @@ class ImagePlugin(MultiModalPlugin):
type
(
image_processor
).
__name__
)
raise
return
MultiModal
Input
s
(
batch_data
)
return
MultiModal
Kwarg
s
(
batch_data
)
# Image embedding
elif
isinstance
(
data
,
torch
.
Tensor
)
or
is_list_of
(
data
,
torch
.
Tensor
):
return
MultiModal
Input
s
({
"image_embeds"
:
data
})
return
MultiModal
Kwarg
s
({
"image_embeds"
:
data
})
raise
TypeError
(
f
"Invalid image type:
{
type
(
data
)
}
"
)
...
...
Prev
1
2
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment