Commit e0191a95 (unverified), authored by Cyrus Leung, committed by GitHub
Browse files

[0/N] Rename `MultiModalInputs` to `MultiModalKwargs` (#10040)


Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
parent d7edca1d
...@@ -53,7 +53,7 @@ Base Classes ...@@ -53,7 +53,7 @@ Base Classes
.. autodata:: vllm.multimodal.MultiModalDataDict .. autodata:: vllm.multimodal.MultiModalDataDict
.. autoclass:: vllm.multimodal.MultiModalInputs .. autoclass:: vllm.multimodal.MultiModalKwargs
:members: :members:
:show-inheritance: :show-inheritance:
......
...@@ -6,7 +6,7 @@ import torch ...@@ -6,7 +6,7 @@ import torch
from PIL.Image import Image from PIL.Image import Image
from vllm.inputs import InputContext, token_inputs from vllm.inputs import InputContext, token_inputs
from vllm.multimodal.base import MultiModalInputs from vllm.multimodal.base import MultiModalKwargs
from vllm.multimodal.utils import cached_get_tokenizer from vllm.multimodal.utils import cached_get_tokenizer
from .....conftest import IMAGE_ASSETS from .....conftest import IMAGE_ASSETS
...@@ -96,7 +96,7 @@ def test_input_mapper_valid_mm_data(input_mapper_for_qwen, ...@@ -96,7 +96,7 @@ def test_input_mapper_valid_mm_data(input_mapper_for_qwen,
mapped_img_data = input_mapper_for_qwen(qwen_vl_context, img_data) mapped_img_data = input_mapper_for_qwen(qwen_vl_context, img_data)
# Ensure that we get the appropriately shaped pixel_values # Ensure that we get the appropriately shaped pixel_values
# for images and image embeddings, respectively. # for images and image embeddings, respectively.
assert isinstance(mapped_img_data, MultiModalInputs) assert isinstance(mapped_img_data, MultiModalKwargs)
assert "pixel_values" in mapped_img_data assert "pixel_values" in mapped_img_data
assert mapped_img_data["pixel_values"].shape == expected_shape assert mapped_img_data["pixel_values"].shape == expected_shape
......
import torch import torch
from vllm.multimodal.base import MultiModalInputs, NestedTensors from vllm.multimodal.base import MultiModalKwargs, NestedTensors
def assert_nested_tensors_equal(expected: NestedTensors, def assert_nested_tensors_equal(expected: NestedTensors,
...@@ -13,8 +13,8 @@ def assert_nested_tensors_equal(expected: NestedTensors, ...@@ -13,8 +13,8 @@ def assert_nested_tensors_equal(expected: NestedTensors,
assert_nested_tensors_equal(expected_item, actual_item) assert_nested_tensors_equal(expected_item, actual_item)
def assert_multimodal_inputs_equal(expected: MultiModalInputs, def assert_multimodal_inputs_equal(expected: MultiModalKwargs,
actual: MultiModalInputs): actual: MultiModalKwargs):
assert set(expected.keys()) == set(actual.keys()) assert set(expected.keys()) == set(actual.keys())
for key in expected: for key in expected:
assert_nested_tensors_equal(expected[key], actual[key]) assert_nested_tensors_equal(expected[key], actual[key])
...@@ -22,7 +22,7 @@ def assert_multimodal_inputs_equal(expected: MultiModalInputs, ...@@ -22,7 +22,7 @@ def assert_multimodal_inputs_equal(expected: MultiModalInputs,
def test_multimodal_input_batch_single_tensor(): def test_multimodal_input_batch_single_tensor():
t = torch.rand([1, 2]) t = torch.rand([1, 2])
result = MultiModalInputs.batch([{"image": t}]) result = MultiModalKwargs.batch([{"image": t}])
assert_multimodal_inputs_equal(result, {"image": t.unsqueeze(0)}) assert_multimodal_inputs_equal(result, {"image": t.unsqueeze(0)})
...@@ -30,7 +30,7 @@ def test_multimodal_input_batch_multiple_tensors(): ...@@ -30,7 +30,7 @@ def test_multimodal_input_batch_multiple_tensors():
a = torch.rand([1, 1, 2]) a = torch.rand([1, 1, 2])
b = torch.rand([1, 1, 2]) b = torch.rand([1, 1, 2])
c = torch.rand([1, 1, 2]) c = torch.rand([1, 1, 2])
result = MultiModalInputs.batch([{"image": a}, {"image": b}, {"image": c}]) result = MultiModalKwargs.batch([{"image": a}, {"image": b}, {"image": c}])
assert_multimodal_inputs_equal(result, {"image": torch.stack([a, b, c])}) assert_multimodal_inputs_equal(result, {"image": torch.stack([a, b, c])})
...@@ -38,7 +38,7 @@ def test_multimodal_input_batch_multiple_heterogeneous_tensors(): ...@@ -38,7 +38,7 @@ def test_multimodal_input_batch_multiple_heterogeneous_tensors():
a = torch.rand([1, 2, 2]) a = torch.rand([1, 2, 2])
b = torch.rand([1, 3, 2]) b = torch.rand([1, 3, 2])
c = torch.rand([1, 4, 2]) c = torch.rand([1, 4, 2])
result = MultiModalInputs.batch([{"image": a}, {"image": b}, {"image": c}]) result = MultiModalKwargs.batch([{"image": a}, {"image": b}, {"image": c}])
assert_multimodal_inputs_equal(result, {"image": [a, b, c]}) assert_multimodal_inputs_equal(result, {"image": [a, b, c]})
...@@ -46,7 +46,7 @@ def test_multimodal_input_batch_nested_tensors(): ...@@ -46,7 +46,7 @@ def test_multimodal_input_batch_nested_tensors():
a = torch.rand([2, 3]) a = torch.rand([2, 3])
b = torch.rand([2, 3]) b = torch.rand([2, 3])
c = torch.rand([2, 3]) c = torch.rand([2, 3])
result = MultiModalInputs.batch([{ result = MultiModalKwargs.batch([{
"image": [a] "image": [a]
}, { }, {
"image": [b] "image": [b]
...@@ -65,7 +65,7 @@ def test_multimodal_input_batch_heterogeneous_lists(): ...@@ -65,7 +65,7 @@ def test_multimodal_input_batch_heterogeneous_lists():
a = torch.rand([1, 2, 3]) a = torch.rand([1, 2, 3])
b = torch.rand([1, 2, 3]) b = torch.rand([1, 2, 3])
c = torch.rand([1, 2, 3]) c = torch.rand([1, 2, 3])
result = MultiModalInputs.batch([{"image": [a, b]}, {"image": [c]}]) result = MultiModalKwargs.batch([{"image": [a, b]}, {"image": [c]}])
assert_multimodal_inputs_equal( assert_multimodal_inputs_equal(
result, result,
{"image": [torch.stack([a, b]), c.unsqueeze(0)]}) {"image": [torch.stack([a, b]), c.unsqueeze(0)]})
...@@ -76,7 +76,7 @@ def test_multimodal_input_batch_multiple_batchable_lists(): ...@@ -76,7 +76,7 @@ def test_multimodal_input_batch_multiple_batchable_lists():
b = torch.rand([1, 2, 3]) b = torch.rand([1, 2, 3])
c = torch.rand([1, 2, 3]) c = torch.rand([1, 2, 3])
d = torch.rand([1, 2, 3]) d = torch.rand([1, 2, 3])
result = MultiModalInputs.batch([{"image": [a, b]}, {"image": [c, d]}]) result = MultiModalKwargs.batch([{"image": [a, b]}, {"image": [c, d]}])
assert_multimodal_inputs_equal( assert_multimodal_inputs_equal(
result, result,
{"image": torch.stack([torch.stack([a, b]), {"image": torch.stack([torch.stack([a, b]),
...@@ -88,8 +88,8 @@ def test_multimodal_input_batch_mixed_stacking_depths(): ...@@ -88,8 +88,8 @@ def test_multimodal_input_batch_mixed_stacking_depths():
b = torch.rand([1, 3, 3]) b = torch.rand([1, 3, 3])
c = torch.rand([1, 4, 3]) c = torch.rand([1, 4, 3])
result = MultiModalInputs.batch([{"image": [a, b]}, {"image": [c]}]) result = MultiModalKwargs.batch([{"image": [a, b]}, {"image": [c]}])
assert_multimodal_inputs_equal(result, {"image": [[a, b], c.unsqueeze(0)]}) assert_multimodal_inputs_equal(result, {"image": [[a, b], c.unsqueeze(0)]})
result = MultiModalInputs.batch([{"image": [a]}, {"image": [b, c]}]) result = MultiModalKwargs.batch([{"image": [a]}, {"image": [b, c]}])
assert_multimodal_inputs_equal(result, {"image": [a.unsqueeze(0), [b, c]]}) assert_multimodal_inputs_equal(result, {"image": [a.unsqueeze(0), [b, c]]})
...@@ -30,7 +30,7 @@ from vllm.model_executor.layers.vocab_parallel_embedding import ( ...@@ -30,7 +30,7 @@ from vllm.model_executor.layers.vocab_parallel_embedding import (
from vllm.model_executor.model_loader.weight_utils import default_weight_loader from vllm.model_executor.model_loader.weight_utils import default_weight_loader
from vllm.model_executor.models.glm4_vision_encoder import EVA2CLIPModel from vllm.model_executor.models.glm4_vision_encoder import EVA2CLIPModel
from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.model_executor.sampling_metadata import SamplingMetadata
from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalInputs from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalKwargs
from vllm.multimodal.base import MultiModalData from vllm.multimodal.base import MultiModalData
from vllm.multimodal.utils import cached_get_tokenizer from vllm.multimodal.utils import cached_get_tokenizer
from vllm.sequence import (VLLM_TOKEN_ID_ARRAY_TYPE, IntermediateTensors, from vllm.sequence import (VLLM_TOKEN_ID_ARRAY_TYPE, IntermediateTensors,
...@@ -74,7 +74,7 @@ def mm_input_mapper_for_glmv( ...@@ -74,7 +74,7 @@ def mm_input_mapper_for_glmv(
raise raise
pixel_values = raw_batch_data['images'] pixel_values = raw_batch_data['images']
return MultiModalInputs({'pixel_values': pixel_values}) return MultiModalKwargs({'pixel_values': pixel_values})
def merge_glm_vision_embeddings( def merge_glm_vision_embeddings(
......
...@@ -34,7 +34,7 @@ from vllm.model_executor.layers.sampler import SamplerOutput ...@@ -34,7 +34,7 @@ from vllm.model_executor.layers.sampler import SamplerOutput
from vllm.model_executor.models.persimmon import PersimmonForCausalLM from vllm.model_executor.models.persimmon import PersimmonForCausalLM
from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.model_executor.sampling_metadata import SamplingMetadata
from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal import MULTIMODAL_REGISTRY
from vllm.multimodal.base import MultiModalInputs from vllm.multimodal.base import MultiModalKwargs
from vllm.multimodal.image import cached_get_image_processor from vllm.multimodal.image import cached_get_image_processor
from vllm.multimodal.utils import (cached_get_tokenizer, from vllm.multimodal.utils import (cached_get_tokenizer,
consecutive_placeholder_ranges) consecutive_placeholder_ranges)
...@@ -218,7 +218,7 @@ def input_mapper_for_fuyu(ctx: InputContext, data: object): ...@@ -218,7 +218,7 @@ def input_mapper_for_fuyu(ctx: InputContext, data: object):
]) ])
# image has been processed with prompt in input processor # image has been processed with prompt in input processor
return MultiModalInputs({"pixel_values": data}) return MultiModalKwargs({"pixel_values": data})
@MULTIMODAL_REGISTRY.register_image_input_mapper(input_mapper_for_fuyu) @MULTIMODAL_REGISTRY.register_image_input_mapper(input_mapper_for_fuyu)
......
...@@ -16,7 +16,7 @@ from vllm.inputs import (INPUT_REGISTRY, DecoderOnlyInputs, InputContext, ...@@ -16,7 +16,7 @@ from vllm.inputs import (INPUT_REGISTRY, DecoderOnlyInputs, InputContext,
token_inputs) token_inputs)
from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.layers.quantization import QuantizationConfig
from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal import MULTIMODAL_REGISTRY
from vllm.multimodal.base import MultiModalInputs from vllm.multimodal.base import MultiModalKwargs
from vllm.multimodal.utils import cached_get_tokenizer from vllm.multimodal.utils import cached_get_tokenizer
from vllm.utils import is_list_of from vllm.utils import is_list_of
...@@ -324,12 +324,12 @@ class H2OVLInputPipeline(InternVLInputPipeline): ...@@ -324,12 +324,12 @@ class H2OVLInputPipeline(InternVLInputPipeline):
data: object, data: object,
*, *,
max_dynamic_patch: Optional[int] = None, max_dynamic_patch: Optional[int] = None,
) -> MultiModalInputs: ) -> MultiModalKwargs:
# NOTE: Preprocessing for the image data is done in the # NOTE: Preprocessing for the image data is done in the
# 'input_processor' function during actual inference. # 'input_processor' function during actual inference.
if isinstance(data, dict): if isinstance(data, dict):
return MultiModalInputs(data) return MultiModalKwargs(data)
# The section below is only used with dummy data during # The section below is only used with dummy data during
# memory profiling. # memory profiling.
...@@ -347,7 +347,7 @@ class H2OVLInputPipeline(InternVLInputPipeline): ...@@ -347,7 +347,7 @@ class H2OVLInputPipeline(InternVLInputPipeline):
pixel_values = [image_pixel_values_mapper(img) for img in data] pixel_values = [image_pixel_values_mapper(img) for img in data]
else: else:
return MultiModalInputs({"image_embeds": data}) return MultiModalKwargs({"image_embeds": data})
model_config = ctx.model_config model_config = ctx.model_config
tokenizer = cached_get_tokenizer( tokenizer = cached_get_tokenizer(
model_config.tokenizer, model_config.tokenizer,
...@@ -359,7 +359,7 @@ class H2OVLInputPipeline(InternVLInputPipeline): ...@@ -359,7 +359,7 @@ class H2OVLInputPipeline(InternVLInputPipeline):
return_tensors="pt", return_tensors="pt",
)[0] )[0]
return MultiModalInputs({ return MultiModalKwargs({
"pixel_values": pixel_values, "pixel_values": pixel_values,
"image_token_id": image_token_id "image_token_id": image_token_id
}) })
......
...@@ -36,7 +36,7 @@ from vllm.model_executor.layers.quantization import QuantizationConfig ...@@ -36,7 +36,7 @@ from vllm.model_executor.layers.quantization import QuantizationConfig
from vllm.model_executor.layers.sampler import Sampler, SamplerOutput from vllm.model_executor.layers.sampler import Sampler, SamplerOutput
from vllm.model_executor.layers.vocab_parallel_embedding import ParallelLMHead from vllm.model_executor.layers.vocab_parallel_embedding import ParallelLMHead
from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.model_executor.sampling_metadata import SamplingMetadata
from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalInputs from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalKwargs
from vllm.multimodal.image import cached_get_image_processor from vllm.multimodal.image import cached_get_image_processor
from vllm.sequence import IntermediateTensors, SequenceData from vllm.sequence import IntermediateTensors, SequenceData
from vllm.transformers_utils.processor import cached_get_processor from vllm.transformers_utils.processor import cached_get_processor
...@@ -127,7 +127,7 @@ def input_mapper_for_idefics3( ...@@ -127,7 +127,7 @@ def input_mapper_for_idefics3(
logger.error("Failed to process image (%s)", data) logger.error("Failed to process image (%s)", data)
raise raise
return MultiModalInputs(batch_data) return MultiModalKwargs(batch_data)
def _resize_output_size(height: int, def _resize_output_size(height: int,
......
...@@ -26,7 +26,7 @@ from vllm.model_executor.models.intern_vit import (InternVisionModel, ...@@ -26,7 +26,7 @@ from vllm.model_executor.models.intern_vit import (InternVisionModel,
InternVisionPatchModel) InternVisionPatchModel)
from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.model_executor.sampling_metadata import SamplingMetadata
from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal import MULTIMODAL_REGISTRY
from vllm.multimodal.base import MultiModalInputs from vllm.multimodal.base import MultiModalKwargs
from vllm.multimodal.utils import cached_get_tokenizer from vllm.multimodal.utils import cached_get_tokenizer
from vllm.sequence import IntermediateTensors from vllm.sequence import IntermediateTensors
from vllm.utils import is_list_of from vllm.utils import is_list_of
...@@ -346,7 +346,7 @@ class InternVLInputPipeline: ...@@ -346,7 +346,7 @@ class InternVLInputPipeline:
# we can't stack here because images may have different num_patches # we can't stack here because images may have different num_patches
data = [image_pixel_values_mapper(img) for img in data] data = [image_pixel_values_mapper(img) for img in data]
else: else:
return MultiModalInputs({"image_embeds": data}) return MultiModalKwargs({"image_embeds": data})
model_config = ctx.model_config model_config = ctx.model_config
tokenizer = cached_get_tokenizer( tokenizer = cached_get_tokenizer(
model_config.tokenizer, model_config.tokenizer,
...@@ -355,7 +355,7 @@ class InternVLInputPipeline: ...@@ -355,7 +355,7 @@ class InternVLInputPipeline:
add_special_tokens=False, add_special_tokens=False,
return_tensors="pt")[0] return_tensors="pt")[0]
return MultiModalInputs({ return MultiModalKwargs({
"pixel_values": data, "pixel_values": data,
"image_token_id": image_token_id "image_token_id": image_token_id
}) })
......
...@@ -52,7 +52,7 @@ from vllm.model_executor.models.qwen2 import Qwen2Model ...@@ -52,7 +52,7 @@ from vllm.model_executor.models.qwen2 import Qwen2Model
from vllm.model_executor.models.utils import LLMWrapper from vllm.model_executor.models.utils import LLMWrapper
from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.model_executor.sampling_metadata import SamplingMetadata
from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal import MULTIMODAL_REGISTRY
from vllm.multimodal.base import MultiModalInputs from vllm.multimodal.base import MultiModalKwargs
from vllm.multimodal.image import cached_get_image_processor from vllm.multimodal.image import cached_get_image_processor
from vllm.multimodal.utils import cached_get_tokenizer from vllm.multimodal.utils import cached_get_tokenizer
from vllm.sequence import IntermediateTensors, SequenceData from vllm.sequence import IntermediateTensors, SequenceData
...@@ -374,7 +374,7 @@ def input_mapper_for_minicpmv(ctx: InputContext, data: object): ...@@ -374,7 +374,7 @@ def input_mapper_for_minicpmv(ctx: InputContext, data: object):
batch_data["slice_start_id"] = data[0]["slice_start_id"] batch_data["slice_start_id"] = data[0]["slice_start_id"]
batch_data["slice_end_id"] = data[0]["slice_end_id"] batch_data["slice_end_id"] = data[0]["slice_end_id"]
return MultiModalInputs(batch_data) return MultiModalKwargs(batch_data)
class MiniCPMVBaseModel(nn.Module, SupportsMultiModal, SupportsPP): class MiniCPMVBaseModel(nn.Module, SupportsMultiModal, SupportsPP):
......
...@@ -1162,7 +1162,7 @@ class MllamaForConditionalGeneration(nn.Module, SupportsMultiModal): ...@@ -1162,7 +1162,7 @@ class MllamaForConditionalGeneration(nn.Module, SupportsMultiModal):
def _parse_and_validate_image_input(self, **kwargs: object): def _parse_and_validate_image_input(self, **kwargs: object):
# tensor with the same shape will be batched together by # tensor with the same shape will be batched together by
# MultiModalInputs.batch, so pixel_values here can be: # MultiModalKwargs.batch, so pixel_values here can be:
# - List[List[torch.Tensor]]: # - List[List[torch.Tensor]]:
# with shape (num_tiles, 3, image_res, image_res) # with shape (num_tiles, 3, image_res, image_res)
# - List[torch.Tensor]: # - List[torch.Tensor]:
......
...@@ -37,7 +37,7 @@ from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler ...@@ -37,7 +37,7 @@ from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler
from vllm.model_executor.layers.vocab_parallel_embedding import ( from vllm.model_executor.layers.vocab_parallel_embedding import (
ParallelLMHead, VocabParallelEmbedding) ParallelLMHead, VocabParallelEmbedding)
from vllm.model_executor.model_loader.weight_utils import default_weight_loader from vllm.model_executor.model_loader.weight_utils import default_weight_loader
from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalInputs from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalKwargs
from vllm.multimodal.utils import cached_get_tokenizer from vllm.multimodal.utils import cached_get_tokenizer
from vllm.sequence import (VLLM_TOKEN_ID_ARRAY_TYPE, IntermediateTensors, from vllm.sequence import (VLLM_TOKEN_ID_ARRAY_TYPE, IntermediateTensors,
SequenceData) SequenceData)
...@@ -866,7 +866,7 @@ def image_input_mapper_for_molmo( ...@@ -866,7 +866,7 @@ def image_input_mapper_for_molmo(
ctx: InputContext, ctx: InputContext,
data: object, data: object,
): ):
return MultiModalInputs(data) return MultiModalKwargs(data)
def dummy_data_for_molmo(ctx: InputContext, seq_len: int, def dummy_data_for_molmo(ctx: InputContext, seq_len: int,
......
...@@ -30,7 +30,7 @@ from vllm.model_executor.model_loader.weight_utils import default_weight_loader ...@@ -30,7 +30,7 @@ from vllm.model_executor.model_loader.weight_utils import default_weight_loader
from vllm.model_executor.models.utils import merge_multimodal_embeddings from vllm.model_executor.models.utils import merge_multimodal_embeddings
from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.model_executor.sampling_metadata import SamplingMetadata
from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal import MULTIMODAL_REGISTRY
from vllm.multimodal.base import MultiModalInputs from vllm.multimodal.base import MultiModalKwargs
from vllm.multimodal.utils import (cached_get_tokenizer, from vllm.multimodal.utils import (cached_get_tokenizer,
consecutive_placeholder_ranges) consecutive_placeholder_ranges)
from vllm.sequence import IntermediateTensors, SequenceData from vllm.sequence import IntermediateTensors, SequenceData
...@@ -94,8 +94,8 @@ def dummy_data_for_pixtral(ctx: InputContext, seq_len: int, ...@@ -94,8 +94,8 @@ def dummy_data_for_pixtral(ctx: InputContext, seq_len: int,
def input_mapper_for_pixtral(ctx: InputContext, def input_mapper_for_pixtral(ctx: InputContext,
data: object) -> MultiModalInputs: data: object) -> MultiModalKwargs:
"""Maps the input data to its MultiModalInputs (if any). """Maps the input data to its MultiModalKwargs (if any).
Args: Args:
ctx: Context of the loaded model. ctx: Context of the loaded model.
...@@ -103,7 +103,7 @@ def input_mapper_for_pixtral(ctx: InputContext, ...@@ -103,7 +103,7 @@ def input_mapper_for_pixtral(ctx: InputContext,
to pixel_values in .forward() for a visual QWenLMHeadModel model. to pixel_values in .forward() for a visual QWenLMHeadModel model.
Returns: Returns:
MultiModalInputs containing the stacked normalized images tensor or MultiModalKwargs containing the stacked normalized images tensor or
image embeddings. image embeddings.
""" """
# Early exit if we have provided an image to a language only Qwen model # Early exit if we have provided an image to a language only Qwen model
...@@ -121,7 +121,7 @@ def input_mapper_for_pixtral(ctx: InputContext, ...@@ -121,7 +121,7 @@ def input_mapper_for_pixtral(ctx: InputContext,
dtype=torch.float16) dtype=torch.float16)
images.append(image) images.append(image)
return MultiModalInputs({"images": images}) return MultiModalKwargs({"images": images})
def input_processor_for_pixtral(ctx: InputContext, inputs: DecoderOnlyInputs): def input_processor_for_pixtral(ctx: InputContext, inputs: DecoderOnlyInputs):
......
...@@ -43,7 +43,7 @@ from vllm.model_executor.model_loader.weight_utils import default_weight_loader ...@@ -43,7 +43,7 @@ from vllm.model_executor.model_loader.weight_utils import default_weight_loader
from vllm.model_executor.models.module_mapping import MultiModelKeys from vllm.model_executor.models.module_mapping import MultiModelKeys
from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.model_executor.sampling_metadata import SamplingMetadata
from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal import MULTIMODAL_REGISTRY
from vllm.multimodal.base import MultiModalInputs from vllm.multimodal.base import MultiModalKwargs
from vllm.multimodal.utils import cached_get_tokenizer from vllm.multimodal.utils import cached_get_tokenizer
from vllm.sequence import IntermediateTensors, SequenceData from vllm.sequence import IntermediateTensors, SequenceData
from vllm.utils import is_list_of from vllm.utils import is_list_of
...@@ -722,8 +722,8 @@ def input_processor_for_qwen(ctx: InputContext, ...@@ -722,8 +722,8 @@ def input_processor_for_qwen(ctx: InputContext,
multi_modal_data=multi_modal_data) multi_modal_data=multi_modal_data)
def input_mapper_for_qwen(ctx: InputContext, data: object) -> MultiModalInputs: def input_mapper_for_qwen(ctx: InputContext, data: object) -> MultiModalKwargs:
"""Maps the input data to its MultiModalInputs (if any). """Maps the input data to its MultiModalKwargs (if any).
Args: Args:
ctx: Context of the loaded model. ctx: Context of the loaded model.
...@@ -731,7 +731,7 @@ def input_mapper_for_qwen(ctx: InputContext, data: object) -> MultiModalInputs: ...@@ -731,7 +731,7 @@ def input_mapper_for_qwen(ctx: InputContext, data: object) -> MultiModalInputs:
to pixel_values in .forward() for a visual QWenLMHeadModel model. to pixel_values in .forward() for a visual QWenLMHeadModel model.
Returns: Returns:
MultiModalInputs containing the stacked normalized images tensor or MultiModalKwargs containing the stacked normalized images tensor or
image embeddings. image embeddings.
""" """
# Early exit if we have provided an image to a language only Qwen model # Early exit if we have provided an image to a language only Qwen model
...@@ -740,7 +740,7 @@ def input_mapper_for_qwen(ctx: InputContext, data: object) -> MultiModalInputs: ...@@ -740,7 +740,7 @@ def input_mapper_for_qwen(ctx: InputContext, data: object) -> MultiModalInputs:
logger.warning( logger.warning(
"Images were provided but this model has no visual config; " "Images were provided but this model has no visual config; "
"multimodal inputs will not be forwarded to the model.") "multimodal inputs will not be forwarded to the model.")
return MultiModalInputs() return MultiModalKwargs()
model_config = ctx.model_config model_config = ctx.model_config
tokenizer = cached_get_tokenizer( tokenizer = cached_get_tokenizer(
...@@ -784,7 +784,7 @@ def input_mapper_for_qwen(ctx: InputContext, data: object) -> MultiModalInputs: ...@@ -784,7 +784,7 @@ def input_mapper_for_qwen(ctx: InputContext, data: object) -> MultiModalInputs:
data = [data] data = [data]
transformed_images = [transform(datum) for datum in data] transformed_images = [transform(datum) for datum in data]
pixel_values = torch.stack(transformed_images, dim=0) pixel_values = torch.stack(transformed_images, dim=0)
return MultiModalInputs({"pixel_values": pixel_values}) return MultiModalKwargs({"pixel_values": pixel_values})
def build_normalization_transform(image_size: int) -> transforms.Compose: def build_normalization_transform(image_size: int) -> transforms.Compose:
......
...@@ -42,7 +42,7 @@ from vllm.model_executor.model_loader.weight_utils import ( ...@@ -42,7 +42,7 @@ from vllm.model_executor.model_loader.weight_utils import (
default_weight_loader, maybe_remap_kv_scale_name) default_weight_loader, maybe_remap_kv_scale_name)
from vllm.model_executor.models.qwen2 import Qwen2Model from vllm.model_executor.models.qwen2 import Qwen2Model
from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.model_executor.sampling_metadata import SamplingMetadata
from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalInputs from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalKwargs
from vllm.multimodal.utils import consecutive_placeholder_ranges from vllm.multimodal.utils import consecutive_placeholder_ranges
from vllm.sequence import IntermediateTensors, SequenceData from vllm.sequence import IntermediateTensors, SequenceData
...@@ -221,13 +221,13 @@ def input_processor_for_qwen2_audio( ...@@ -221,13 +221,13 @@ def input_processor_for_qwen2_audio(
def input_mapper_for_qwen2_audio( def input_mapper_for_qwen2_audio(
ctx: InputContext, ctx: InputContext,
multi_modal_data: Union[np.ndarray, List[np.ndarray]], multi_modal_data: Union[np.ndarray, List[np.ndarray]],
) -> MultiModalInputs: ) -> MultiModalKwargs:
"""Input mapper for Qwen2-Audio.""" """Input mapper for Qwen2-Audio."""
if not isinstance(multi_modal_data, list): if not isinstance(multi_modal_data, list):
multi_modal_data = [multi_modal_data] multi_modal_data = [multi_modal_data]
if len(multi_modal_data) == 0: if len(multi_modal_data) == 0:
return MultiModalInputs() return MultiModalKwargs()
processor = cached_get_processor(ctx.model_config.model) processor = cached_get_processor(ctx.model_config.model)
audio_feature_extractor = processor.feature_extractor audio_feature_extractor = processor.feature_extractor
...@@ -254,7 +254,7 @@ def input_mapper_for_qwen2_audio( ...@@ -254,7 +254,7 @@ def input_mapper_for_qwen2_audio(
logger.error("Failed to process audio (%s)", multi_modal_data) logger.error("Failed to process audio (%s)", multi_modal_data)
raise raise
return MultiModalInputs(batch_data) return MultiModalKwargs(batch_data)
@INPUT_REGISTRY.register_dummy_data(dummy_data_for_qwen2_audio) @INPUT_REGISTRY.register_dummy_data(dummy_data_for_qwen2_audio)
......
...@@ -57,7 +57,7 @@ from vllm.model_executor.layers.vocab_parallel_embedding import ParallelLMHead ...@@ -57,7 +57,7 @@ from vllm.model_executor.layers.vocab_parallel_embedding import ParallelLMHead
from vllm.model_executor.model_loader.weight_utils import default_weight_loader from vllm.model_executor.model_loader.weight_utils import default_weight_loader
from vllm.model_executor.models.qwen2 import Qwen2Model from vllm.model_executor.models.qwen2 import Qwen2Model
from vllm.multimodal import (MULTIMODAL_REGISTRY, MultiModalDataDict, from vllm.multimodal import (MULTIMODAL_REGISTRY, MultiModalDataDict,
MultiModalInputs) MultiModalKwargs)
from vllm.multimodal.base import MultiModalData from vllm.multimodal.base import MultiModalData
from vllm.multimodal.image import cached_get_image_processor from vllm.multimodal.image import cached_get_image_processor
from vllm.multimodal.utils import cached_get_tokenizer from vllm.multimodal.utils import cached_get_tokenizer
...@@ -576,10 +576,10 @@ def mm_input_mapper_for_qwen2_vl( ...@@ -576,10 +576,10 @@ def mm_input_mapper_for_qwen2_vl(
*, *,
min_pixels: Optional[int] = None, min_pixels: Optional[int] = None,
max_pixels: Optional[int] = None, max_pixels: Optional[int] = None,
) -> MultiModalInputs: ) -> MultiModalKwargs:
"""Input mapper for Qwen2-VL.""" """Input mapper for Qwen2-VL."""
if data_type_key == "image" and isinstance(data, dict): if data_type_key == "image" and isinstance(data, dict):
return MultiModalInputs({ return MultiModalKwargs({
"image_embeds": data.get("image_embeds"), "image_embeds": data.get("image_embeds"),
"image_grid_thw": data.get("image_grid_thw"), "image_grid_thw": data.get("image_grid_thw"),
}) })
...@@ -613,7 +613,7 @@ def mm_input_mapper_for_qwen2_vl( ...@@ -613,7 +613,7 @@ def mm_input_mapper_for_qwen2_vl(
logger.error("Failed to process image (%s)", data) logger.error("Failed to process image (%s)", data)
raise raise
return MultiModalInputs(batch_data) return MultiModalKwargs(batch_data)
image_input_mapper_for_qwen2_vl = partial(mm_input_mapper_for_qwen2_vl, image_input_mapper_for_qwen2_vl = partial(mm_input_mapper_for_qwen2_vl,
......
...@@ -24,7 +24,7 @@ from vllm.model_executor.layers.quantization import QuantizationConfig ...@@ -24,7 +24,7 @@ from vllm.model_executor.layers.quantization import QuantizationConfig
from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler
from vllm.model_executor.model_loader.loader import DefaultModelLoader from vllm.model_executor.model_loader.loader import DefaultModelLoader
from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.model_executor.sampling_metadata import SamplingMetadata
from vllm.multimodal import (MULTIMODAL_REGISTRY, MultiModalInputs, from vllm.multimodal import (MULTIMODAL_REGISTRY, MultiModalKwargs,
NestedTensors) NestedTensors)
from vllm.multimodal.utils import (cached_get_tokenizer, from vllm.multimodal.utils import (cached_get_tokenizer,
consecutive_placeholder_ranges, consecutive_placeholder_ranges,
...@@ -116,11 +116,11 @@ def input_mapper_for_ultravox(ctx: InputContext, data: object): ...@@ -116,11 +116,11 @@ def input_mapper_for_ultravox(ctx: InputContext, data: object):
data = [data] data = [data]
if len(data) == 0: if len(data) == 0:
return MultiModalInputs() return MultiModalKwargs()
# If the audio inputs are embeddings, no need for preprocessing # If the audio inputs are embeddings, no need for preprocessing
if is_list_of(data, torch.Tensor, check="all"): if is_list_of(data, torch.Tensor, check="all"):
return MultiModalInputs({"audio_embeds": data}) return MultiModalKwargs({"audio_embeds": data})
audio_features = [] audio_features = []
for audio_input in data: for audio_input in data:
...@@ -154,7 +154,7 @@ def input_mapper_for_ultravox(ctx: InputContext, data: object): ...@@ -154,7 +154,7 @@ def input_mapper_for_ultravox(ctx: InputContext, data: object):
# Remove the batch dimension because we're wrapping it in a list. # Remove the batch dimension because we're wrapping it in a list.
audio_features.append(single_audio_features.squeeze(0)) audio_features.append(single_audio_features.squeeze(0))
return MultiModalInputs({"audio_features": audio_features}) return MultiModalKwargs({"audio_features": audio_features})
def input_processor_for_ultravox(ctx: InputContext, inputs: DecoderOnlyInputs): def input_processor_for_ultravox(ctx: InputContext, inputs: DecoderOnlyInputs):
......
from .base import (BatchedTensorInputs, MultiModalDataBuiltins, from .base import (BatchedTensorInputs, MultiModalDataBuiltins,
MultiModalDataDict, MultiModalInputs, MultiModalDataDict, MultiModalKwargs,
MultiModalPlaceholderDict, MultiModalPlaceholderMap, MultiModalPlaceholderDict, MultiModalPlaceholderMap,
MultiModalPlugin, NestedTensors) MultiModalPlugin, NestedTensors)
from .registry import MultiModalRegistry from .registry import MultiModalRegistry
...@@ -17,7 +17,7 @@ __all__ = [ ...@@ -17,7 +17,7 @@ __all__ = [
"BatchedTensorInputs", "BatchedTensorInputs",
"MultiModalDataBuiltins", "MultiModalDataBuiltins",
"MultiModalDataDict", "MultiModalDataDict",
"MultiModalInputs", "MultiModalKwargs",
"MultiModalPlaceholderDict", "MultiModalPlaceholderDict",
"MultiModalPlaceholderMap", "MultiModalPlaceholderMap",
"MultiModalPlugin", "MultiModalPlugin",
...@@ -25,3 +25,18 @@ __all__ = [ ...@@ -25,3 +25,18 @@ __all__ = [
"MULTIMODAL_REGISTRY", "MULTIMODAL_REGISTRY",
"MultiModalRegistry", "MultiModalRegistry",
] ]
def __getattr__(name: str):
    """Module-level attribute hook that keeps the old class name importable.

    Python calls this only when normal lookup of ``name`` on the module
    fails (PEP 562), so it never shadows names that really exist here.

    Args:
        name: The attribute being looked up on this module.

    Returns:
        ``MultiModalKwargs`` when ``name`` is the deprecated alias
        ``"MultiModalInputs"``.

    Raises:
        AttributeError: For any other name, matching the message format of
            an ordinary failed module attribute lookup.

    Warns:
        DeprecationWarning: Whenever the deprecated alias is resolved.
    """
    if name != "MultiModalInputs":
        raise AttributeError(f"module {__name__!r} has no attribute {name!r}")

    # Imported lazily: only the deprecated path needs it.
    import warnings

    msg = ("MultiModalInputs has been renamed to MultiModalKwargs. "
           "The original name will take another meaning in an upcoming "
           "version.")
    # stacklevel=2 attributes the warning to the code that accessed the old
    # name rather than to this hook.
    warnings.warn(msg, DeprecationWarning, stacklevel=2)

    return MultiModalKwargs
from vllm.inputs.registry import InputContext from vllm.inputs.registry import InputContext
from vllm.multimodal.base import MultiModalInputs, MultiModalPlugin from vllm.multimodal.base import MultiModalKwargs, MultiModalPlugin
class AudioPlugin(MultiModalPlugin): class AudioPlugin(MultiModalPlugin):
...@@ -9,7 +9,7 @@ class AudioPlugin(MultiModalPlugin): ...@@ -9,7 +9,7 @@ class AudioPlugin(MultiModalPlugin):
return "audio" return "audio"
def _default_input_mapper(self, ctx: InputContext, data: object, def _default_input_mapper(self, ctx: InputContext, data: object,
**mm_processor_kwargs) -> MultiModalInputs: **mm_processor_kwargs) -> MultiModalKwargs:
raise NotImplementedError("There is no default audio input mapper") raise NotImplementedError("There is no default audio input mapper")
def _default_max_multimodal_tokens(self, ctx: InputContext) -> int: def _default_max_multimodal_tokens(self, ctx: InputContext) -> int:
......
...@@ -30,15 +30,15 @@ Uses a list instead of a tensor if the dimensions of each element do not match. ...@@ -30,15 +30,15 @@ Uses a list instead of a tensor if the dimensions of each element do not match.
BatchedTensorInputs: TypeAlias = Dict[str, NestedTensors] BatchedTensorInputs: TypeAlias = Dict[str, NestedTensors]
""" """
A dictionary containing nested tensors which have been batched via A dictionary containing nested tensors which have been batched via
:meth:`MultiModalInputs.batch`. :meth:`MultiModalKwargs.batch`.
""" """
class _MultiModalInputsBase(UserDict[str, NestedTensors]): class _MultiModalKwargsBase(UserDict[str, NestedTensors]):
pass pass
class MultiModalInputs(_MultiModalInputsBase): class MultiModalKwargs(_MultiModalKwargsBase):
""" """
A dictionary that represents the keyword arguments to A dictionary that represents the keyword arguments to
:meth:`~torch.nn.Module.forward`. :meth:`~torch.nn.Module.forward`.
...@@ -58,7 +58,7 @@ class MultiModalInputs(_MultiModalInputsBase): ...@@ -58,7 +58,7 @@ class MultiModalInputs(_MultiModalInputsBase):
if isinstance(nested_tensors, (int, float)): if isinstance(nested_tensors, (int, float)):
return torch.tensor(nested_tensors) return torch.tensor(nested_tensors)
stacked = [MultiModalInputs._try_stack(t) for t in nested_tensors] stacked = [MultiModalKwargs._try_stack(t) for t in nested_tensors]
if not is_list_of(stacked, torch.Tensor, check="all"): if not is_list_of(stacked, torch.Tensor, check="all"):
# Only tensors (not lists) can be stacked. # Only tensors (not lists) can be stacked.
return stacked return stacked
...@@ -71,7 +71,7 @@ class MultiModalInputs(_MultiModalInputsBase): ...@@ -71,7 +71,7 @@ class MultiModalInputs(_MultiModalInputsBase):
return torch.stack(tensors_) return torch.stack(tensors_)
@staticmethod @staticmethod
def batch(inputs_list: List["MultiModalInputs"]) -> BatchedTensorInputs: def batch(inputs_list: List["MultiModalKwargs"]) -> BatchedTensorInputs:
""" """
Batch multiple inputs together into a dictionary. Batch multiple inputs together into a dictionary.
...@@ -95,7 +95,7 @@ class MultiModalInputs(_MultiModalInputsBase): ...@@ -95,7 +95,7 @@ class MultiModalInputs(_MultiModalInputsBase):
item_lists[k].append(v) item_lists[k].append(v)
return { return {
k: MultiModalInputs._try_stack(item_list) k: MultiModalKwargs._try_stack(item_list)
for k, item_list in item_lists.items() for k, item_list in item_lists.items()
} }
...@@ -177,7 +177,7 @@ A dictionary containing placeholder ranges. ...@@ -177,7 +177,7 @@ A dictionary containing placeholder ranges.
""" """
MultiModalInputMapper = Callable[[InputContext, MultiModalData[object]], MultiModalInputMapper = Callable[[InputContext, MultiModalData[object]],
MultiModalInputs] MultiModalKwargs]
""" """
Return a dictionary to be passed as keyword arguments to Return a dictionary to be passed as keyword arguments to
:meth:`~torch.nn.Module.forward`. This is similar in concept to tokenizers :meth:`~torch.nn.Module.forward`. This is similar in concept to tokenizers
...@@ -226,7 +226,7 @@ class MultiModalPlugin(ABC): ...@@ -226,7 +226,7 @@ class MultiModalPlugin(ABC):
ctx: InputContext, ctx: InputContext,
data: MultiModalData[object], data: MultiModalData[object],
**mm_processor_kwargs, **mm_processor_kwargs,
) -> MultiModalInputs: ) -> MultiModalKwargs:
""" """
Return a dictionary to be passed as keyword arguments to Return a dictionary to be passed as keyword arguments to
:meth:`~torch.nn.Module.forward`. This is similar in concept to :meth:`~torch.nn.Module.forward`. This is similar in concept to
...@@ -275,7 +275,7 @@ class MultiModalPlugin(ABC): ...@@ -275,7 +275,7 @@ class MultiModalPlugin(ABC):
model_config: "ModelConfig", model_config: "ModelConfig",
data: MultiModalData[object], data: MultiModalData[object],
mm_processor_kwargs: Dict[str, Any], mm_processor_kwargs: Dict[str, Any],
) -> MultiModalInputs: ) -> MultiModalKwargs:
""" """
Transform the data into a dictionary of model inputs using the Transform the data into a dictionary of model inputs using the
input mapper registered for that model. input mapper registered for that model.
...@@ -585,3 +585,18 @@ class MultiModalPlaceholderMap: ...@@ -585,3 +585,18 @@ class MultiModalPlaceholderMap:
return MultiModalPlaceholderMap.IndexMap(src=src_indices, return MultiModalPlaceholderMap.IndexMap(src=src_indices,
dest=dest_indices) dest=dest_indices)
def __getattr__(name: str):
    """Serve the deprecated ``MultiModalInputs`` alias for this module.

    This is the PEP 562 module ``__getattr__`` hook: the interpreter invokes
    it only after regular attribute lookup on the module has failed.

    Args:
        name: The missing attribute that was requested from this module.

    Returns:
        The ``MultiModalKwargs`` class when the deprecated name
        ``"MultiModalInputs"`` is requested.

    Raises:
        AttributeError: If ``name`` is anything other than the deprecated
            alias.

    Warns:
        DeprecationWarning: Each time the deprecated alias is resolved.
    """
    if name != "MultiModalInputs":
        raise AttributeError(f"module {__name__!r} has no attribute {name!r}")

    # Local import keeps the common (non-deprecated) failure path free of it.
    import warnings

    msg = ("MultiModalInputs has been renamed to MultiModalKwargs. "
           "The original name will take another meaning in an upcoming "
           "version.")
    # Point the warning at the caller's frame, not at this function.
    warnings.warn(msg, DeprecationWarning, stacklevel=2)

    return MultiModalKwargs
...@@ -10,7 +10,7 @@ from vllm.logger import init_logger ...@@ -10,7 +10,7 @@ from vllm.logger import init_logger
from vllm.transformers_utils.processor import get_image_processor from vllm.transformers_utils.processor import get_image_processor
from vllm.utils import is_list_of from vllm.utils import is_list_of
from .base import MultiModalData, MultiModalInputs, MultiModalPlugin from .base import MultiModalData, MultiModalKwargs, MultiModalPlugin
if TYPE_CHECKING: if TYPE_CHECKING:
from vllm.config import ModelConfig from vllm.config import ModelConfig
...@@ -43,12 +43,12 @@ class ImagePlugin(MultiModalPlugin): ...@@ -43,12 +43,12 @@ class ImagePlugin(MultiModalPlugin):
ctx: InputContext, ctx: InputContext,
data: MultiModalData[object], data: MultiModalData[object],
**mm_processor_kwargs, **mm_processor_kwargs,
) -> MultiModalInputs: ) -> MultiModalKwargs:
model_config = ctx.model_config model_config = ctx.model_config
# Processed by input processor # Processed by input processor
if isinstance(data, BatchFeature): if isinstance(data, BatchFeature):
return MultiModalInputs(data.data) return MultiModalKwargs(data.data)
# PIL image # PIL image
if isinstance(data, Image.Image) or is_list_of(data, Image.Image): if isinstance(data, Image.Image) or is_list_of(data, Image.Image):
...@@ -78,11 +78,11 @@ class ImagePlugin(MultiModalPlugin): ...@@ -78,11 +78,11 @@ class ImagePlugin(MultiModalPlugin):
type(image_processor).__name__) type(image_processor).__name__)
raise raise
return MultiModalInputs(batch_data) return MultiModalKwargs(batch_data)
# Image embedding # Image embedding
elif isinstance(data, torch.Tensor) or is_list_of(data, torch.Tensor): elif isinstance(data, torch.Tensor) or is_list_of(data, torch.Tensor):
return MultiModalInputs({"image_embeds": data}) return MultiModalKwargs({"image_embeds": data})
raise TypeError(f"Invalid image type: {type(data)}") raise TypeError(f"Invalid image type: {type(data)}")
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment