[Core] Support image processor (#4197)

7a64d24a · Cyrus Leung · GitHub · dfbe60dc · 7a64d24a · 7a64d24a
Unverified Commit 7a64d24a authored Jun 03, 2024 by Cyrus Leung Committed by GitHub Jun 02, 2024
9 changed files
--- a/vllm/multimodal/__init__.py
+++ b/vllm/multimodal/__init__.py
+from .base import MultiModalData, MultiModalPlugin
+from .registry import MULTIMODAL_REGISTRY, MultiModalRegistry
+__all__ = [
+    "MultiModalData", "MultiModalPlugin", "MULTIMODAL_REGISTRY",
+    "MultiModalRegistry"
+]
--- a/vllm/multimodal/base.py
+++ b/vllm/multimodal/base.py
+from abc import ABC, abstractmethod
+from typing import (TYPE_CHECKING, Callable, Dict, Generic, Optional, Type,
+                    TypeVar)
+from vllm.config import ModelConfig, VisionLanguageConfig
+from vllm.logger import init_logger
+if TYPE_CHECKING:
+    import torch
+    from torch import nn
+logger = init_logger(__name__)
+class MultiModalData:
+    """
+    Common ancestor of every multi-modal data container.
+    Supporting an additional modality takes three steps. First, create a
+    new module inside the ``multimodal`` package. Second, define in it a
+    subclass of :class:`~MultiModalData` together with a matching subclass
+    of :class:`~MultiModalPlugin`. Third, add that plugin to
+    :const:`vllm.multimodal.MULTIMODAL_REGISTRY`.
+    After that, model classes can opt in to the new modality through
+    :meth:`MultiModalRegistry.register_input`.
+    """
+# Type variable for the concrete modality (a subclass of MultiModalData)
+# that a plugin or input processor handles.
+D = TypeVar("D", bound=MultiModalData)
+# Type variable for a model *class* (not an instance); it lets the class
+# decorators below return exactly the class type they were given.
+N = TypeVar("N", bound=Type["nn.Module"])
+# Signature of an input processor:
+# (data, model_config, vlm_config) -> {keyword name: tensor}.
+MultiModalInputProcessor = Callable[[D, ModelConfig, VisionLanguageConfig],
+                                    Dict[str, "torch.Tensor"]]
+"""Return a dictionary to be passed as keyword arguments to
+:meth:`torch.nn.Module.forward`. This is similar in concept to tokenizers
+and processors in HuggingFace Transformers."""
+class MultiModalPlugin(ABC, Generic[D]):
+    """
+    Abstract handler for the data of one modality.
+    Every plugin owns a table that maps a model class to the input processor
+    to use for that model, since one kind of data may have to be prepared
+    differently depending on the model. Choosing the plugin for a given
+    modality happens one level up, in :class:`~MultiModalRegistry`.
+    """
+    def __init__(self) -> None:
+        # Model class -> processor applied to this plugin's data type.
+        self._input_processors: Dict[Type["nn.Module"],
+                                     MultiModalInputProcessor[D]] = {}
+    @classmethod
+    def get_model_cls(cls, model_config: ModelConfig) -> Type["nn.Module"]:
+        """Return the model class that ``model_config`` resolves to."""
+        # Imported lazily to avoid a circular import
+        from vllm.model_executor.model_loader import get_model_architecture
+        architecture = get_model_architecture(model_config)
+        return architecture[0]
+    @abstractmethod
+    def get_data_type(self) -> Type[D]:
+        """
+        Return the subclass of :class:`~MultiModalData` that this plugin
+        is responsible for.
+        """
+        raise NotImplementedError
+    @abstractmethod
+    def _default_input_processor(
+            self, data: D, model_config: ModelConfig,
+            vlm_config: VisionLanguageConfig) -> Dict[str, "torch.Tensor"]:
+        """
+        Fallback processor used when a model registers without supplying
+        its own. Must return the keyword arguments to hand to
+        :meth:`torch.nn.Module.forward`, much like tokenizers and processors
+        do in HuggingFace Transformers.
+        """
+        raise NotImplementedError
+    def register_input_processor(self,
+                                 processor: Optional[
+                                     MultiModalInputProcessor[D]] = None):
+        """
+        Build a class decorator that attaches an input processor to a model
+        class.
+        Data whose modality matches :meth:`get_data_type` is then run through
+        ``processor`` before reaching that model. Passing `None` selects the
+        plugin's default input processor. Registering the same model class
+        twice logs a warning and replaces the earlier entry.
+        """
+        def wrapper(model_cls: N) -> N:
+            table = self._input_processors
+            if model_cls in table:
+                logger.warning(
+                    "Model class %s already has an input processor "
+                    "registered to %s. It is overwritten by the new one.",
+                    model_cls, self)
+            if processor:
+                table[model_cls] = processor
+            else:
+                table[model_cls] = self._default_input_processor
+            return model_cls
+        return wrapper
+    def process_input(
+            self, data: D, model_config: ModelConfig,
+            vlm_config: VisionLanguageConfig) -> Dict[str, "torch.Tensor"]:
+        """
+        Run ``data`` through the input processor registered for the model
+        that ``model_config`` identifies.
+        ``vlm_config`` is passed along for compatibility purposes and may be
+        merged into ``model_config`` in the near future.
+        Raises :class:`KeyError` when the model class has no processor
+        registered in this plugin.
+        """
+        model_cls = self.get_model_cls(model_config)
+        registered = self._input_processors.get(model_cls)
+        if registered is not None:
+            return registered(data, model_config, vlm_config)
+        raise KeyError(f"No input processor in {self} is registered for "
+                       f"model class {model_cls.__name__}.")
--- a/vllm/multimodal/image.py
+++ b/vllm/multimodal/image.py
+from typing import Dict, Tuple, Type, Union
+import torch
+from PIL import Image
+from vllm.config import ModelConfig, VisionLanguageConfig
+from vllm.logger import init_logger
+from vllm.sequence import SequenceData
+from vllm.transformers_utils.image_processor import cached_get_image_processor
+from .base import MultiModalData, MultiModalPlugin
+logger = init_logger(__name__)
+def _get_dummy_seq_data(seq_len: int,
+                        vlm_config: VisionLanguageConfig) -> SequenceData:
+    """Build placeholder prompt tokens: image tokens first, then zeros."""
+    # NOTE: The <image> token is assumed to appear `image_feature_size`
+    # times, followed by the text prompt
+    # TODO: Enable other ways of inserting the image into the prompt
+    num_image_tokens = vlm_config.image_feature_size
+    image_part = [vlm_config.image_token_id] * num_image_tokens
+    text_part = [0] * (seq_len - num_image_tokens)
+    return SequenceData(image_part + text_part)
+def _get_dummy_values(vlm_config: VisionLanguageConfig) -> torch.Tensor:
+    """
+    Return an all-zero tensor of shape ``vlm_config.image_input_shape``.
+    The dtype is ``uint8`` when an image processor is configured and
+    ``float16`` otherwise.
+    """
+    has_processor = vlm_config.image_processor is not None
+    values_dtype = torch.uint8 if has_processor else torch.float16
+    return torch.zeros(vlm_config.image_input_shape, dtype=values_dtype)
+def get_dummy_image_data(
+    seq_len: int,
+    model_config: ModelConfig,
+    vlm_config: VisionLanguageConfig,
+) -> Tuple[SequenceData, MultiModalData]:
+    """Standard dummy data factory for image data (to be used in
+    :meth:`vllm.multimodal.MultiModalRegistry.register_dummy_data`).
+    Returns the dummy token sequence together with dummy image data whose
+    concrete class follows ``vlm_config.image_input_type``. Raises
+    :class:`NotImplementedError` for any other image input type.
+    ``model_config`` is accepted to match the factory signature but is not
+    read here.
+    """
+    seq_data = _get_dummy_seq_data(seq_len, vlm_config)
+    values = _get_dummy_values(vlm_config)
+    config_input_type = vlm_config.image_input_type
+    ImageInputType = VisionLanguageConfig.ImageInputType
+    fake_mm_data: MultiModalData
+    if config_input_type == ImageInputType.PIXEL_VALUES:
+        fake_mm_data = ImagePixelData(values)
+    elif config_input_type == ImageInputType.IMAGE_FEATURES:
+        fake_mm_data = ImageFeatureData(values)
+    else:
+        # Name the offending value so the failure can be diagnosed.
+        msg = f"Unsupported image input type: {config_input_type}"
+        raise NotImplementedError(msg)
+    return seq_data, fake_mm_data
+class ImagePixelData(MultiModalData):
+    """
+    The pixel data of an image. Can be one of:
+    - :class:`PIL.Image.Image`: An image object. Requires that a HuggingFace
+      processor is available to the model.
+    - :class:`torch.Tensor`: The raw pixel data which is passed to the model
+      without additional pre-processing.
+    """
+    def __init__(self, image: Union[Image.Image, torch.Tensor]) -> None:
+        """Store ``image``, forcing a PIL image to load its data first."""
+        if isinstance(image, Image.Image):
+            # So that this class can be created inside the Image context manager
+            image.load()
+        self.image = image
+class ImagePixelPlugin(MultiModalPlugin[ImagePixelData]):
+    """Plugin that turns :class:`ImagePixelData` into model keyword inputs."""
+    def get_data_type(self) -> Type[ImagePixelData]:
+        """Return the data class served by this plugin."""
+        return ImagePixelData
+    def _get_hf_image_processor(self, model_config: ModelConfig,
+                                vlm_config: VisionLanguageConfig):
+        """
+        Return the cached image processor named by
+        ``vlm_config.image_processor``, or ``None`` when ``vlm_config`` is
+        ``None`` or names no processor.
+        """
+        if vlm_config is None or vlm_config.image_processor is None:
+            return None
+        return cached_get_image_processor(
+            vlm_config.image_processor,
+            trust_remote_code=model_config.trust_remote_code,
+            revision=vlm_config.image_processor_revision,
+        )
+    def _default_input_processor(
+            self, data: ImagePixelData, model_config: ModelConfig,
+            vlm_config: VisionLanguageConfig) -> Dict[str, torch.Tensor]:
+        """
+        Convert the wrapped image into keyword arguments for the model.
+        A PIL image is run through the configured image processor and the
+        result is cast to ``model_config.dtype``. A tensor is only cast to
+        ``model_config.dtype`` and returned under ``"pixel_values"``.
+        Raises :class:`RuntimeError` if a PIL image is given but no image
+        processor is configured, and :class:`TypeError` for any other image
+        type. Errors from the image processor are logged and re-raised.
+        """
+        image = data.image
+        image_processor = self._get_hf_image_processor(model_config,
+                                                       vlm_config)
+        if isinstance(image, Image.Image):
+            if image_processor is None:
+                # The trailing space keeps the two literals from fusing
+                # into "availableto".
+                raise RuntimeError("No HuggingFace processor is available "
+                                   "to process the image object")
+            try:
+                return image_processor.preprocess(image, return_tensors="pt") \
+                    .to(model_config.dtype).data
+            except Exception:
+                logger.error("Failed to process image (%s)", image)
+                raise
+        elif isinstance(image, torch.Tensor):
+            pixel_values = image.to(model_config.dtype)
+            return {"pixel_values": pixel_values}
+        raise TypeError(f"Invalid image type: {type(image)}")
+class ImageFeatureData(MultiModalData):
+    """
+    The feature vector of an image, passed directly to the model.
+    This should be the output of the vision tower.
+    """
+    def __init__(self, image_features: torch.Tensor) -> None:
+        """Store ``image_features`` as given; no validation is performed."""
+        self.image_features = image_features
+class ImageFeaturePlugin(MultiModalPlugin[ImageFeatureData]):
+    """Plugin that forwards :class:`ImageFeatureData` to the model."""
+    def get_data_type(self) -> Type[ImageFeatureData]:
+        """Return the data class served by this plugin."""
+        return ImageFeatureData
+    def _default_input_processor(
+            self, data: ImageFeatureData, model_config: ModelConfig,
+            vlm_config: VisionLanguageConfig) -> Dict[str, torch.Tensor]:
+        """
+        Cast the stored features to ``model_config.dtype`` and return them
+        under the ``"image_features"`` key. ``vlm_config`` is not read.
+        """
+        features = data.image_features
+        return {"image_features": features.to(model_config.dtype)}
--- a/vllm/multimodal/registry.py
+++ b/vllm/multimodal/registry.py
+import functools
+from typing import (TYPE_CHECKING, Any, Callable, Dict, Optional, Sequence,
+                    Tuple, Type, TypeVar)
+from vllm.config import ModelConfig, VisionLanguageConfig
+from vllm.logger import init_logger
+from .base import MultiModalData, MultiModalPlugin
+from .image import (ImageFeatureData, ImageFeaturePlugin, ImagePixelData,
+                    ImagePixelPlugin)
+if TYPE_CHECKING:
+    import torch
+    from torch import nn
+    from vllm.sequence import SequenceData
+logger = init_logger(__name__)
+# Type variable for a concrete modality (a subclass of MultiModalData).
+D = TypeVar("D", bound=MultiModalData)
+# Type variable for a model *class*; lets class decorators return the same
+# class type they receive.
+N = TypeVar("N", bound=Type["nn.Module"])
+# Signature of an input processor:
+# (data, model_config, vlm_config) -> {keyword name: tensor}.
+MultiModalInputProcessor = Callable[[D, ModelConfig, VisionLanguageConfig],
+                                    Dict[str, "torch.Tensor"]]
+# Signature of a dummy data factory:
+# (seq_len, model_config, vlm_config) -> (sequence data, multi-modal data).
+MultiModalDummyFactory = Callable[[int, ModelConfig, VisionLanguageConfig],
+                                  Tuple["SequenceData", MultiModalData]]
+class MultiModalRegistry:
+    """
+    Entry point used by model runners: picks the plugin for a piece of
+    multi-modal data from its type, and lets that plugin pick the processing
+    for the target model.
+    """
+    DEFAULT_PLUGINS = (ImageFeaturePlugin(), ImagePixelPlugin())
+    def __init__(self,
+                 *,
+                 plugins: Sequence[MultiModalPlugin[Any]] = DEFAULT_PLUGINS
+                 ) -> None:
+        # Data type -> plugin; a later plugin for the same type wins.
+        self._plugins_by_data_type = {}
+        for plugin in plugins:
+            self._plugins_by_data_type[plugin.get_data_type()] = plugin
+        # Model class -> factory producing dummy inputs for profiling.
+        self._dummy_factories_by_model_type: Dict[Type["nn.Module"],
+                                                  MultiModalDummyFactory] = {}
+    def register_plugin(self, plugin: MultiModalPlugin[Any]) -> None:
+        """
+        Add ``plugin`` under the data type it reports. An existing plugin
+        for that data type is replaced after a warning is logged.
+        """
+        data_type = plugin.get_data_type()
+        already_present = data_type in self._plugins_by_data_type
+        if already_present:
+            logger.warning(
+                "A plugin is already registered for data type %s, "
+                "and will be overwritten by the new plugin %s.", data_type,
+                plugin)
+        self._plugins_by_data_type[data_type] = plugin
+    def _get_plugin_for_data_type(self, data_type: Type[MultiModalData]):
+        """
+        Return the plugin registered for ``data_type`` or, failing that, for
+        the nearest class in its MRO. Raises :class:`NotImplementedError` if
+        none matches.
+        """
+        for candidate in data_type.mro():
+            found = self._plugins_by_data_type.get(candidate)
+            if found is None:
+                continue
+            return found
+        raise NotImplementedError(
+            f"Unknown multi-modal data type: {data_type}")
+    def register_dummy_data(self, factory: MultiModalDummyFactory):
+        """
+        Build a class decorator that attaches a dummy data factory to a
+        model class.
+        The factory is called during memory profiling to produce the dummy
+        data fed to the model. The modality and shape of that data should be
+        an upper bound of what the model would receive at inference time.
+        Registering the same model class twice logs a warning and replaces
+        the earlier factory.
+        """
+        def wrapper(model_cls: N) -> N:
+            factories = self._dummy_factories_by_model_type
+            if model_cls in factories:
+                logger.warning(
+                    "Model class %s already has dummy data "
+                    "registered to %s. It is overwritten by the new one.",
+                    model_cls, self)
+            factories[model_cls] = factory
+            return model_cls
+        return wrapper
+    def dummy_data_for_profiling(self, seq_len: int, model_config: ModelConfig,
+                                 vlm_config: VisionLanguageConfig):
+        """
+        Create dummy data for memory profiling using the factory registered
+        for the model class of ``model_config``. Raises
+        :class:`NotImplementedError` if no factory is registered.
+        """
+        model_cls = MultiModalPlugin.get_model_cls(model_config)
+        dummy_factory = self._dummy_factories_by_model_type.get(model_cls)
+        if dummy_factory is not None:
+            return dummy_factory(seq_len, model_config, vlm_config)
+        raise NotImplementedError(
+            f"No dummy data defined for model class: {model_cls}")
+    def register_input(
+            self,
+            data_type: Type[D],
+            processor: Optional[MultiModalInputProcessor[D]] = None):
+        """
+        Build a class decorator that attaches an input processor for the
+        given modality to a model class.
+        See :meth:`MultiModalPlugin.register_input_processor` for more details.
+        """
+        plugin = self._get_plugin_for_data_type(data_type)
+        return plugin.register_input_processor(processor)
+    def register_image_pixel_input(
+            self,
+            processor: Optional[
+                MultiModalInputProcessor[ImagePixelData]] = None):
+        """
+        Shorthand for :meth:`register_input` with image pixel data.
+        See :meth:`MultiModalPlugin.register_input_processor` for more details.
+        """
+        return self.register_input(ImagePixelData, processor)
+    def register_image_feature_input(
+        self,
+        processor: Optional[
+            MultiModalInputProcessor[ImageFeatureData]] = None):
+        """
+        Shorthand for :meth:`register_input` with image feature data.
+        See :meth:`MultiModalPlugin.register_input_processor` for more details.
+        """
+        return self.register_input(ImageFeatureData, processor)
+    def process_input(self, data: MultiModalData, model_config: ModelConfig,
+                      vlm_config: VisionLanguageConfig):
+        """
+        Process a :class:`~MultiModalData` instance with the plugin chosen
+        from its type.
+        See :meth:`MultiModalPlugin.process_input` for more details.
+        """
+        plugin = self._get_plugin_for_data_type(type(data))
+        return plugin.process_input(data, model_config, vlm_config)
+    def create_input_processor(self, model_config: ModelConfig,
+                               vlm_config: VisionLanguageConfig):
+        """
+        Return :meth:`process_input` with ``model_config`` and ``vlm_config``
+        already bound, so callers only pass the data.
+        """
+        bound_kwargs = {
+            "model_config": model_config,
+            "vlm_config": vlm_config,
+        }
+        return functools.partial(self.process_input, **bound_kwargs)
+# Module-level instance built with the default plugins (see
+# MultiModalRegistry.DEFAULT_PLUGINS).
+MULTIMODAL_REGISTRY = MultiModalRegistry()
+"""The global :class:`~MultiModalRegistry` which is used by model runners."""
--- a/vllm/sequence.py
+++ b/vllm/sequence.py
@@ -5,6 +5,8 @@ from abc import ABC, abstractmethod
 from dataclasses import dataclass, field
 from typing import TYPE_CHECKING, Dict, List, Optional, Tuple, Union
+import torch
 from vllm.block import LogicalTokenBlock
 from vllm.inputs import LLMInputs
 from vllm.lora.request import LoRARequest
@@ -12,8 +14,7 @@ from vllm.pooling_params import PoolingParams
 from vllm.sampling_params import SamplingParams
 if TYPE_CHECKING:
-    import torch
+    from vllm.multimodal import MultiModalData
    from vllm.spec_decode.metrics import SpecDecodeWorkerMetrics
@@ -398,25 +399,6 @@ class SequenceGroupState:
    generator: Optional = None  # type: ignore
-class MultiModalData:
-    """Multi modal request.
-    Args:
-        type: The data type.
-        data: The actual data.
-        The required shape and semantic meaning of it depends on the vision
-        language config of the hosted model.
-        See `VisionLanguageConfig` in `config.py`.
-    """
-    class Type(enum.Enum):
-        IMAGE = enum.auto()
-    def __init__(self, type: Type, data: "torch.Tensor"):
-        self.type = type
-        self.data = data
 class SequenceGroup:
    """A group of sequences that are generated from the same prompt.
@@ -473,7 +455,7 @@ class SequenceGroup:
        return next(iter(self.seqs_dict.values())).prompt_token_ids
    @property
-    def multi_modal_data(self) -> Optional[MultiModalData]:
+    def multi_modal_data(self) -> Optional["MultiModalData"]:
        # All sequences in the group should have the same multi-modal data.
        # We use the multi-modal data of an arbitrary sequence.
        return next(iter(self.seqs_dict.values())).multi_modal_data
@@ -655,7 +637,7 @@ class SequenceGroupMetadata:
        lora_request: Optional[LoRARequest] = None,
        computed_block_nums: Optional[List[int]] = None,
        state: Optional[SequenceGroupState] = None,
-        multi_modal_data: Optional[MultiModalData] = None,
+        multi_modal_data: Optional["MultiModalData"] = None,
        encoder_seq_data: Optional[SequenceData] = None,
        cross_block_table: Optional[List[int]] = None,
    ) -> None:
@@ -798,13 +780,13 @@ class SamplerOutput:
    outputs: List[CompletionSequenceGroupOutput]
    # On-device tensor containing probabilities of each token.
-    sampled_token_probs: Optional["torch.Tensor"] = None
+    sampled_token_probs: Optional[torch.Tensor] = None
    # On-device tensor containing the logprobs of each token.
    logprobs: Optional["torch.Tensor"] = None
    # On-device tensor containing the sampled token ids.
-    sampled_token_ids: Optional["torch.Tensor"] = None
+    sampled_token_ids: Optional[torch.Tensor] = None
    # Spec decode metrics populated by workers.
    spec_decode_worker_metrics: Optional["SpecDecodeWorkerMetrics"] = None

--- a/vllm/transformers_utils/image_processor.py
+++ b/vllm/transformers_utils/image_processor.py
+from functools import lru_cache
+from typing import Optional
+from transformers import AutoImageProcessor
+from transformers.image_processing_utils import BaseImageProcessor
+from vllm.logger import init_logger
+logger = init_logger(__name__)
+def get_image_processor(
+    processor_name: str,
+    *args,
+    trust_remote_code: bool = False,
+    revision: Optional[str] = None,
+    **kwargs,
+) -> BaseImageProcessor:
+    """
+    Gets an image processor for the given model name via HuggingFace.
+    Extra positional and keyword arguments are forwarded to
+    ``AutoImageProcessor.from_pretrained``.
+    Raises :class:`RuntimeError` (chained to the original error) when loading
+    fails with :class:`ValueError` and ``trust_remote_code`` is false;
+    otherwise the :class:`ValueError` propagates unchanged.
+    """
+    try:
+        processor: BaseImageProcessor = AutoImageProcessor.from_pretrained(
+            processor_name,
+            *args,
+            trust_remote_code=trust_remote_code,
+            revision=revision,
+            **kwargs)
+    except ValueError as e:
+        # If the error pertains to the processor class not existing or not
+        # currently being imported, suggest using the --trust-remote-code flag.
+        # Unlike AutoTokenizer, AutoImageProcessor does not separate such errors
+        if trust_remote_code:
+            # Remote code was already allowed, so the hint does not apply.
+            # A bare `raise` re-raises the active exception as-is.
+            raise
+        err_msg = (
+            "Failed to load the image processor. If the image processor is "
+            "a custom processor not yet available in the HuggingFace "
+            "transformers library, consider setting "
+            "`trust_remote_code=True` in LLM or using the "
+            "`--trust-remote-code` flag in the CLI.")
+        raise RuntimeError(err_msg) from e
+    return processor
+# Memoized variant. lru_cache keys on the call arguments, so every
+# positional and keyword argument passed to it must be hashable.
+cached_get_image_processor = lru_cache(get_image_processor)
--- a/vllm/worker/cpu_model_runner.py
+++ b/vllm/worker/cpu_model_runner.py
-from typing import List, Optional, Tuple
+from collections import defaultdict
+from typing import Dict, List, Optional, Tuple
 import torch
 from torch import nn
@@ -11,6 +12,7 @@ from vllm.distributed import broadcast_tensor_dict
 from vllm.logger import init_logger
 from vllm.model_executor import SamplingMetadata
 from vllm.model_executor.model_loader import get_model
+from vllm.multimodal import MULTIMODAL_REGISTRY
 from vllm.sequence import SamplerOutput, SequenceGroupMetadata
 from vllm.utils import make_tensor_with_pad
@@ -63,6 +65,16 @@ class CPUModelRunner:
            self.block_size,
        )
+        # Create processor for multi-modal data
+        if self.vision_language_config is not None:
+            self.multi_modal_input_processor = MULTIMODAL_REGISTRY \
+                .create_input_processor(
+                    self.model_config,
+                    self.vision_language_config,
+                )
+        else:
+            self.multi_modal_input_processor = None
        # Lazy initialization.
        self.model: nn.Module  # Set after init_Model
@@ -80,14 +92,15 @@ class CPUModelRunner:
    def _prepare_prompt(
        self,
        seq_group_metadata_list: List[SequenceGroupMetadata],
-    ) -> Tuple[torch.Tensor, torch.Tensor, AttentionMetadata, List[int],
+    ) -> Tuple[torch.Tensor, torch.Tensor, AttentionMetadata, List[int], Dict[
-               Optional[torch.Tensor]]:
+            str, torch.Tensor]]:
        assert len(seq_group_metadata_list) > 0
        input_tokens: List[int] = []
        input_positions: List[int] = []
        slot_mapping: List[int] = []
        seq_lens: List[int] = []
-        multi_modal_input_list: List[torch.Tensor] = []
+        multi_modal_kwargs_list: Dict[str,
+                                      List[torch.Tensor]] = defaultdict(list)
        for seq_group_metadata in seq_group_metadata_list:
            assert seq_group_metadata.is_prompt
@@ -108,9 +121,17 @@ class CPUModelRunner:
            # is always the first token in the sequence.
            input_positions.extend(list(range(computed_len, seq_len)))
-            if seq_group_metadata.multi_modal_data:
+            mm_data = seq_group_metadata.multi_modal_data
-                multi_modal_input_list.append(
+            if mm_data is not None:
-                    seq_group_metadata.multi_modal_data.data)
+                # Process multi-modal data
+                if self.multi_modal_input_processor is None:
+                    raise ValueError(
+                        "Multi-modal inputs are only supported by "
+                        "vision language models.")
+                mm_kwargs = self.multi_modal_input_processor(mm_data)
+                for k, v in mm_kwargs.items():
+                    multi_modal_kwargs_list[k].append(v)
            # Compute the slot mapping.
            block_table = seq_group_metadata.block_tables[seq_id]
@@ -134,14 +155,10 @@ class CPUModelRunner:
                slot = block_number * self.block_size + block_offset
                slot_mapping.append(slot)
-        if multi_modal_input_list:
+        multi_modal_kwargs = {
-            assert self.vision_language_config, (
+            k: torch.cat(v, dim=0).to(self.device)
-                "Multi-modal inputs are only supported by "
+            for k, v in multi_modal_kwargs_list.items()
-                "vision language models.")
+        }
-            multi_modal_input = torch.cat(multi_modal_input_list,
-                                          dim=0).to(self.device)
-        else:
-            multi_modal_input = None
        num_prompt_tokens = len(input_tokens)
@@ -167,7 +184,7 @@ class CPUModelRunner:
            slot_mapping=slot_mapping,
        )
        return (input_tokens, input_positions, attn_metadata, seq_lens,
-                multi_modal_input)
+                multi_modal_kwargs)
    def _prepare_decode(
        self,
@@ -257,8 +274,8 @@ class CPUModelRunner:
        self,
        seq_group_metadata_list: List[SequenceGroupMetadata],
    ) -> Tuple[torch.Tensor, torch.Tensor, AttentionMetadata, SamplingMetadata,
-               Optional[torch.Tensor]]:
+               Optional[Dict[str, torch.Tensor]]]:
-        multi_modal_input = None
+        multi_modal_kwargs = None
        if self.is_driver_worker:
            # NOTE: We assume that all sequences in the group are all prompts or
            # all decodes.
@@ -266,7 +283,7 @@ class CPUModelRunner:
            # Prepare input tensors.
            if is_prompt:
                (input_tokens, input_positions, attn_metadata, seq_lens,
-                 multi_modal_input
+                 multi_modal_kwargs
                 ) = self._prepare_prompt(seq_group_metadata_list)
            else:
                (input_tokens, input_positions,
@@ -307,7 +324,7 @@ class CPUModelRunner:
            )
        return (input_tokens, input_positions, attn_metadata,
-                sampling_metadata, multi_modal_input)
+                sampling_metadata, multi_modal_kwargs)
    @torch.inference_mode()
    def execute_model(

--- a/vllm/worker/embedding_model_runner.py
+++ b/vllm/worker/embedding_model_runner.py
@@ -90,7 +90,7 @@ class EmbeddingModelRunner(ModelRunner):
        self,
        seq_group_metadata_list: Optional[List[SequenceGroupMetadata]],
    ) -> Tuple[torch.Tensor, torch.Tensor, AttentionMetadata, PoolingMetadata,
-               Set[LoRARequest], LoRAMapping, torch.Tensor]:
+               Set[LoRARequest], LoRAMapping, Dict[str, torch.Tensor]]:
        if self.is_driver_worker:
            assert seq_group_metadata_list is not None
            # Prepare input tensors.
@@ -102,7 +102,7 @@ class EmbeddingModelRunner(ModelRunner):
                _,
                lora_mapping,
                lora_requests,
-                multi_modal_input,
+                multi_modal_kwargs,
                slot_mapping,
                num_prefill_tokens,
                num_decode_tokens,
@@ -117,7 +117,7 @@ class EmbeddingModelRunner(ModelRunner):
                "input_positions": input_positions,
                "lora_requests": lora_requests,
                "lora_mapping": lora_mapping,
-                "multi_modal_input": multi_modal_input,
+                "multi_modal_kwargs": multi_modal_kwargs,
                "num_prefill_tokens": num_prefill_tokens,
                "num_decode_tokens": num_decode_tokens,
                "slot_mapping": slot_mapping,
@@ -132,7 +132,7 @@ class EmbeddingModelRunner(ModelRunner):
            input_positions = metadata_dict.pop("input_positions")
            lora_mapping = metadata_dict.pop("lora_mapping")
            lora_requests = metadata_dict.pop("lora_requests")
-            multi_modal_input = metadata_dict.pop("multi_modal_input")
+            multi_modal_kwargs = metadata_dict.pop("multi_modal_kwargs")
            if metadata_dict:
                attn_metadata = self.attn_backend.make_metadata(
                    **metadata_dict)
@@ -143,7 +143,7 @@ class EmbeddingModelRunner(ModelRunner):
                                               prompt_lens=None)
        return (input_tokens, input_positions, attn_metadata, pooling_metadata,
-                lora_requests, lora_mapping, multi_modal_input)
+                lora_requests, lora_mapping, multi_modal_kwargs)
    def _prepare_pooling(
        self,

--- a/vllm/worker/model_runner.py
+++ b/vllm/worker/model_runner.py
 import time
 import warnings
+from collections import defaultdict
 from typing import Dict, List, NamedTuple, Optional, Set, Tuple, Union
 import numpy as np
@@ -18,9 +19,9 @@ from vllm.lora.request import LoRARequest
 from vllm.lora.worker_manager import LRUCacheWorkerLoRAManager
 from vllm.model_executor import SamplingMetadata
 from vllm.model_executor.model_loader import get_model
+from vllm.multimodal import MULTIMODAL_REGISTRY
 from vllm.sampling_params import SamplingParams
-from vllm.sequence import (MultiModalData, SamplerOutput, SequenceData,
+from vllm.sequence import SamplerOutput, SequenceData, SequenceGroupMetadata
-                           SequenceGroupMetadata)
 from vllm.utils import (CudaMemoryProfiler, get_kv_cache_torch_dtype, is_hip,
                        is_pin_memory_available, make_tensor_with_pad)
@@ -44,7 +45,7 @@ class ModelInput(NamedTuple):
    query_lens: List[int]
    lora_mapping: Optional[LoRAMapping]
    lora_requests: Set[LoRARequest]
-    multi_modal_input: Optional[torch.Tensor]
+    multi_modal_kwargs: Dict[str, torch.Tensor]
    slot_mapping: torch.Tensor
    num_prefill_tokens: int
    num_decode_tokens: int
@@ -60,7 +61,7 @@ class ModelInput(NamedTuple):
            query_lens=[],
            lora_mapping=None,
            lora_requests=set(),
-            multi_modal_input=None,
+            multi_modal_kwargs={},
            slot_mapping=torch.empty(0, device=device),
            num_prefill_tokens=0,
            num_decode_tokens=0,
@@ -122,6 +123,16 @@ class ModelRunner:
            self.block_size,
        )
+        # Create processor for multi-modal data
+        if self.vision_language_config is not None:
+            self.multi_modal_input_processor = MULTIMODAL_REGISTRY \
+                .create_input_processor(
+                    self.model_config,
+                    self.vision_language_config,
+                )
+        else:
+            self.multi_modal_input_processor = None
        # Lazy initialization
        self.model: nn.Module  # Set after load_model
        # Set if the backend is flashinfer.
@@ -242,7 +253,8 @@ class ModelRunner:
        context_lens: List[int] = []
        query_lens: List[int] = []
        block_tables: List[List[int]] = []
-        multi_modal_input_list: List[torch.Tensor] = []
+        multi_modal_kwargs_list: Dict[str,
+                                      List[torch.Tensor]] = defaultdict(list)
        decode_only = True
        num_prefills = 0
        num_prefill_tokens = 0
@@ -417,9 +429,17 @@ class ModelRunner:
                     and seq_group_metadata.sampling_params.prompt_logprobs
                     else 1))
-                if seq_group_metadata.multi_modal_data:
+                mm_data = seq_group_metadata.multi_modal_data
-                    multi_modal_input_list.append(
+                if mm_data is not None:
-                        seq_group_metadata.multi_modal_data.data)
+                    # Process multi-modal data
+                    if self.multi_modal_input_processor is None:
+                        raise ValueError(
+                            "Multi-modal inputs are only supported by "
+                            "vision language models.")
+                    mm_kwargs = self.multi_modal_input_processor(mm_data)
+                    for k, v in mm_kwargs.items():
+                        multi_modal_kwargs_list[k].append(v)
                if _is_block_tables_empty(seq_group_metadata.block_tables):
                    # During memory profiling, the block tables are not
@@ -508,16 +528,6 @@ class ModelRunner:
        context_lens_tensor = torch.tensor(context_lens,
                                           dtype=torch.int,
                                           device=self.device)
-        if multi_modal_input_list:
-            assert self.vision_language_config, (
-                "Multi-modal inputs are only supported by "
-                "vision language models.")
-            multi_modal_input = torch.cat(multi_modal_input_list,
-                                          dim=0).to(self.device)
-        else:
-            multi_modal_input = None
        query_lens_tensor = torch.tensor(query_lens,
                                         dtype=torch.long,
                                         device=self.device)
@@ -614,6 +624,11 @@ class ModelRunner:
        else:
            lora_mapping = None
+        multi_modal_kwargs = {
+            k: torch.cat(v, dim=0).to(self.device)
+            for k, v in multi_modal_kwargs_list.items()
+        }
        return ModelInput(
            input_tokens=input_tokens_tensor,
            input_positions=input_positions_tensor,
@@ -622,7 +637,7 @@ class ModelRunner:
            query_lens=query_lens,
            lora_mapping=lora_mapping,
            lora_requests=lora_requests,
-            multi_modal_input=multi_modal_input,
+            multi_modal_kwargs=multi_modal_kwargs,
            slot_mapping=slot_mapping_tensor,
            num_prefill_tokens=num_prefill_tokens,
            num_decode_tokens=num_decode_tokens,
@@ -633,7 +648,7 @@ class ModelRunner:
        self,
        seq_group_metadata_list: Optional[List[SequenceGroupMetadata]],
    ) -> Tuple[torch.Tensor, torch.Tensor, AttentionMetadata, SamplingMetadata,
-               Set[LoRARequest], LoRAMapping, torch.Tensor]:
+               Set[LoRARequest], LoRAMapping, Dict[str, torch.Tensor]]:
        if self.is_driver_worker:
            assert seq_group_metadata_list is not None
            # Prepare input tensors.
@@ -645,7 +660,7 @@ class ModelRunner:
                query_lens,
                lora_mapping,
                lora_requests,
-                multi_modal_input,
+                multi_modal_kwargs,
                slot_mapping,
                num_prefill_tokens,
                num_decode_tokens,
@@ -662,7 +677,7 @@ class ModelRunner:
                sampling_metadata.selected_token_indices,
                "lora_requests": lora_requests,
                "lora_mapping": lora_mapping,
-                "multi_modal_input": multi_modal_input,
+                "multi_modal_kwargs": multi_modal_kwargs,
                "num_prefill_tokens": num_prefill_tokens,
                "num_decode_tokens": num_decode_tokens,
                "slot_mapping": slot_mapping,
@@ -679,7 +694,7 @@ class ModelRunner:
                "selected_token_indices")
            lora_mapping = metadata_dict.pop("lora_mapping")
            lora_requests = metadata_dict.pop("lora_requests")
-            multi_modal_input = metadata_dict.pop("multi_modal_input")
+            multi_modal_kwargs = metadata_dict.pop("multi_modal_kwargs")
            if metadata_dict:
                attn_metadata = self.attn_backend.make_metadata(
                    **metadata_dict)
@@ -694,7 +709,7 @@ class ModelRunner:
        return (input_tokens, input_positions, attn_metadata,
                sampling_metadata, lora_requests, lora_mapping,
-                multi_modal_input)
+                multi_modal_kwargs)
    @torch.inference_mode()
    def execute_model(
@@ -703,7 +718,7 @@ class ModelRunner:
        kv_caches: List[torch.Tensor],
    ) -> Optional[SamplerOutput]:
        (input_tokens, input_positions, attn_metadata, sampling_metadata,
-         lora_requests, lora_mapping, multi_modal_input
+         lora_requests, lora_mapping, multi_modal_kwargs
         ) = self.prepare_input_tensors(seq_group_metadata_list)
        if self.lora_config:
@@ -717,15 +732,14 @@ class ModelRunner:
            model_executable = self.graph_runners[graph_batch_size]
        else:
            model_executable = self.model
-        execute_model_kwargs = {
-            "input_ids": input_tokens,
+        hidden_states = model_executable(
-            "positions": input_positions,
+            input_ids=input_tokens,
-            "kv_caches": kv_caches,
+            positions=input_positions,
-            "attn_metadata": attn_metadata,
+            kv_caches=kv_caches,
-        }
+            attn_metadata=attn_metadata,
-        if self.vision_language_config:
+            **multi_modal_kwargs,
-            execute_model_kwargs.update({"image_input": multi_modal_input})
+        )
-        hidden_states = model_executable(**execute_model_kwargs)
        # Compute the logits.
        logits = self.model.compute_logits(hidden_states, sampling_metadata)
@@ -781,16 +795,24 @@ class ModelRunner:
        # To exercise the worst scenario for GPU memory consumption,
        # the number of seqs (batch_size) is chosen to maximize the number
        # of images processed.
-        if self.vision_language_config:
+        model_config = self.model_config
+        vlm_config = self.vision_language_config
+        if vlm_config:
            max_num_seqs = min(
                max_num_seqs,
-                int(max_num_batched_tokens /
+                int(max_num_batched_tokens / vlm_config.image_feature_size))
-                    self.vision_language_config.image_feature_size))
        for group_id in range(max_num_seqs):
            seq_len = (max_num_batched_tokens // max_num_seqs +
                       (group_id < max_num_batched_tokens % max_num_seqs))
-            seq_data, fake_multi_modal_input = _prepare_fake_inputs(
-                seq_len, self.vision_language_config)
+            if vlm_config is None:
+                seq_data = SequenceData([0] * seq_len)
+                dummy_multi_modal_data = None
+            else:
+                seq_data, dummy_multi_modal_data = MULTIMODAL_REGISTRY \
+                    .dummy_data_for_profiling(seq_len, model_config, vlm_config)
            seq = SequenceGroupMetadata(
                request_id=str(group_id),
                is_prompt=True,
@@ -799,7 +821,7 @@ class ModelRunner:
                block_tables=None,
                lora_request=dummy_lora_requests_per_seq[group_id]
                if dummy_lora_requests_per_seq else None,
-                multi_modal_data=fake_multi_modal_input,
+                multi_modal_data=dummy_multi_modal_data,
            )
            seqs.append(seq)
@@ -1034,24 +1056,6 @@ def _get_graph_batch_size(batch_size: int) -> int:
                _BATCH_SIZE_ALIGNMENT * _BATCH_SIZE_ALIGNMENT)
-def _prepare_fake_inputs(
-        seq_len: int, vision_language_config: Optional[VisionLanguageConfig]):
-    """Prepare fake inputs for profile run."""
-    if vision_language_config:
-        prompt_tokens = [
-            vision_language_config.image_token_id
-        ] * vision_language_config.image_feature_size + [0] * (
-            seq_len - vision_language_config.image_feature_size)
-        fake_image_input = MultiModalData(
-            type=MultiModalData.Type.IMAGE,
-            data=torch.zeros(vision_language_config.image_input_shape,
-                             dtype=torch.float16))
-    else:
-        prompt_tokens = [0] * seq_len
-        fake_image_input = None
-    return SequenceData(prompt_tokens), fake_image_input
 def _is_block_tables_empty(block_tables: Union[None, Dict]):
    """
    Check if block_tables is None or a dictionary with all None values.