[VLM] Remove `image_input_type` from VLM config (#5852)

Signed-off-by: Xiaowei Jiang <xwjiang2010@gmail.com> Co-authored-by: Cyrus Leung <cyrus.tl.leung@gmail.com> Co-authored-by: Roger Wang <ywang@roblox.com>

[VLM] Remove `image_input_type` from VLM config (#5852)
Signed-off-by: Xiaowei Jiang <xwjiang2010@gmail.com> Co-authored-by: Cyrus Leung <cyrus.tl.leung@gmail.com> Co-authored-by: Roger Wang <ywang@roblox.com>
98d6682c · xwjiang2010 · GitHub · 2c37540a · 98d6682c · 98d6682c
Unverified Commit 98d6682c authored Jul 02, 2024 by xwjiang2010 Committed by GitHub Jul 02, 2024
20 changed files
--- a/.buildkite/download-images.sh
+++ b/.buildkite/download-images.sh
@@ -8,10 +8,6 @@ set -o pipefail
 # aws s3 sync s3://air-example-data-2/vllm_opensource_llava/ images/
 mkdir -p images
 cd images
-wget https://air-example-data-2.s3.us-west-2.amazonaws.com/vllm_opensource_llava/stop_sign_pixel_values.pt
-wget https://air-example-data-2.s3.us-west-2.amazonaws.com/vllm_opensource_llava/stop_sign_image_features.pt
-wget https://air-example-data-2.s3.us-west-2.amazonaws.com/vllm_opensource_llava/cherry_blossom_pixel_values.pt
-wget https://air-example-data-2.s3.us-west-2.amazonaws.com/vllm_opensource_llava/cherry_blossom_image_features.pt
 wget https://air-example-data-2.s3.us-west-2.amazonaws.com/vllm_opensource_llava/stop_sign.jpg
 wget https://air-example-data-2.s3.us-west-2.amazonaws.com/vllm_opensource_llava/cherry_blossom.jpg


--- a/docs/requirements-docs.txt
+++ b/docs/requirements-docs.txt
-sphinx == 6.2.1
-sphinx-book-theme == 1.0.1
-sphinx-copybutton == 0.5.2
-myst-parser == 2.0.0
+sphinx==6.2.1
+sphinx-book-theme==1.0.1
+sphinx-copybutton==0.5.2
+myst-parser==2.0.0
 sphinx-argparse
-
-# packages to install to build the documentation
-pydantic
-f https://download.pytorch.org/whl/cpu
-torch
-py-cpuinfo
-transformers
-openai # Required by docs/source/serving/openai_compatible_server.md's vllm.entrypoints.openai.cli_args
--- a/docs/source/dev/multimodal/multimodal_index.rst
+++ b/docs/source/dev/multimodal/multimodal_index.rst
@@ -9,8 +9,10 @@ vLLM provides experimental support for multi-modal models through the :mod:`vllm
 which allows you to pass in multi-modal input alongside text and token prompts.

 By default, vLLM models do not support multi-modal inputs. To enable multi-modal support for a model,
-you must decorate the model class with :meth:`MULTIMODAL_REGISTRY.register_dummy_data <MultiModalRegistry.register_dummy_data>`,
-as well as :meth:`MULTIMODAL_REGISTRY.register_input <MultiModalRegistry.register_input>` for each modality type to support.
+you must decorate the model class with :meth:`InputRegistry.register_dummy_data <vllm.inputs.registry.InputRegistry.register_dummy_data>`,
+as well as :meth:`MULTIMODAL_REGISTRY.register_input_mapper <MultiModalRegistry.register_input_mapper>` for each modality type to support.
+
+# TODO: Add more instructions on how to do that once embeddings is in.

 Module Contents
 +++++++++++++++
@@ -29,7 +31,7 @@ Registry
 Base Classes
 ------------

-.. autoclass:: vllm.multimodal.MultiModalData
+.. autoclass:: vllm.multimodal.MultiModalDataDict
    :members:
    :show-inheritance:


--- a/docs/source/models/vlm.rst
+++ b/docs/source/models/vlm.rst
@@ -36,7 +36,6 @@ To initialize a VLM, the aforementioned arguments must be passed to the ``LLM``

    llm = LLM(
        model="llava-hf/llava-1.5-7b-hf",
-        image_input_type="pixel_values",
        image_token_id=32000,
        image_input_shape="1,3,336,336",
        image_feature_size=576,
@@ -49,7 +48,12 @@ To initialize a VLM, the aforementioned arguments must be passed to the ``LLM``
 To pass an image to the model, note the following in :class:`vllm.inputs.PromptStrictInputs`:

 * ``prompt``: The prompt should have a number of ``<image>`` tokens equal to ``image_feature_size``.
-* ``multi_modal_data``: This should be an instance of :class:`~vllm.multimodal.image.ImagePixelData` or :class:`~vllm.multimodal.image.ImageFeatureData`.
+* ``multi_modal_data``: This is a dictionary that follows the schema defined in :class:`vllm.multimodal.MultiModalDataDict`. 
+
+.. note::
+
+   ``multi_modal_data`` can accept keys and values beyond the builtin ones, as long as a customized plugin is registered through
+    :class:`vllm.multimodal.MULTIMODAL_REGISTRY`.

 .. code-block:: python

@@ -61,7 +65,7 @@ To pass an image to the model, note the following in :class:`vllm.inputs.PromptS

    outputs = llm.generate({
        "prompt": prompt,
-        "multi_modal_data": ImagePixelData(image),
+        "multi_modal_data": {"image": image},
    })

    for o in outputs:
@@ -93,7 +97,6 @@ Below is an example on how to launch the same ``llava-hf/llava-1.5-7b-hf`` with

    python -m vllm.entrypoints.openai.api_server \
        --model llava-hf/llava-1.5-7b-hf \
-        --image-input-type pixel_values \
        --image-token-id 32000 \
        --image-input-shape 1,3,336,336 \
        --image-feature-size 576 \

--- a/examples/llava_example.py
+++ b/examples/llava_example.py
-import argparse
 import os
 import subprocess

-import torch
 from PIL import Image

 from vllm import LLM
-from vllm.multimodal.image import ImageFeatureData, ImagePixelData

 # The assets are located at `s3://air-example-data-2/vllm_opensource_llava/`.
 # You can use `.buildkite/download-images.sh` to download them


-def run_llava_pixel_values(*, disable_image_processor: bool = False):
+def run_llava():
    llm = LLM(
        model="llava-hf/llava-1.5-7b-hf",
-        image_input_type="pixel_values",
        image_token_id=32000,
        image_input_shape="1,3,336,336",
        image_feature_size=576,
-        disable_image_processor=disable_image_processor,
    )

    prompt = "<image>" * 576 + (
        "\nUSER: What is the content of this image?\nASSISTANT:")

-    if disable_image_processor:
-        image = torch.load("images/stop_sign_pixel_values.pt")
-    else:
-        image = Image.open("images/stop_sign.jpg")
+    image = Image.open("images/stop_sign.jpg")

    outputs = llm.generate({
        "prompt": prompt,
-        "multi_modal_data": ImagePixelData(image),
+        "multi_modal_data": {
+            "image": image
+        },
    })

    for o in outputs:
@@ -40,45 +34,11 @@ def run_llava_pixel_values(*, disable_image_processor: bool = False):
        print(generated_text)


-def run_llava_image_features():
-    llm = LLM(
-        model="llava-hf/llava-1.5-7b-hf",
-        image_input_type="image_features",
-        image_token_id=32000,
-        image_input_shape="1,576,1024",
-        image_feature_size=576,
-    )
-
-    prompt = "<image>" * 576 + (
-        "\nUSER: What is the content of this image?\nASSISTANT:")
-
-    image: torch.Tensor = torch.load("images/stop_sign_image_features.pt")
-
-    outputs = llm.generate({
-        "prompt": prompt,
-        "multi_modal_data": ImageFeatureData(image),
-    })
-
-    for o in outputs:
-        generated_text = o.outputs[0].text
-        print(generated_text)
-
-
-def main(args):
-    if args.type == "pixel_values":
-        run_llava_pixel_values()
-    else:
-        run_llava_image_features()
+def main():
+    run_llava()


 if __name__ == "__main__":
-    parser = argparse.ArgumentParser(description="Demo on Llava")
-    parser.add_argument("--type",
-                        type=str,
-                        choices=["pixel_values", "image_features"],
-                        default="pixel_values",
-                        help="image input type")
-    args = parser.parse_args()
    # Download from s3
    s3_bucket_path = "s3://air-example-data-2/vllm_opensource_llava/"
    local_directory = "images"
@@ -95,4 +55,4 @@ if __name__ == "__main__":
        local_directory,
        "--no-sign-request",
    ])
-    main(args)
+    main()
--- a/examples/llava_next_example.py
+++ b/examples/llava_next_example.py
@@ -4,35 +4,44 @@ import requests
 from PIL import Image

 from vllm import LLM, SamplingParams
-from vllm.multimodal.image import ImagePixelData

 # Dynamic image input is currently not supported and therefore
 # a fixed image input shape and its corresponding feature size is required.
 # See https://github.com/vllm-project/vllm/pull/4199 for the complete
 # configuration matrix.

-llm = LLM(
-    model="llava-hf/llava-v1.6-mistral-7b-hf",
-    image_input_type="pixel_values",
-    image_token_id=32000,
-    image_input_shape="1,3,336,336",
-    image_feature_size=1176,
-)
-
-prompt = "[INST] " + "<image>" * 1176 + "\nWhat is shown in this image? [/INST]"
-url = "https://h2o-release.s3.amazonaws.com/h2ogpt/bigben.jpg"
-image = Image.open(BytesIO(requests.get(url).content))
-sampling_params = SamplingParams(temperature=0.8, top_p=0.95, max_tokens=100)
-
-outputs = llm.generate(
-    {
-        "prompt": prompt,
-        "multi_modal_data": ImagePixelData(image),
-    },
-    sampling_params=sampling_params)
-
-generated_text = ""
-for o in outputs:
-    generated_text += o.outputs[0].text
-
-print(f"LLM output:{generated_text}")
+
+def run_llava_next():
+    llm = LLM(
+        model="llava-hf/llava-v1.6-mistral-7b-hf",
+        image_token_id=32000,
+        image_input_shape="1,3,336,336",
+        image_feature_size=1176,
+    )
+
+    prompt = "[INST] " + "<image>" * 1176 + (
+        "\nWhat is shown in this image? [/INST]")
+    url = "https://h2o-release.s3.amazonaws.com/h2ogpt/bigben.jpg"
+    image = Image.open(BytesIO(requests.get(url).content))
+    sampling_params = SamplingParams(temperature=0.8,
+                                     top_p=0.95,
+                                     max_tokens=100)
+
+    outputs = llm.generate(
+        {
+            "prompt": prompt,
+            "multi_modal_data": {
+                "image": image
+            }
+        },
+        sampling_params=sampling_params)
+
+    generated_text = ""
+    for o in outputs:
+        generated_text += o.outputs[0].text
+
+    print(f"LLM output:{generated_text}")
+
+
+if __name__ == "__main__":
+    run_llava_next()
--- a/examples/openai_vision_api_client.py
+++ b/examples/openai_vision_api_client.py
@@ -3,7 +3,6 @@
 Launch the vLLM server with the following command:
 python -m vllm.entrypoints.openai.api_server \
    --model llava-hf/llava-1.5-7b-hf \
-    --image-input-type pixel_values \
    --image-token-id 32000 \
    --image-input-shape 1,3,336,336 \
    --image-feature-size 576 \

--- a/examples/phi3v_example.py
+++ b/examples/phi3v_example.py
@@ -4,7 +4,6 @@ import subprocess
 from PIL import Image

 from vllm import LLM, SamplingParams
-from vllm.multimodal.image import ImagePixelData


 def run_phi3v():
@@ -17,7 +16,6 @@ def run_phi3v():
    llm = LLM(
        model=model_path,
        trust_remote_code=True,
-        image_input_type="pixel_values",
        image_token_id=32044,
        image_input_shape="1,3,1008,1344",
        image_feature_size=1921,
@@ -35,7 +33,9 @@ def run_phi3v():
    outputs = llm.generate(
        {
            "prompt": prompt,
-            "multi_modal_data": ImagePixelData(image),
+            "multi_modal_data": {
+                "image": image
+            },
        },
        sampling_params=sampling_params)
    for o in outputs:

--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -17,19 +17,17 @@ from transformers import (AutoModelForCausalLM, AutoModelForVision2Seq,
                          AutoTokenizer, BatchEncoding)

 from vllm import LLM, SamplingParams
-from vllm.config import TokenizerPoolConfig, VisionLanguageConfig
+from vllm.config import TokenizerPoolConfig
 from vllm.distributed import (destroy_distributed_environment,
                              destroy_model_parallel)
 from vllm.inputs import TextPrompt
 from vllm.logger import init_logger
+from vllm.sequence import SampleLogprobs
+from vllm.utils import cuda_device_count_stateless, is_cpu

 if TYPE_CHECKING:
-    from vllm.multimodal import MultiModalData
-else:
    # it will call torch.cuda.device_count()
-    MultiModalData = None
-from vllm.sequence import SampleLogprobs
-from vllm.utils import cuda_device_count_stateless, is_cpu
+    from vllm.multimodal import MultiModalDataDict

 logger = init_logger(__name__)

@@ -51,14 +49,6 @@ def _read_prompts(filename: str) -> List[str]:
 class ImageAsset:
    name: Literal["stop_sign", "cherry_blossom"]

-    @cached_property
-    def pixel_values(self) -> torch.Tensor:
-        return torch.load(_IMAGE_DIR / f"{self.name}_pixel_values.pt")
-
-    @cached_property
-    def image_features(self) -> torch.Tensor:
-        return torch.load(_IMAGE_DIR / f"{self.name}_image_features.pt")
-
    @cached_property
    def pil_image(self) -> Image.Image:
        return Image.open(_IMAGE_DIR / f"{self.name}.jpg")
@@ -66,20 +56,8 @@ class ImageAsset:
    def for_hf(self) -> Image.Image:
        return self.pil_image

-    def for_vllm(self, vision_config: VisionLanguageConfig) -> MultiModalData:
-        # don't put this import at the top level
-        # it will call torch.cuda.device_count()
-        from vllm.multimodal.image import ImageFeatureData  # noqa: F401
-        from vllm.multimodal.image import ImagePixelData
-        image_input_type = vision_config.image_input_type
-        ImageInputType = VisionLanguageConfig.ImageInputType
-
-        if image_input_type == ImageInputType.IMAGE_FEATURES:
-            return ImageFeatureData(self.image_features)
-        if image_input_type == ImageInputType.PIXEL_VALUES:
-            return ImagePixelData(self.pil_image)
-
-        raise NotImplementedError
+    def for_vllm(self) -> Dict[str, Any]:
+        return {"image": self.pil_image}


 class _ImageAssetPrompts(TypedDict):
@@ -453,7 +431,7 @@ class VllmRunner:
        self,
        prompts: List[str],
        sampling_params: SamplingParams,
-        images: Optional[List[MultiModalData]] = None,
+        images: Optional[List["MultiModalDataDict"]] = None,
    ) -> List[Tuple[List[List[int]], List[str]]]:
        if images is not None:
            assert len(prompts) == len(images)
@@ -502,7 +480,7 @@ class VllmRunner:
        self,
        prompts: List[str],
        max_tokens: int,
-        images: Optional[List[MultiModalData]] = None,
+        images: Optional[List["MultiModalDataDict"]] = None,
    ) -> List[Tuple[List[int], str]]:
        greedy_params = SamplingParams(temperature=0.0, max_tokens=max_tokens)
        outputs = self.generate(prompts, greedy_params, images=images)

--- a/tests/entrypoints/openai/test_vision.py
+++ b/tests/entrypoints/openai/test_vision.py
@@ -39,8 +39,6 @@ def server():
        "--max-model-len",
        "4096",
        "--enforce-eager",
-        "--image-input-type",
-        "pixel_values",
        "--image-token-id",
        "32000",
        "--image-input-shape",

--- a/tests/models/test_llava.py
+++ b/tests/models/test_llava.py
@@ -25,17 +25,11 @@ def iter_llava_configs(model_name: str):
    }

    for (h, w), f in image_hw_to_feature_size.items():
-        for input_type, input_shape in [
-            (VisionLanguageConfig.ImageInputType.PIXEL_VALUES, (1, 3, h, w)),
-            (VisionLanguageConfig.ImageInputType.IMAGE_FEATURES, (1, f, 1024)),
-        ]:
-            yield (model_name,
-                   VisionLanguageConfig(image_input_type=input_type,
-                                        image_feature_size=f,
-                                        image_token_id=32000,
-                                        image_input_shape=input_shape,
-                                        image_processor=model_name,
-                                        image_processor_revision=None))
+        input_shape = (1, 3, h, w)
+        yield (model_name,
+               VisionLanguageConfig(image_feature_size=f,
+                                    image_token_id=32000,
+                                    image_input_shape=input_shape))


 model_and_vl_config = [
@@ -81,8 +75,8 @@ def run_test(

    All the image fixtures for the test is under tests/images.
    For huggingface runner, we provide the PIL images as input.
-    For vllm runner, we provide MultiModalData objects and corresponding
-    vision language config as input.
+    For vllm runner, we provide MultiModalDataDict objects 
+    and corresponding vision language config as input.
    Note, the text input is also adjusted to abide by vllm contract.
    The text output is sanitized to be able to compare with hf.
    """
@@ -104,7 +98,7 @@ def run_test(
        # NOTE: `asset.for_vllm` will call `torch.cuda.device_count()`
        # we must put it inside the vllm_runner context manager
        # i.e. after creating vLLM instance.
-        vllm_images = [asset.for_vllm(vlm_config) for asset in image_assets]
+        vllm_images = [asset.for_vllm() for asset in image_assets]

        vllm_image_prompts = [
            p.replace("<image>", "<image>" * vlm_config.image_feature_size)

--- a/tests/models/test_llava_next.py
+++ b/tests/models/test_llava_next.py
@@ -33,16 +33,13 @@ def iter_llava_next_configs(model_name: str):
    }

    for (h, w), f in image_hw_to_feature_size.items():
-        for input_type, input_shape in [
-            (VisionLanguageConfig.ImageInputType.PIXEL_VALUES, (1, 3, h, w)),
-        ]:
-            yield (model_name,
-                   VisionLanguageConfig(image_input_type=input_type,
-                                        image_feature_size=f,
-                                        image_token_id=32000,
-                                        image_input_shape=input_shape,
-                                        image_processor=model_name,
-                                        image_processor_revision=None))
+        input_shape = (1, 3, h, w)
+        yield (model_name,
+               VisionLanguageConfig(
+                   image_feature_size=f,
+                   image_token_id=32000,
+                   image_input_shape=input_shape,
+               ))


 model_and_vl_config = [
@@ -85,14 +82,14 @@ def test_models(hf_runner, vllm_runner, image_assets, model_and_config,

    All the image fixtures for the test is under tests/images.
    For huggingface runner, we provide the PIL images as input.
-    For vllm runner, we provide MultiModalData objects and corresponding
-    vision language config as input.
+    For vllm runner, we provide MultiModalDataDict objects 
+    and corresponding vision language config as input.
    Note, the text input is also adjusted to abide by vllm contract.
    The text output is sanitized to be able to compare with hf.
    """
    model_id, vlm_config = model_and_config
    hf_images = [asset.for_hf() for asset in image_assets]
-    vllm_images = [asset.for_vllm(vlm_config) for asset in image_assets]
+    vllm_images = [asset.for_vllm() for asset in image_assets]

    with hf_runner(model_id, dtype=dtype, is_vision_model=True) as hf_model:
        hf_outputs = hf_model.generate_greedy(HF_IMAGE_PROMPTS,

--- a/tests/models/test_phi3v.py
+++ b/tests/models/test_phi3v.py
@@ -27,16 +27,11 @@ def iter_phi3v_configs(model_name: str):
    }

    for (h, w), f in image_hw_to_feature_size.items():
-        for input_type, input_shape in [
-            (VisionLanguageConfig.ImageInputType.PIXEL_VALUES, (1, 3, h, w)),
-        ]:
-            yield (model_name,
-                   VisionLanguageConfig(image_input_type=input_type,
-                                        image_feature_size=f,
-                                        image_token_id=32044,
-                                        image_input_shape=input_shape,
-                                        image_processor=model_name,
-                                        image_processor_revision=None))
+        input_shape = (1, 3, h, w)
+        yield (model_name,
+               VisionLanguageConfig(image_feature_size=f,
+                                    image_token_id=32044,
+                                    image_input_shape=input_shape))


 model_and_vl_config = [
@@ -89,8 +84,8 @@ def run_test(

    All the image fixtures for the test is under tests/images.
    For huggingface runner, we provide the PIL images as input.
-    For vllm runner, we provide MultiModalData objects and corresponding
-    vision language config as input.
+    For vllm runner, we provide MultiModalDataDict objects 
+    and corresponding vision language config as input.
    Note, the text input is also adjusted to abide by vllm contract.
    The text output is sanitized to be able to compare with hf.
    """
@@ -113,7 +108,7 @@ def run_test(
        # we must put it inside the vllm_runner context manager
        # i.e. after creating vLLM instance.

-        vllm_images = [asset.for_vllm(vlm_config) for asset in image_assets]
+        vllm_images = [asset.for_vllm() for asset in image_assets]

        vllm_image_prompts = [
            p.replace("<|image_1|>",

--- a/tests/multimodal/test_mapper.py
+++ b/tests/multimodal/test_mapper.py
@@ -2,9 +2,8 @@ import numpy as np
 import pytest
 from transformers import CLIPImageProcessor, LlavaNextImageProcessor

-from vllm.config import ModelConfig, VisionLanguageConfig
+from vllm.config import ModelConfig
 from vllm.multimodal import MULTIMODAL_REGISTRY
-from vllm.multimodal.image import ImagePixelData

 from ..conftest import _STR_DTYPE_TO_TORCH_DTYPE

@@ -12,7 +11,6 @@ from ..conftest import _STR_DTYPE_TO_TORCH_DTYPE
 @pytest.mark.parametrize("dtype", ["half", "float"])
 def test_clip_image_processor(image_assets, dtype):
    MODEL_NAME = "llava-hf/llava-1.5-7b-hf"
-    IMAGE_HEIGHT = IMAGE_WIDTH = 560

    hf_processor = CLIPImageProcessor.from_pretrained(MODEL_NAME)
    assert isinstance(hf_processor, CLIPImageProcessor)
@@ -25,14 +23,6 @@ def test_clip_image_processor(image_assets, dtype):
        seed=0,
        dtype=dtype,
        revision=None,
-        multimodal_config=VisionLanguageConfig(
-            image_input_type=VisionLanguageConfig.ImageInputType.PIXEL_VALUES,
-            image_token_id=32000,
-            image_input_shape=(1, 3, IMAGE_HEIGHT, IMAGE_WIDTH),
-            image_feature_size=576,
-            image_processor=MODEL_NAME,
-            image_processor_revision=None,
-        ),
    )

    for asset in image_assets:
@@ -42,7 +32,7 @@ def test_clip_image_processor(image_assets, dtype):
        ).to(dtype=_STR_DTYPE_TO_TORCH_DTYPE[dtype])
        vllm_result = MULTIMODAL_REGISTRY.map_input(
            model_config,
-            ImagePixelData(asset.pil_image),
+            {"image": asset.pil_image},
        )

        assert hf_result.keys() == vllm_result.keys()
@@ -60,7 +50,6 @@ def test_clip_image_processor(image_assets, dtype):
 @pytest.mark.parametrize("dtype", ["half", "float"])
 def test_llava_next_image_processor(image_assets, dtype):
    MODEL_NAME = "llava-hf/llava-v1.6-34b-hf"
-    IMAGE_HEIGHT = IMAGE_WIDTH = 560

    hf_processor = LlavaNextImageProcessor.from_pretrained(MODEL_NAME)
    assert isinstance(hf_processor, LlavaNextImageProcessor)
@@ -73,14 +62,6 @@ def test_llava_next_image_processor(image_assets, dtype):
        seed=0,
        dtype=dtype,
        revision=None,
-        multimodal_config=VisionLanguageConfig(
-            image_input_type=VisionLanguageConfig.ImageInputType.PIXEL_VALUES,
-            image_token_id=64000,
-            image_input_shape=(1, 3, IMAGE_HEIGHT, IMAGE_WIDTH),
-            image_feature_size=2928,
-            image_processor=MODEL_NAME,
-            image_processor_revision=None,
-        ),
    )

    for asset in image_assets:
@@ -90,7 +71,7 @@ def test_llava_next_image_processor(image_assets, dtype):
        ).to(dtype=_STR_DTYPE_TO_TORCH_DTYPE[dtype])
        vllm_result = MULTIMODAL_REGISTRY.map_input(
            model_config,
-            ImagePixelData(asset.pil_image),
+            {"image": asset.pil_image},
        )

        assert hf_result.keys() == vllm_result.keys()
@@ -107,7 +88,6 @@ def test_llava_next_image_processor(image_assets, dtype):
 @pytest.mark.parametrize("dtype", ["float"])
 def test_image_pixel_types(image_assets, dtype):
    MODEL_NAME = "llava-hf/llava-1.5-7b-hf"
-    IMAGE_HEIGHT = IMAGE_WIDTH = 560

    model_config = ModelConfig(
        model=MODEL_NAME,
@@ -117,23 +97,15 @@ def test_image_pixel_types(image_assets, dtype):
        seed=0,
        dtype=dtype,
        revision=None,
-        multimodal_config=VisionLanguageConfig(
-            image_input_type=VisionLanguageConfig.ImageInputType.PIXEL_VALUES,
-            image_token_id=32000,
-            image_input_shape=(1, 3, IMAGE_HEIGHT, IMAGE_WIDTH),
-            image_feature_size=576,
-            image_processor=MODEL_NAME,
-            image_processor_revision=None,
-        ))
-
+    )
    for asset in image_assets:
        image_result = MULTIMODAL_REGISTRY.map_input(
            model_config,
-            ImagePixelData(asset.pil_image),
+            {"image": asset.pil_image},
        )
        tensor_result = MULTIMODAL_REGISTRY.map_input(
            model_config,
-            ImagePixelData(asset.pixel_values),
+            {"image": asset.pil_image},
        )

        assert image_result.keys() == tensor_result.keys()

--- a/tests/spec_decode/e2e/conftest.py
+++ b/tests/spec_decode/e2e/conftest.py
@@ -11,7 +11,7 @@ from vllm.engine.arg_utils import AsyncEngineArgs
 from vllm.engine.async_llm_engine import AsyncLLMEngine
 from vllm.lora.request import LoRARequest
 from vllm.model_executor.utils import set_random_seed
-from vllm.multimodal import MultiModalData
+from vllm.multimodal import MultiModalDataDict
 from vllm.outputs import RequestOutput
 from vllm.sampling_params import SamplingParams
 from vllm.sequence import Logprob
@@ -91,7 +91,7 @@ class AsyncLLM:
        prompt_token_ids: Optional[List[List[int]]] = None,
        use_tqdm: bool = True,
        lora_request: Optional[LoRARequest] = None,
-        multi_modal_data: Optional[MultiModalData] = None,
+        multi_modal_data: Optional[MultiModalDataDict] = None,
    ) -> List[RequestOutput]:

        if prompts is None:

--- a/tests/tokenization/test_image_processor.py
+++ b/tests/tokenization/test_image_processor.py
-import pytest
-from transformers.image_processing_utils import BaseImageProcessor
-
-from vllm.transformers_utils.image_processor import get_image_processor
-
-IMAGE_PROCESSOR_NAMES = [
-    "llava-hf/llava-1.5-7b-hf",
-    "llava-hf/llava-v1.6-34b-hf",
-]
-
-
-@pytest.mark.parametrize("processor_name", IMAGE_PROCESSOR_NAMES)
-def test_image_processor_revision(processor_name: str):
-    # Assume that "main" branch always exists
-    image_processor = get_image_processor(processor_name, revision="main")
-    assert isinstance(image_processor, BaseImageProcessor)
-
-    # Assume that "never" branch always does not exist
-    with pytest.raises(OSError, match='not a valid git identifier'):
-        get_image_processor(processor_name, revision="never")
--- a/vllm/config.py
+++ b/vllm/config.py
@@ -1250,28 +1250,11 @@ class LoRAConfig:
            raise ValueError("LoRA is not supported with chunked prefill yet.")


+# TODO: To be replaced by MultiModalConfig.
 @dataclass
 class VisionLanguageConfig:
    """Configs the input data format and how models should run for
    vision language models."""
-
-    class ImageInputType(enum.Enum):
-        """Image input type into the vision language model.
-
-        An image roughly goes through the following transformation:
-        Raw image --> pixel values --> image features --> image embeddings.
-
-        The difference between different image input types is where the
-        image encoder (pixel values --> image features) is run.
-        Different image input types also correspond to different tensor shapes.
-
-        For example, for Llava, PIXEL_VALUES: (1, 3, 336, 336).
-        IMAGE_FEATURES: (1, 576, 1024).
-        """
-        PIXEL_VALUES = enum.auto()
-        IMAGE_FEATURES = enum.auto()
-
-    image_input_type: ImageInputType
    # The input id corresponding to image token.
    image_token_id: int
    # Used for running `run_prefill_max_token`.
@@ -1279,19 +1262,6 @@ class VisionLanguageConfig:
    # worst case scenario (biggest supported resolution).
    image_input_shape: tuple
    image_feature_size: int
-    # The image processor to load from HuggingFace
-    image_processor: Optional[str]
-    image_processor_revision: Optional[str]
-
-    @classmethod
-    def get_image_input_enum_type(cls, value: str) -> ImageInputType:
-        """Get the image input type from a string."""
-        try:
-            return cls.ImageInputType[value.upper()]
-        except KeyError as e:
-            raise ValueError(f"{value} is not a valid choice. "
-                             f"Expecting to choose from "
-                             f"{[x.name for x in cls.ImageInputType]}.") from e

    #TODO(ywang96): make this a cached property once we refactor the
    # VisionLanguageConfig class.
@@ -1318,8 +1288,6 @@ class VisionLanguageConfig:
            else:
                result[f.name] = value

-        result["disable_image_processor"] = self.image_processor is None
-
        return result



--- a/vllm/engine/arg_utils.py
+++ b/vllm/engine/arg_utils.py
 import argparse
 import dataclasses
 import json
-import warnings
 from dataclasses import dataclass
 from typing import List, Optional, Tuple, Union

@@ -80,13 +79,9 @@ class EngineArgs:
    preemption_mode: Optional[str] = None

    # Related to Vision-language models such as llava
-    image_input_type: Optional[str] = None
    image_token_id: Optional[int] = None
    image_input_shape: Optional[str] = None
    image_feature_size: Optional[int] = None
-    image_processor: Optional[str] = None
-    image_processor_revision: Optional[str] = None
-    disable_image_processor: bool = False

    scheduler_delay_factor: float = 0.0
    enable_chunked_prefill: bool = False
@@ -114,14 +109,6 @@ class EngineArgs:
    @staticmethod
    def add_cli_args_for_vlm(
            parser: FlexibleArgumentParser) -> FlexibleArgumentParser:
-        parser.add_argument('--image-input-type',
-                            type=nullable_str,
-                            default=None,
-                            choices=[
-                                t.name.lower()
-                                for t in VisionLanguageConfig.ImageInputType
-                            ],
-                            help=('The image input type passed into vLLM.'))
        parser.add_argument('--image-token-id',
                            type=int,
                            default=None,
@@ -137,24 +124,6 @@ class EngineArgs:
            type=int,
            default=None,
            help=('The image feature size along the context dimension.'))
-        parser.add_argument(
-            '--image-processor',
-            type=str,
-            default=EngineArgs.image_processor,
-            help='Name or path of the huggingface image processor to use. '
-            'If unspecified, model name or path will be used.')
-        parser.add_argument(
-            '--image-processor-revision',
-            type=str,
-            default=None,
-            help='Revision of the huggingface image processor version to use. '
-            'It can be a branch name, a tag name, or a commit id. '
-            'If unspecified, will use the default version.')
-        parser.add_argument(
-            '--disable-image-processor',
-            action='store_true',
-            help='Disables the use of image processor, even if one is defined '
-            'for the model on huggingface.')

        return parser

@@ -679,33 +648,16 @@ class EngineArgs:
            raise ValueError(
                "BitsAndBytes load format and QLoRA adapter only support "
                f"'bitsandbytes' quantization, but got {self.quantization}")
-        if self.image_input_type:
-            if (not self.image_token_id or not self.image_input_shape
-                    or not self.image_feature_size):
+        if self.image_token_id is not None:
+            if (not self.image_input_shape or not self.image_feature_size):
                raise ValueError(
-                    'Specify `image_token_id`, `image_input_shape` and '
-                    '`image_feature_size` together with `image_input_type`.')
-
-            if self.image_processor is None:
-                self.image_processor = self.model
-            if self.disable_image_processor:
-                if self.image_processor != self.model:
-                    warnings.warn(
-                        "You've specified an image processor "
-                        f"({self.image_processor}) but also disabled "
-                        "it via `--disable-image-processor`.",
-                        stacklevel=2)
-
-                self.image_processor = None
+                    'Specify `image_input_shape` and '
+                    '`image_feature_size` together with `image_token_id`.')

            vision_language_config = VisionLanguageConfig(
-                image_input_type=VisionLanguageConfig.
-                get_image_input_enum_type(self.image_input_type),
                image_token_id=self.image_token_id,
                image_input_shape=str_to_int_tuple(self.image_input_shape),
                image_feature_size=self.image_feature_size,
-                image_processor=self.image_processor,
-                image_processor_revision=self.image_processor_revision,
            )
        else:
            vision_language_config = None

--- a/vllm/entrypoints/openai/api_server.py
+++ b/vllm/entrypoints/openai/api_server.py
@@ -213,15 +213,6 @@ if __name__ == "__main__":

    engine_args = AsyncEngineArgs.from_cli_args(args)

-    # Enforce pixel values as image input type for vision language models
-    # when serving with API server
-    if engine_args.image_input_type is not None and \
-        engine_args.image_input_type.upper() != "PIXEL_VALUES":
-        raise ValueError(
-            f"Invalid image_input_type: {engine_args.image_input_type}. "
-            "Only --image-input-type 'pixel_values' is supported for serving "
-            "vision language models with the vLLM API server.")
-
    engine = AsyncLLMEngine.from_engine_args(
        engine_args, usage_context=UsageContext.OPENAI_API_SERVER)


--- a/vllm/entrypoints/openai/serving_chat.py
+++ b/vllm/entrypoints/openai/serving_chat.py
@@ -26,7 +26,7 @@ from vllm.inputs import PromptInputs
 from vllm.logger import init_logger
 from vllm.model_executor.guided_decoding import (
    get_guided_decoding_logits_processor)
-from vllm.multimodal.image import ImagePixelData
+from vllm.multimodal import MultiModalDataDict
 from vllm.multimodal.utils import (async_get_and_parse_image,
                                   get_full_image_text_prompt)
 from vllm.outputs import RequestOutput
@@ -47,7 +47,7 @@ class ConversationMessage(TypedDict):
 @dataclass(frozen=True)
 class ChatMessageParseResult:
    messages: List[ConversationMessage]
-    image_futures: List[Awaitable[ImagePixelData]] = field(
+    mm_futures: List[Awaitable[MultiModalDataDict]] = field(
        default_factory=list)


@@ -103,7 +103,7 @@ class OpenAIServingChat(OpenAIServing):
        parts: Iterable[ChatCompletionContentPartParam],
    ) -> ChatMessageParseResult:
        texts: List[str] = []
-        image_futures: List[Awaitable[ImagePixelData]] = []
+        mm_futures: List[Awaitable[MultiModalDataDict]] = []

        vlm_config: Optional[VisionLanguageConfig] = getattr(
            self.engine.engine, "vision_language_config", None)
@@ -113,39 +113,34 @@ class OpenAIServingChat(OpenAIServing):
            part_type = part["type"]
            if part_type == "text":
                text = cast(ChatCompletionContentPartTextParam, part)["text"]
-
                texts.append(text)
            elif part_type == "image_url":
                if vlm_config is None:
                    raise ValueError(
                        "'image_url' input is not supported as the loaded "
                        "model is not multimodal.")
+                assert self.tokenizer is not None
+                image_url = cast(ChatCompletionContentPartImageParam,
+                                 part)["image_url"]

-                elif len(image_futures) == 0:
-                    assert self.tokenizer is not None
-                    image_url = cast(ChatCompletionContentPartImageParam,
-                                     part)["image_url"]
-
-                    if image_url.get("detail", "auto") != "auto":
-                        logger.warning(
-                            "'image_url.detail' is currently not supported and "
-                            "will be ignored.")
-
-                    image_future = async_get_and_parse_image(image_url["url"])
-                    image_futures.append(image_future)
+                if image_url.get("detail", "auto") != "auto":
+                    logger.warning(
+                        "'image_url.detail' is currently not supported and "
+                        "will be ignored.")

-                else:
-                    raise NotImplementedError(
-                        "Multiple 'image_url' input is currently not supported."
-                    )
+                mm_future = async_get_and_parse_image(image_url["url"])
+                mm_futures.append(mm_future)

            else:
                raise NotImplementedError(f"Unknown part type: {part_type}")

        text_prompt = "\n".join(texts)

-        if vlm_config is not None and len(image_futures):
+        if vlm_config is not None and len(mm_futures):

+            assert len(
+                mm_futures
+            ) == 1, "Multiple 'image_url' input is currently not supported."
            (image_token_prompt,
             image_token_str) = vlm_config.get_image_token_text(self.tokenizer)

@@ -171,8 +166,7 @@ class OpenAIServingChat(OpenAIServing):
        else:
            messages = [ConversationMessage(role=role, content=text_prompt)]

-        return ChatMessageParseResult(messages=messages,
-                                      image_futures=image_futures)
+        return ChatMessageParseResult(messages=messages, mm_futures=mm_futures)

    def _parse_chat_message_content(
        self,
@@ -182,10 +176,10 @@ class OpenAIServingChat(OpenAIServing):
        content = message.get("content")

        if content is None:
-            return ChatMessageParseResult(messages=[], image_futures=[])
+            return ChatMessageParseResult(messages=[], mm_futures=[])
        if isinstance(content, str):
            messages = [ConversationMessage(role=role, content=content)]
-            return ChatMessageParseResult(messages=messages, image_futures=[])
+            return ChatMessageParseResult(messages=messages, mm_futures=[])

        return self._parse_chat_message_content_parts(role, content)

@@ -210,13 +204,13 @@ class OpenAIServingChat(OpenAIServing):

        try:
            conversation: List[ConversationMessage] = []
-            image_futures: List[Awaitable[ImagePixelData]] = []
+            mm_futures: List[Awaitable[MultiModalDataDict]] = []

            for msg in request.messages:
                chat_parsed_result = self._parse_chat_message_content(msg)

                conversation.extend(chat_parsed_result.messages)
-                image_futures.extend(chat_parsed_result.image_futures)
+                mm_futures.extend(chat_parsed_result.mm_futures)

            tool_dicts = None if request.tools is None else [
                tool.model_dump() for tool in request.tools
@@ -235,15 +229,14 @@ class OpenAIServingChat(OpenAIServing):
            logger.error("Error in applying chat template from request: %s", e)
            return self.create_error_response(str(e))

-        # Fetch image data
-        image_data: Optional[ImagePixelData] = None
+        mm_data: Optional[MultiModalDataDict] = None
        try:
-            if len(image_futures):
-                # since we support only single image currently
-                assert len(image_futures) == 1
-                image_data = await image_futures[0]
+            if len(mm_futures):
+                # since we support only single mm data currently
+                assert len(mm_futures) == 1
+                mm_data = await mm_futures[0]
        except Exception as e:
-            logger.error("Error in loading image data: %s", e)
+            logger.error("Error in loading multi-modal data: %s", e)
            return self.create_error_response(str(e))

        request_id = f"cmpl-{random_uuid()}"
@@ -274,8 +267,8 @@ class OpenAIServingChat(OpenAIServing):
            "prompt": prompt_text,
            "prompt_token_ids": prompt_ids,
        }
-        if image_data is not None:
-            inputs["multi_modal_data"] = image_data
+        if mm_data is not None:
+            inputs["multi_modal_data"] = mm_data

        is_tracing_enabled = await self.engine.is_tracing_enabled()
        trace_headers = None