[Core] Support image processor (#4197)

7a64d24a · Cyrus Leung · GitHub · dfbe60dc · 7a64d24a · 7a64d24a
Unverified Commit 7a64d24a authored Jun 03, 2024 by Cyrus Leung Committed by GitHub Jun 02, 2024
20 changed files
--- a/.github/workflows/mypy.yaml
+++ b/.github/workflows/mypy.yaml
@@ -37,6 +37,7 @@ jobs:
        mypy vllm/distributed --config-file pyproject.toml
        mypy vllm/entrypoints --config-file pyproject.toml
        mypy vllm/executor --config-file pyproject.toml
+        mypy vllm/multimodal --config-file pyproject.toml
        mypy vllm/usage --config-file pyproject.toml
        mypy vllm/*.py --config-file pyproject.toml
        mypy vllm/transformers_utils --config-file pyproject.toml

--- a/docs/source/conf.py
+++ b/docs/source/conf.py
@@ -90,6 +90,7 @@ autodoc_mock_imports = [
    "sentencepiece",
    "vllm.cuda_utils",
    "vllm._C",
+    "PIL",
    "numpy",
    "tqdm",
    "tensorizer",
@@ -116,12 +117,13 @@ class MockedClassDocumenter(autodoc.ClassDocumenter):
 autodoc.ClassDocumenter = MockedClassDocumenter

 intersphinx_mapping = {
-    'python': ('https://docs.python.org/3', None),
-    'typing_extensions':
-    ('https://typing-extensions.readthedocs.io/en/latest', None),
-    'numpy': ('https://numpy.org/doc/stable', None),
-    'torch': ('https://pytorch.org/docs/stable', None),
-    'psutil': ('https://psutil.readthedocs.io/en/stable', None),
+    "python": ("https://docs.python.org/3", None),
+    "typing_extensions":
+    ("https://typing-extensions.readthedocs.io/en/latest", None),
+    "pillow": ("https://pillow.readthedocs.io/en/stable", None),
+    "numpy": ("https://numpy.org/doc/stable", None),
+    "torch": ("https://pytorch.org/docs/stable", None),
+    "psutil": ("https://psutil.readthedocs.io/en/stable", None),
 }

 autodoc_preserve_defaults = True

--- a/docs/source/dev/multimodal/multimodal_index.rst
+++ b/docs/source/dev/multimodal/multimodal_index.rst
+Multi-Modality
+==============
+
+.. currentmodule:: vllm.multimodal
+    
+vLLM provides experimental support for multi-modal models through the :mod:`vllm.multimodal` package.
+
+:class:`vllm.inputs.PromptStrictInputs` accepts an additional attribute ``multi_modal_data``
+which allows you to pass in multi-modal input alongside text and token prompts.
+
+By default, vLLM models do not support multi-modal inputs. To enable multi-modal support for a model,
+you must decorate the model class with :meth:`MULTIMODAL_REGISTRY.register_dummy_data <MultiModalRegistry.register_dummy_data>`,
+as well as :meth:`MULTIMODAL_REGISTRY.register_input <MultiModalRegistry.register_input>` for each modality type that the model supports.
+
+.. contents::
+   :local:
+   :backlinks: none
+
+Module Contents
+++++++++++++++
+
+.. automodule:: vllm.multimodal
+
+Registry
+--------
+
+.. data:: vllm.multimodal.MULTIMODAL_REGISTRY
+
+    The global :class:`MultiModalRegistry` which is used by model runners.
+
+.. autoclass:: vllm.multimodal.MultiModalRegistry
+    :members:
+    :show-inheritance:
+
+Base Classes
+------------
+
+.. autoclass:: vllm.multimodal.MultiModalData
+    :members:
+    :show-inheritance:
+
+.. autoclass:: vllm.multimodal.MultiModalPlugin
+    :members:
+    :show-inheritance:
+
+Image Classes
+-------------
+
+.. automodule:: vllm.multimodal.image
+    :members:
+    :show-inheritance:
--- a/docs/source/index.rst
+++ b/docs/source/index.rst
@@ -88,6 +88,7 @@ Documentation
   models/adding_model
   models/engine_args
   models/lora
+   models/vlm
   models/performance

 .. toctree::
@@ -99,17 +100,18 @@ Documentation
   quantization/fp8_e4m3_kvcache

 .. toctree::
-   :maxdepth: 2
+   :maxdepth: 1
   :caption: Developer Documentation
   
   dev/sampling_params
   dev/offline_inference/offline_index
   dev/engine/engine_index
   dev/kernel/paged_attention
+   dev/multimodal/multimodal_index
   dev/dockerfile/dockerfile

 .. toctree::
-   :maxdepth: 2
+   :maxdepth: 1
   :caption: Community

   community/meetups

--- a/docs/source/models/supported_models.rst
+++ b/docs/source/models/supported_models.rst
@@ -87,6 +87,10 @@ Alongside each architecture, we include some popular models that use it.
    - LLaMA, Llama 2, Meta Llama 3, Vicuna, Alpaca, Yi
    - :code:`meta-llama/Meta-Llama-3-8B-Instruct`, :code:`meta-llama/Meta-Llama-3-70B-Instruct`, :code:`meta-llama/Llama-2-13b-hf`, :code:`meta-llama/Llama-2-70b-hf`, :code:`openlm-research/open_llama_13b`, :code:`lmsys/vicuna-13b-v1.3`, :code:`01-ai/Yi-6B`, :code:`01-ai/Yi-34B`, etc.
    - ✅︎
+  * - :code:`LlavaForConditionalGeneration`
+    - LLaVA-1.5
+    - :code:`llava-hf/llava-1.5-7b-hf`\*, :code:`llava-hf/llava-1.5-13b-hf`\*, etc.
+    -
  * - :code:`MiniCPMForCausalLM`
    - MiniCPM
    - :code:`openbmb/MiniCPM-2B-sft-bf16`, :code:`openbmb/MiniCPM-2B-dpo-bf16`, etc.

--- a/docs/source/models/vlm.rst
+++ b/docs/source/models/vlm.rst
+.. _vlm:
+
+Using VLMs
+==========
+
+This document shows you how to run and serve Vision Language Models (VLMs) using vLLM.
+
+Engine Arguments
+----------------
+
+The following :ref:`engine arguments <engine_args>` are specific to VLMs:
+
+.. argparse::
+    :module: vllm.engine.arg_utils
+    :func: _vlm_engine_args_parser
+    :prog: -m vllm.entrypoints.openai.api_server
+    :nodefaultconst:
+
+Offline Batched Inference
+-------------------------
+
+To initialize a VLM, pass the aforementioned arguments to the ``LLM`` class when instantiating the engine.
+
+.. code-block:: python
+
+    llm = LLM(
+        model="llava-hf/llava-1.5-7b-hf",
+        image_input_type="pixel_values",
+        image_token_id=32000,
+        image_input_shape="1,3,336,336",
+        image_feature_size=576,
+    )
+
+For now, we only support a single image per text prompt. To pass an image to the model, note the following in :class:`vllm.inputs.PromptStrictInputs`:
+
+* ``prompt``: The prompt should have a number of ``<image>`` tokens equal to ``image_feature_size``.
+* ``multi_modal_data``: This should be an instance of :class:`~vllm.multimodal.image.ImagePixelData` or :class:`~vllm.multimodal.image.ImageFeatureData`.
+
+.. code-block:: python
+
+    prompt = "<image>" * 576 + (
+        "\nUSER: What is the content of this image?\nASSISTANT:")
+
+    # Load the image using PIL.Image
+    image = ...
+
+    outputs = llm.generate({
+        "prompt": prompt,
+        "multi_modal_data": ImagePixelData(image),
+    })
+
+    for o in outputs:
+        generated_text = o.outputs[0].text
+        print(generated_text)
+
+A code example can be found in `examples/llava_example.py <https://github.com/vllm-project/vllm/blob/main/examples/llava_example.py>`_.
--- a/examples/llava_example.py
+++ b/examples/llava_example.py
@@ -3,33 +3,36 @@ import os
 import subprocess

 import torch
+from PIL import Image

 from vllm import LLM
-from vllm.sequence import MultiModalData
+from vllm.multimodal.image import ImageFeatureData, ImagePixelData

 # The assets are located at `s3://air-example-data-2/vllm_opensource_llava/`.
+# You can use `.buildkite/download-images.sh` to download them


-def run_llava_pixel_values():
+def run_llava_pixel_values(*, disable_image_processor: bool = False):
    llm = LLM(
        model="llava-hf/llava-1.5-7b-hf",
        image_input_type="pixel_values",
        image_token_id=32000,
        image_input_shape="1,3,336,336",
        image_feature_size=576,
+        disable_image_processor=disable_image_processor,
    )

    prompt = "<image>" * 576 + (
        "\nUSER: What is the content of this image?\nASSISTANT:")

-    # This should be provided by another online or offline component.
+    if disable_image_processor:
        image = torch.load("images/stop_sign_pixel_values.pt")
+    else:
+        image = Image.open("images/stop_sign.jpg")

    outputs = llm.generate({
-        "prompt":
-        prompt,
-        "multi_modal_data":
-        MultiModalData(type=MultiModalData.Type.IMAGE, data=image),
+        "prompt": prompt,
+        "multi_modal_data": ImagePixelData(image),
    })

    for o in outputs:
@@ -49,15 +52,13 @@ def run_llava_image_features():
    prompt = "<image>" * 576 + (
        "\nUSER: What is the content of this image?\nASSISTANT:")

-    # This should be provided by another online or offline component.
-    image = torch.load("images/stop_sign_image_features.pt")
+    image: torch.Tensor = torch.load("images/stop_sign_image_features.pt")

    outputs = llm.generate({
-        "prompt":
-        prompt,
-        "multi_modal_data":
-        MultiModalData(type=MultiModalData.Type.IMAGE, data=image),
+        "prompt": prompt,
+        "multi_modal_data": ImageFeatureData(image),
    })
+
    for o in outputs:
        generated_text = o.outputs[0].text
        print(generated_text)

--- a/format.sh
+++ b/format.sh
@@ -101,6 +101,7 @@ mypy vllm/core --config-file pyproject.toml
 mypy vllm/distributed --config-file pyproject.toml
 mypy vllm/entrypoints --config-file pyproject.toml
 mypy vllm/executor --config-file pyproject.toml
+mypy vllm/multimodal --config-file pyproject.toml
 mypy vllm/usage --config-file pyproject.toml
 mypy vllm/*.py --config-file pyproject.toml
 mypy vllm/transformers_utils --config-file pyproject.toml

--- a/requirements-common.txt
+++ b/requirements-common.txt
@@ -12,6 +12,7 @@ aiohttp
 openai
 uvicorn[standard]
 pydantic >= 2.0  # Required for OpenAI server.
+pillow  # Required for image processing
 prometheus_client >= 0.18.0
 prometheus-fastapi-instrumentator >= 7.0.0
 tiktoken >= 0.6.0  # Required for DBRX tokenizer

--- a/requirements-dev.txt
+++ b/requirements-dev.txt
@@ -33,8 +33,5 @@ sentence-transformers # required for embedding
 # Benchmarking
 aiohttp

-# Multimodal
-pillow
-
 # quantization
 bitsandbytes==0.42.0
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -15,7 +15,9 @@ from vllm.config import TokenizerPoolConfig, VisionLanguageConfig
 from vllm.distributed import destroy_model_parallel
 from vllm.inputs import TextPrompt
 from vllm.logger import init_logger
-from vllm.sequence import MultiModalData, SampleLogprobs
+from vllm.multimodal import MultiModalData
+from vllm.multimodal.image import ImageFeatureData, ImagePixelData
+from vllm.sequence import SampleLogprobs

 logger = init_logger(__name__)

@@ -24,6 +26,7 @@ _TEST_PROMPTS = [os.path.join(_TEST_DIR, "prompts", "example.txt")]
 _LONG_PROMPTS = [os.path.join(_TEST_DIR, "prompts", "summary.txt")]

 # Multi modal related
+# You can use `.buildkite/download-images.sh` to download the assets
 _PIXEL_VALUES_FILES = [
    os.path.join(_TEST_DIR, "images", filename) for filename in
    ["stop_sign_pixel_values.pt", "cherry_blossom_pixel_values.pt"]
@@ -89,17 +92,23 @@ def hf_images() -> List[Image.Image]:


 @pytest.fixture()
-def vllm_images(request) -> "torch.Tensor":
+def vllm_images(request) -> List[MultiModalData]:
    vision_language_config = request.getfixturevalue("model_and_config")[1]
-    all_images = []
    if vision_language_config.image_input_type == (
            VisionLanguageConfig.ImageInputType.IMAGE_FEATURES):
-        filenames = _IMAGE_FEATURES_FILES
+        return [
+            ImageFeatureData(torch.load(filename))
+            for filename in _IMAGE_FEATURES_FILES
+        ]
    else:
-        filenames = _PIXEL_VALUES_FILES
-    for filename in filenames:
-        all_images.append(torch.load(filename))
-    return torch.concat(all_images, dim=0)
+        return [
+            ImagePixelData(Image.open(filename)) for filename in _IMAGE_FILES
+        ]
+
+
+@pytest.fixture()
+def vllm_image_tensors(request) -> List[torch.Tensor]:
+    return [torch.load(filename) for filename in _PIXEL_VALUES_FILES]


 @pytest.fixture()
@@ -392,23 +401,17 @@ class VllmRunner:
        self,
        prompts: List[str],
        sampling_params: SamplingParams,
-        images: Optional[torch.Tensor] = None,
+        images: Optional[List[MultiModalData]] = None,
    ) -> List[Tuple[List[List[int]], List[str]]]:
        if images is not None:
            assert len(prompts) == len(images)

-        prompt_inputs: List[TextPrompt] = []
-        for i, prompt in enumerate(prompts):
-            prompt = TextPrompt(prompt=prompt)
+        inputs = [TextPrompt(prompt=prompt) for prompt in prompts]
        if images is not None:
-                prompt["multi_modal_data"] = MultiModalData(
-                    type=MultiModalData.Type.IMAGE,
-                    data=images[i:i + 1],
-                )
-
-            prompt_inputs.append(prompt)
+            for i, image in enumerate(images):
+                inputs[i]["multi_modal_data"] = image

-        req_outputs = self.model.generate(prompt_inputs,
+        req_outputs = self.model.generate(inputs,
                                          sampling_params=sampling_params)

        outputs: List[Tuple[List[List[int]], List[str]]] = []
@@ -447,7 +450,7 @@ class VllmRunner:
        self,
        prompts: List[str],
        max_tokens: int,
-        images: Optional[torch.Tensor] = None,
+        images: Optional[List[MultiModalData]] = None,
    ) -> List[Tuple[List[int], str]]:
        greedy_params = SamplingParams(temperature=0.0, max_tokens=max_tokens)
        outputs = self.generate(prompts, greedy_params, images=images)

--- a/tests/models/test_llava.py
+++ b/tests/models/test_llava.py
 import gc
 from dataclasses import fields
 from enum import Enum
-from typing import Dict, List, Tuple
+from typing import Any, Dict, List, Tuple

 import pytest
 import torch
@@ -9,36 +9,50 @@ from transformers import AutoTokenizer

 from vllm.config import VisionLanguageConfig

-model_and_vl_config = [
-    ("llava-hf/llava-1.5-7b-hf",
-     VisionLanguageConfig(
-         image_input_type=VisionLanguageConfig.ImageInputType.PIXEL_VALUES,
-         image_feature_size=576,
-         image_token_id=32000,
-         image_input_shape=(1, 3, 336, 336))),
-    ("llava-hf/llava-1.5-7b-hf",
-     VisionLanguageConfig(
-         image_input_type=VisionLanguageConfig.ImageInputType.IMAGE_FEATURES,
-         image_feature_size=576,
+
+def iter_llava_configs(model_name: str):
+    image_hw_to_feature_size = {
+        (336, 336): 576,
+    }
+
+    for (h, w), f in image_hw_to_feature_size.items():
+        for input_type, input_shape in [
+            (VisionLanguageConfig.ImageInputType.PIXEL_VALUES, (1, 3, h, w)),
+            (VisionLanguageConfig.ImageInputType.IMAGE_FEATURES, (1, f, 1024)),
+        ]:
+            yield (model_name,
+                   VisionLanguageConfig(image_input_type=input_type,
+                                        image_feature_size=f,
                                        image_token_id=32000,
-         image_input_shape=(1, 576, 1024)))
+                                        image_input_shape=input_shape,
+                                        image_processor=model_name,
+                                        image_processor_revision=None))
+
+
+model_and_vl_config = [
+    *iter_llava_configs("llava-hf/llava-1.5-7b-hf"),
+    # Not enough memory
+    # *iter_llava_configs("llava-hf/llava-1.5-13b-hf"),
 ]


-def as_dict(vision_language_config: VisionLanguageConfig) -> Dict:
+def as_dict(vlm_config: VisionLanguageConfig) -> Dict[str, Any]:
    """Flatten vision language config to pure args.

    Compatible with what llm entrypoint expects.
    """
    result = {}
-    for field in fields(vision_language_config):
-        value = getattr(vision_language_config, field.name)
+    for field in fields(vlm_config):
+        value = getattr(vlm_config, field.name)
        if isinstance(value, Enum):
            result[field.name] = value.name.lower()
        elif isinstance(value, tuple):
            result[field.name] = ",".join([str(item) for item in value])
        else:
            result[field.name] = value
+
+    result["disable_image_processor"] = vlm_config.image_processor is None
+
    return result


@@ -67,18 +81,19 @@ def sanitize_vllm_output(vllm_output: Tuple[List[int], str],
 @pytest.mark.parametrize("dtype", ["half"])
 @pytest.mark.parametrize("max_tokens", [128])
 def test_models(hf_runner, vllm_runner, hf_image_prompts, hf_images,
-                vllm_image_prompts, vllm_images, model_and_config: tuple,
-                dtype: str, max_tokens: int, worker_use_ray: bool) -> None:
+                vllm_image_prompts, vllm_images, model_and_config, dtype: str,
+                max_tokens: int, worker_use_ray: bool) -> None:
    """Inference result should be the same between hf and vllm.

    All the image fixtures for the test is under tests/images.
-    For huggingface runner, we provide the raw images as input.
-    For vllm runner, we provide image tensors and corresponding
+    For huggingface runner, we provide the PIL images as input.
+    For vllm runner, we provide MultiModalData objects and corresponding
    vision language config as input.
    Note, the text input is also adjusted to abide by vllm contract.
    The text output is sanitized to be able to compare with hf.
    """
    model_id, vision_language_config = model_and_config
+
    hf_model = hf_runner(model_id, dtype=dtype)
    hf_outputs = hf_model.generate_greedy(hf_image_prompts,
                                          max_tokens,
@@ -88,6 +103,7 @@ def test_models(hf_runner, vllm_runner, hf_image_prompts, hf_images,
    vllm_model = vllm_runner(model_id,
                             dtype=dtype,
                             worker_use_ray=worker_use_ray,
+                             enforce_eager=True,
                             **as_dict(vision_language_config))
    vllm_outputs = vllm_model.generate_greedy(vllm_image_prompts,
                                              max_tokens,
@@ -105,3 +121,7 @@ def test_models(hf_runner, vllm_runner, hf_image_prompts, hf_images,
            f"Test{i}:\nHF: {hf_output_str!r}\nvLLM: {vllm_output_str!r}")
        assert hf_output_ids == vllm_output_ids, (
            f"Test{i}:\nHF: {hf_output_ids}\nvLLM: {vllm_output_ids}")
+
+
+# TODO: Add test for `tensor_parallel_size` [ref: PR #3883]
+# (Requires multiple GPUs)
--- a/tests/multimodal/__init__.py
+++ b/tests/multimodal/__init__.py
--- a/tests/multimodal/test_processor.py
+++ b/tests/multimodal/test_processor.py
+import numpy as np
+import pytest
+from transformers import CLIPImageProcessor
+
+from vllm.config import ModelConfig, VisionLanguageConfig
+from vllm.multimodal import MULTIMODAL_REGISTRY
+from vllm.multimodal.image import ImagePixelData
+
+
+@pytest.mark.parametrize("dtype", ["half", "bfloat16", "float"])
+def test_clip_image_processor(hf_images, dtype):
+    MODEL_NAME = "llava-hf/llava-1.5-7b-hf"
+    IMAGE_HEIGHT = IMAGE_WIDTH = 33
+
+    hf_processor = CLIPImageProcessor.from_pretrained(MODEL_NAME)
+    assert isinstance(hf_processor, CLIPImageProcessor)
+
+    model_config = ModelConfig(
+        model=MODEL_NAME,
+        tokenizer=MODEL_NAME,
+        tokenizer_mode="auto",
+        trust_remote_code=False,
+        seed=0,
+        dtype=dtype,
+        revision=None,
+    )
+    vlm_config = VisionLanguageConfig(
+        image_input_type=VisionLanguageConfig.ImageInputType.PIXEL_VALUES,
+        image_token_id=32000,
+        image_input_shape=(1, 3, IMAGE_HEIGHT, IMAGE_WIDTH),
+        image_feature_size=576,
+        image_processor=MODEL_NAME,
+        image_processor_revision=None,
+    )
+
+    for image in hf_images:
+        hf_result = hf_processor.preprocess(
+            image,
+            return_tensors="np",
+        )
+        vllm_result = MULTIMODAL_REGISTRY.process_input(
+            ImagePixelData(image),
+            model_config=model_config,
+            vlm_config=vlm_config,
+        )
+
+        assert hf_result.keys() == vllm_result.keys()
+        for key, hf_arr in hf_result.items():
+            vllm_arr: np.ndarray = vllm_result[key].numpy()
+
+            assert hf_arr.shape == vllm_arr.shape, f"Failed for key={key}"
+            assert np.allclose(hf_arr, vllm_arr), f"Failed for key={key}"
+
+
+@pytest.mark.parametrize("dtype", ["float"])
+def test_image_pixel_types(hf_images, vllm_image_tensors, dtype):
+    MODEL_NAME = "llava-hf/llava-1.5-7b-hf"
+    IMAGE_HEIGHT = IMAGE_WIDTH = 33
+
+    model_config = ModelConfig(
+        model=MODEL_NAME,
+        tokenizer=MODEL_NAME,
+        tokenizer_mode="auto",
+        trust_remote_code=False,
+        seed=0,
+        dtype=dtype,
+        revision=None,
+    )
+    vlm_config = VisionLanguageConfig(
+        image_input_type=VisionLanguageConfig.ImageInputType.PIXEL_VALUES,
+        image_token_id=32000,
+        image_input_shape=(1, 3, IMAGE_HEIGHT, IMAGE_WIDTH),
+        image_feature_size=576,
+        image_processor=MODEL_NAME,
+        image_processor_revision=None,
+    )
+
+    for image, tensor in zip(hf_images, vllm_image_tensors):
+        image_result = MULTIMODAL_REGISTRY.process_input(
+            ImagePixelData(image),
+            model_config=model_config,
+            vlm_config=vlm_config,
+        )
+        tensor_result = MULTIMODAL_REGISTRY.process_input(
+            ImagePixelData(tensor),
+            model_config=model_config,
+            vlm_config=vlm_config,
+        )
+
+        assert image_result.keys() == tensor_result.keys()
+        for key, image_arr in image_result.items():
+            tensor_arr: np.ndarray = tensor_result[key].numpy()
+
+            assert image_arr.shape == tensor_arr.shape, f"Failed for key={key}"
+
+            # The examples in PR #3042 were preprocessed slightly differently
+            # from HuggingFace's LlavaProcessor, so the value comparison below
+            # currently fails and is left disabled.
+            # assert np.allclose(image_arr, tensor_arr), f"Failed for key={key}"
--- a/tests/spec_decode/e2e/conftest.py
+++ b/tests/spec_decode/e2e/conftest.py
@@ -18,9 +18,10 @@ from vllm.engine.arg_utils import AsyncEngineArgs
 from vllm.engine.async_llm_engine import AsyncLLMEngine
 from vllm.lora.request import LoRARequest
 from vllm.model_executor.utils import set_random_seed
+from vllm.multimodal import MultiModalData
 from vllm.outputs import RequestOutput
 from vllm.sampling_params import SamplingParams
-from vllm.sequence import Logprob, MultiModalData
+from vllm.sequence import Logprob
 from vllm.usage.usage_lib import UsageContext
 from vllm.utils import Counter, random_uuid


--- a/tests/tokenization/test_image_processor.py
+++ b/tests/tokenization/test_image_processor.py
+import pytest
+from transformers.image_processing_utils import BaseImageProcessor
+
+from vllm.transformers_utils.image_processor import get_image_processor
+
+IMAGE_PROCESSOR_NAMES = [
+    "llava-hf/llava-1.5-7b-hf",
+    "llava-hf/llava-v1.6-34b-hf",
+]
+
+
+@pytest.mark.parametrize("processor_name", IMAGE_PROCESSOR_NAMES)
+def test_image_processor_revision(processor_name: str):
+    # Assume that the "main" branch always exists
+    image_processor = get_image_processor(processor_name, revision="main")
+    assert isinstance(image_processor, BaseImageProcessor)
+
+    # Assume that a branch named "never" never exists
+    with pytest.raises(OSError, match='not a valid git identifier'):
+        get_image_processor(processor_name, revision="never")
--- a/vllm/config.py
+++ b/vllm/config.py
@@ -1094,10 +1094,12 @@ class VisionLanguageConfig:
    # worst case scenario (biggest supported resolution).
    image_input_shape: tuple
    image_feature_size: int
+    # The image processor to load from HuggingFace
+    image_processor: Optional[str]
+    image_processor_revision: Optional[str]

    @classmethod
-    def get_image_input_enum_type(
-            cls, value: str) -> "VisionLanguageConfig.ImageInputType":
+    def get_image_input_enum_type(cls, value: str) -> ImageInputType:
        """Get the image input type from a string."""
        try:
            return cls.ImageInputType[value.upper()]

--- a/vllm/engine/arg_utils.py
+++ b/vllm/engine/arg_utils.py
 import argparse
 import dataclasses
 import json
+import warnings
 from dataclasses import dataclass
 from typing import List, Optional, Tuple, Union

@@ -80,6 +81,10 @@ class EngineArgs:
    image_token_id: Optional[int] = None
    image_input_shape: Optional[str] = None
    image_feature_size: Optional[int] = None
+    image_processor: Optional[str] = None
+    image_processor_revision: Optional[str] = None
+    disable_image_processor: bool = False
+
    scheduler_delay_factor: float = 0.0
    enable_chunked_prefill: bool = False

@@ -98,6 +103,53 @@ class EngineArgs:
        if self.tokenizer is None:
            self.tokenizer = self.model

+    @staticmethod
+    def add_cli_args_for_vlm(
+            parser: argparse.ArgumentParser) -> argparse.ArgumentParser:
+        parser.add_argument('--image-input-type',
+                            type=nullable_str,
+                            default=None,
+                            choices=[
+                                t.name.lower()
+                                for t in VisionLanguageConfig.ImageInputType
+                            ],
+                            help=('The image input type passed into vLLM.'))
+        parser.add_argument('--image-token-id',
+                            type=int,
+                            default=None,
+                            help=('Input id for image token.'))
+        parser.add_argument(
+            '--image-input-shape',
+            type=nullable_str,
+            default=None,
+            help=('The biggest image input shape (worst for memory footprint) '
+                  'given an input type. Only used for vLLM\'s profile_run.'))
+        parser.add_argument(
+            '--image-feature-size',
+            type=int,
+            default=None,
+            help=('The image feature size along the context dimension.'))
+        parser.add_argument(
+            '--image-processor',
+            type=str,
+            default=EngineArgs.image_processor,
+            help='Name or path of the huggingface image processor to use. '
+            'If unspecified, model name or path will be used.')
+        parser.add_argument(
+            '--image-processor-revision',
+            type=str,
+            default=None,
+            help='Revision of the huggingface image processor version to use. '
+            'It can be a branch name, a tag name, or a commit id. '
+            'If unspecified, will use the default version.')
+        parser.add_argument(
+            '--disable-image-processor',
+            action='store_true',
+            help='Disables the use of image processor, even if one is defined '
+            'for the model on huggingface.')
+
+        return parser
+
    @staticmethod
    def add_cli_args(
            parser: argparse.ArgumentParser) -> argparse.ArgumentParser:
@@ -113,7 +165,8 @@ class EngineArgs:
            '--tokenizer',
            type=nullable_str,
            default=EngineArgs.tokenizer,
-            help='Name or path of the huggingface tokenizer to use.')
+            help='Name or path of the huggingface tokenizer to use. '
+            'If unspecified, model name or path will be used.')
        parser.add_argument(
            '--skip-tokenizer-init',
            action='store_true',
@@ -136,9 +189,9 @@ class EngineArgs:
            '--tokenizer-revision',
            type=nullable_str,
            default=None,
-            help='The specific tokenizer version to use. It can be a branch '
-            'name, a tag name, or a commit id. If unspecified, will use '
-            'the default version.')
+            help='Revision of the huggingface tokenizer to use. '
+            'It can be a branch name, a tag name, or a commit id. '
+            'If unspecified, will use the default version.')
        parser.add_argument(
            '--tokenizer-mode',
            type=str,
@@ -445,31 +498,10 @@ class EngineArgs:
                            default=EngineArgs.device,
                            choices=["auto", "cuda", "neuron", "cpu"],
                            help='Device type for vLLM execution.')
+
        # Related to Vision-language models such as llava
-        parser.add_argument(
-            '--image-input-type',
-            type=nullable_str,
-            default=None,
-            choices=[
-                t.name.lower() for t in VisionLanguageConfig.ImageInputType
-            ],
-            help=('The image input type passed into vLLM. '
-                  'Should be one of "pixel_values" or "image_features".'))
-        parser.add_argument('--image-token-id',
-                            type=int,
-                            default=None,
-                            help=('Input id for image token.'))
-        parser.add_argument(
-            '--image-input-shape',
-            type=nullable_str,
-            default=None,
-            help=('The biggest image input shape (worst for memory footprint) '
-                  'given an input type. Only used for vLLM\'s profile_run.'))
-        parser.add_argument(
-            '--image-feature-size',
-            type=int,
-            default=None,
-            help=('The image feature size along the context dimension.'))
+        parser = EngineArgs.add_cli_args_for_vlm(parser)
+
        parser.add_argument(
            '--scheduler-delay-factor',
            type=float,
@@ -488,7 +520,6 @@ class EngineArgs:
            default=EngineArgs.speculative_model,
            help=
            'The name of the draft model to be used in speculative decoding.')
-
        parser.add_argument(
            '--num-speculative-tokens',
            type=int,
@@ -666,12 +697,27 @@ class EngineArgs:
                raise ValueError(
                    'Specify `image_token_id`, `image_input_shape` and '
                    '`image_feature_size` together with `image_input_type`.')
+
+            if self.image_processor is None:
+                self.image_processor = self.model
+            if self.disable_image_processor:
+                if self.image_processor != self.model:
+                    warnings.warn(
+                        "You've specified an image processor "
+                        f"({self.image_processor}) but also disabled "
+                        "it via `--disable-image-processor`.",
+                        stacklevel=2)
+
+                self.image_processor = None
+
            vision_language_config = VisionLanguageConfig(
                image_input_type=VisionLanguageConfig.
                get_image_input_enum_type(self.image_input_type),
                image_token_id=self.image_token_id,
                image_input_shape=str_to_int_tuple(self.image_input_shape),
                image_feature_size=self.image_feature_size,
+                image_processor=self.image_processor,
+                image_processor_revision=self.image_processor_revision,
            )
        else:
            vision_language_config = None
@@ -734,3 +780,7 @@ def _engine_args_parser():
 def _async_engine_args_parser():
    return AsyncEngineArgs.add_cli_args(argparse.ArgumentParser(),
                                        async_args_only=True)
+
+
+def _vlm_engine_args_parser():
+    return EngineArgs.add_cli_args_for_vlm(argparse.ArgumentParser())
--- a/vllm/entrypoints/llm.py
+++ b/vllm/entrypoints/llm.py
@@ -14,7 +14,6 @@ from vllm.lora.request import LoRARequest
 from vllm.outputs import EmbeddingRequestOutput, RequestOutput
 from vllm.pooling_params import PoolingParams
 from vllm.sampling_params import SamplingParams
-from vllm.sequence import MultiModalData
 from vllm.usage.usage_lib import UsageContext
 from vllm.utils import Counter, deprecate_kwargs

@@ -164,7 +163,6 @@ class LLM:
        prompt_token_ids: Optional[List[int]] = None,
        use_tqdm: bool = True,
        lora_request: Optional[LoRARequest] = None,
-        multi_modal_data: Optional[MultiModalData] = None,
    ) -> List[RequestOutput]:
        ...

@@ -177,7 +175,6 @@ class LLM:
        prompt_token_ids: Optional[List[List[int]]] = None,
        use_tqdm: bool = True,
        lora_request: Optional[LoRARequest] = None,
-        multi_modal_data: Optional[MultiModalData] = None,
    ) -> List[RequestOutput]:
        ...

@@ -191,7 +188,6 @@ class LLM:
        prompt_token_ids: List[int],
        use_tqdm: bool = True,
        lora_request: Optional[LoRARequest] = None,
-        multi_modal_data: Optional[MultiModalData] = None,
    ) -> List[RequestOutput]:
        ...

@@ -205,7 +201,6 @@ class LLM:
        prompt_token_ids: List[List[int]],
        use_tqdm: bool = True,
        lora_request: Optional[LoRARequest] = None,
-        multi_modal_data: Optional[MultiModalData] = None,
    ) -> List[RequestOutput]:
        ...

@@ -217,7 +212,6 @@ class LLM:
        prompt_token_ids: Union[List[int], List[List[int]]],
        use_tqdm: bool = True,
        lora_request: Optional[LoRARequest] = None,
-        multi_modal_data: Optional[MultiModalData] = None,
    ) -> List[RequestOutput]:
        ...

@@ -236,7 +230,6 @@ class LLM:

    @deprecate_kwargs("prompts",
                      "prompt_token_ids",
-                      "multi_modal_data",
                      is_deprecated=lambda: LLM.DEPRECATE_LEGACY,
                      additional_message="Please use the 'inputs' parameter "
                      "instead.")
@@ -249,7 +242,6 @@ class LLM:
        prompt_token_ids: Optional[Union[List[int], List[List[int]]]] = None,
        use_tqdm: bool = True,
        lora_request: Optional[LoRARequest] = None,
-        multi_modal_data: Optional[MultiModalData] = None,
    ) -> List[RequestOutput]:
        """Generates the completions for the input prompts.

@@ -281,11 +273,10 @@ class LLM:
                "LLM.generate() is only supported for generation models "
                "(XForCausalLM).")

-        if prompt_token_ids is not None or multi_modal_data is not None:
+        if prompt_token_ids is not None:
            inputs = self._convert_v1_inputs(
                prompts=cast(Optional[Union[str, List[str]]], prompts),
                prompt_token_ids=prompt_token_ids,
-                multi_modal_data=multi_modal_data,
            )
        else:
            inputs = cast(
@@ -314,7 +305,6 @@ class LLM:
        prompt_token_ids: Optional[List[int]] = None,
        use_tqdm: bool = True,
        lora_request: Optional[LoRARequest] = None,
-        multi_modal_data: Optional[MultiModalData] = None,
    ) -> List[EmbeddingRequestOutput]:
        ...

@@ -327,7 +317,6 @@ class LLM:
        prompt_token_ids: Optional[List[List[int]]] = None,
        use_tqdm: bool = True,
        lora_request: Optional[LoRARequest] = None,
-        multi_modal_data: Optional[MultiModalData] = None,
    ) -> List[EmbeddingRequestOutput]:
        ...

@@ -341,7 +330,6 @@ class LLM:
        prompt_token_ids: List[int],
        use_tqdm: bool = True,
        lora_request: Optional[LoRARequest] = None,
-        multi_modal_data: Optional[MultiModalData] = None,
    ) -> List[EmbeddingRequestOutput]:
        ...

@@ -355,7 +343,6 @@ class LLM:
        prompt_token_ids: List[List[int]],
        use_tqdm: bool = True,
        lora_request: Optional[LoRARequest] = None,
-        multi_modal_data: Optional[MultiModalData] = None,
    ) -> List[EmbeddingRequestOutput]:
        ...

@@ -367,7 +354,6 @@ class LLM:
        prompt_token_ids: Union[List[int], List[List[int]]],
        use_tqdm: bool = True,
        lora_request: Optional[LoRARequest] = None,
-        multi_modal_data: Optional[MultiModalData] = None,
    ) -> List[EmbeddingRequestOutput]:
        ...

@@ -386,7 +372,6 @@ class LLM:

    @deprecate_kwargs("prompts",
                      "prompt_token_ids",
-                      "multi_modal_data",
                      is_deprecated=lambda: LLM.DEPRECATE_LEGACY,
                      additional_message="Please use the 'inputs' parameter "
                      "instead.")
@@ -399,7 +384,6 @@ class LLM:
        prompt_token_ids: Optional[Union[List[int], List[List[int]]]] = None,
        use_tqdm: bool = True,
        lora_request: Optional[LoRARequest] = None,
-        multi_modal_data: Optional[MultiModalData] = None,
    ) -> List[EmbeddingRequestOutput]:
        """Generates the completions for the input prompts.

@@ -430,11 +414,10 @@ class LLM:
                "LLM.encode() is only supported for embedding models (XModel)."
            )

-        if prompt_token_ids is not None or multi_modal_data is not None:
+        if prompt_token_ids is not None:
            inputs = self._convert_v1_inputs(
                prompts=cast(Optional[Union[str, List[str]]], prompts),
                prompt_token_ids=prompt_token_ids,
-                multi_modal_data=multi_modal_data,
            )
        else:
            inputs = cast(
@@ -459,7 +442,6 @@ class LLM:
        self,
        prompts: Optional[Union[str, List[str]]],
        prompt_token_ids: Optional[Union[List[int], List[List[int]]]],
-        multi_modal_data: Optional[MultiModalData],
    ):
        # skip_tokenizer_init is now checked in engine

@@ -499,9 +481,6 @@ class LLM:
                else:
                    raise AssertionError

-            if multi_modal_data is not None:
-                item["multi_modal_data"] = multi_modal_data
-
            inputs.append(item)

        return inputs

--- a/vllm/model_executor/models/llava.py
+++ b/vllm/model_executor/models/llava.py
@@ -17,6 +17,8 @@ from vllm.model_executor.layers.vocab_parallel_embedding import ParallelLMHead
 from vllm.model_executor.model_loader.weight_utils import default_weight_loader
 from vllm.model_executor.models.llama import LlamaModel
 from vllm.model_executor.sampling_metadata import SamplingMetadata
+from vllm.multimodal import MULTIMODAL_REGISTRY
+from vllm.multimodal.image import get_dummy_image_data
 from vllm.sequence import SamplerOutput

 from .vlm_base import VisionLanguageModelBase
@@ -82,6 +84,9 @@ class LlavaImageFeatureInputs(TypedDict):
 LlavaImageInputs = Union[LlavaImagePixelInputs, LlavaImageFeatureInputs]


+@MULTIMODAL_REGISTRY.register_image_feature_input()
+@MULTIMODAL_REGISTRY.register_image_pixel_input()
+@MULTIMODAL_REGISTRY.register_dummy_data(get_dummy_image_data)
 class LlavaForConditionalGeneration(VisionLanguageModelBase):

    def __init__(self,
@@ -131,30 +136,54 @@ class LlavaForConditionalGeneration(VisionLanguageModelBase):
        return data

    def _parse_and_validate_image_input(
-            self, data: object) -> Optional[LlavaImageInputs]:
+            self, **kwargs: object) -> Optional[LlavaImageInputs]:
+        """Select and validate the image input expected by the config.
+
+        Takes ``pixel_values`` and ``image_features`` out of ``kwargs`` and
+        wraps the one matching ``vision_language_config.image_input_type``
+        after passing it through ``_validate_image_data``. Returns ``None``
+        if that input is missing or the configured type is neither kind.
+
+        Raises:
+            ValueError: If the other kind of input was supplied, or the
+                expected input is not a ``torch.Tensor``.
+        """
+        pixel_values = kwargs.pop("pixel_values", None)
+        image_features = kwargs.pop("image_features", None)
+
        expected_input_type = self.vision_language_config.image_input_type
        ImageInputType = VisionLanguageConfig.ImageInputType

-        if data is None:
+        if expected_input_type == ImageInputType.PIXEL_VALUES:
+            if image_features is not None:
+                raise ValueError(
+                    "Expected pixel values but got image features")
+            if pixel_values is None:
                return None

-        if expected_input_type == ImageInputType.PIXEL_VALUES:
-            if not isinstance(data, torch.Tensor):
-                raise TypeError("Image pixel vector should be a tensor, "
-                                f"but received type: {type(data)}")
+            if not isinstance(pixel_values, torch.Tensor):
+                raise ValueError("Incorrect type of pixel values. "
+                                 f"Got type: {type(pixel_values)}")

            return LlavaImagePixelInputs(
                type="pixel_values",
-                data=self._validate_image_data(data),
+                data=self._validate_image_data(pixel_values),
            )
-        elif expected_input_type == ImageInputType.IMAGE_FEATURES:
-            if not isinstance(data, torch.Tensor):
-                raise TypeError("Image feature vector should be a tensor, "
-                                f"but received type: {type(data)}")
+
+        if expected_input_type == ImageInputType.IMAGE_FEATURES:
+            if pixel_values is not None:
+                raise ValueError(
+                    "Expected image features but got pixel values")
+            if image_features is None:
+                return None
+
+            if not isinstance(image_features, torch.Tensor):
+                raise ValueError("Incorrect type of image features. "
+                                 f"Got type: {type(image_features)}")

            return LlavaImageFeatureInputs(
                type="image_features",
-                data=self._validate_image_data(data),
+                data=self._validate_image_data(image_features),
            )

        return None
@@ -201,12 +217,14 @@ class LlavaForConditionalGeneration(VisionLanguageModelBase):

        return self.multi_modal_projector(image_features)

-    def forward(self,
+    def forward(
+        self,
        input_ids: torch.Tensor,
        positions: torch.Tensor,
        kv_caches: List[torch.Tensor],
        attn_metadata: AttentionMetadata,
-                image_input: Optional[torch.Tensor] = None) -> SamplerOutput:
+        **kwargs: object,
+    ) -> SamplerOutput:
        """Run forward pass for Llava 1.5.

        One key thing to understand is the `input_ids` already accounts for the
@@ -239,14 +257,15 @@ class LlavaForConditionalGeneration(VisionLanguageModelBase):
        Args:
            input_ids: Flattened (concatenated) input_ids corresponding to a
                batch.
-            image_input: A batch of image inputs.
-                For PIXEL_VALUES, expecting [1, 3, 336, 336].
-                For IMAGE_FEATURES, expecting [1, 576, 1024].
+            pixel_values: For PIXEL_VALUES, expects a batch with shape
+                [1, 3, 336, 336].
+            image_features: For IMAGE_FEATURES, expects a batch with shape
+                [1, 576, 1024].
        """
-        parsed_image_input = self._parse_and_validate_image_input(image_input)
+        image_input = self._parse_and_validate_image_input(**kwargs)

-        if parsed_image_input is not None:
-            vision_embeddings = self._process_image_input(parsed_image_input)
+        if image_input is not None:
+            vision_embeddings = self._process_image_input(image_input)
            inputs_embeds = self.language_model.get_input_embeddings(input_ids)

            inputs_embeds = _merge_vision_embeddings(