Unverified Commit 7a64d24a authored by Cyrus Leung's avatar Cyrus Leung Committed by GitHub
Browse files

[Core] Support image processor (#4197)

parent dfbe60dc
...@@ -37,6 +37,7 @@ jobs: ...@@ -37,6 +37,7 @@ jobs:
mypy vllm/distributed --config-file pyproject.toml mypy vllm/distributed --config-file pyproject.toml
mypy vllm/entrypoints --config-file pyproject.toml mypy vllm/entrypoints --config-file pyproject.toml
mypy vllm/executor --config-file pyproject.toml mypy vllm/executor --config-file pyproject.toml
mypy vllm/multimodal --config-file pyproject.toml
mypy vllm/usage --config-file pyproject.toml mypy vllm/usage --config-file pyproject.toml
mypy vllm/*.py --config-file pyproject.toml mypy vllm/*.py --config-file pyproject.toml
mypy vllm/transformers_utils --config-file pyproject.toml mypy vllm/transformers_utils --config-file pyproject.toml
......
...@@ -90,6 +90,7 @@ autodoc_mock_imports = [ ...@@ -90,6 +90,7 @@ autodoc_mock_imports = [
"sentencepiece", "sentencepiece",
"vllm.cuda_utils", "vllm.cuda_utils",
"vllm._C", "vllm._C",
"PIL",
"numpy", "numpy",
"tqdm", "tqdm",
"tensorizer", "tensorizer",
...@@ -116,12 +117,13 @@ class MockedClassDocumenter(autodoc.ClassDocumenter): ...@@ -116,12 +117,13 @@ class MockedClassDocumenter(autodoc.ClassDocumenter):
autodoc.ClassDocumenter = MockedClassDocumenter autodoc.ClassDocumenter = MockedClassDocumenter
intersphinx_mapping = { intersphinx_mapping = {
'python': ('https://docs.python.org/3', None), "python": ("https://docs.python.org/3", None),
'typing_extensions': "typing_extensions":
('https://typing-extensions.readthedocs.io/en/latest', None), ("https://typing-extensions.readthedocs.io/en/latest", None),
'numpy': ('https://numpy.org/doc/stable', None), "pillow": ("https://pillow.readthedocs.io/en/stable", None),
'torch': ('https://pytorch.org/docs/stable', None), "numpy": ("https://numpy.org/doc/stable", None),
'psutil': ('https://psutil.readthedocs.io/en/stable', None), "torch": ("https://pytorch.org/docs/stable", None),
"psutil": ("https://psutil.readthedocs.io/en/stable", None),
} }
autodoc_preserve_defaults = True autodoc_preserve_defaults = True
......
Multi-Modality
==============
.. currentmodule:: vllm.multimodal
vLLM provides experimental support for multi-modal models through the :mod:`vllm.multimodal` package.
:class:`vllm.inputs.PromptStrictInputs` accepts an additional attribute ``multi_modal_data``
which allows you to pass in multi-modal input alongside text and token prompts.
By default, vLLM models do not support multi-modal inputs. To enable multi-modal support for a model,
you must decorate the model class with :meth:`MULTIMODAL_REGISTRY.register_dummy_data <MultiModalRegistry.register_dummy_data>`,
as well as :meth:`MULTIMODAL_REGISTRY.register_input <MultiModalRegistry.register_input>` for each modality type to support.
.. contents::
    :local:
    :backlinks: none
Module Contents
+++++++++++++++
.. automodule:: vllm.multimodal
Registry
--------
.. data:: vllm.multimodal.MULTIMODAL_REGISTRY

    The global :class:`MultiModalRegistry` which is used by model runners.

.. autoclass:: vllm.multimodal.MultiModalRegistry
    :members:
    :show-inheritance:
Base Classes
------------
.. autoclass:: vllm.multimodal.MultiModalData
    :members:
    :show-inheritance:

.. autoclass:: vllm.multimodal.MultiModalPlugin
    :members:
    :show-inheritance:
Image Classes
-------------
.. automodule:: vllm.multimodal.image
    :members:
    :show-inheritance:
...@@ -88,6 +88,7 @@ Documentation ...@@ -88,6 +88,7 @@ Documentation
models/adding_model models/adding_model
models/engine_args models/engine_args
models/lora models/lora
models/vlm
models/performance models/performance
.. toctree:: .. toctree::
...@@ -99,17 +100,18 @@ Documentation ...@@ -99,17 +100,18 @@ Documentation
quantization/fp8_e4m3_kvcache quantization/fp8_e4m3_kvcache
.. toctree:: .. toctree::
:maxdepth: 2 :maxdepth: 1
:caption: Developer Documentation :caption: Developer Documentation
dev/sampling_params dev/sampling_params
dev/offline_inference/offline_index dev/offline_inference/offline_index
dev/engine/engine_index dev/engine/engine_index
dev/kernel/paged_attention dev/kernel/paged_attention
dev/multimodal/multimodal_index
dev/dockerfile/dockerfile dev/dockerfile/dockerfile
.. toctree:: .. toctree::
:maxdepth: 2 :maxdepth: 1
:caption: Community :caption: Community
community/meetups community/meetups
......
...@@ -87,6 +87,10 @@ Alongside each architecture, we include some popular models that use it. ...@@ -87,6 +87,10 @@ Alongside each architecture, we include some popular models that use it.
- LLaMA, Llama 2, Meta Llama 3, Vicuna, Alpaca, Yi - LLaMA, Llama 2, Meta Llama 3, Vicuna, Alpaca, Yi
- :code:`meta-llama/Meta-Llama-3-8B-Instruct`, :code:`meta-llama/Meta-Llama-3-70B-Instruct`, :code:`meta-llama/Llama-2-13b-hf`, :code:`meta-llama/Llama-2-70b-hf`, :code:`openlm-research/open_llama_13b`, :code:`lmsys/vicuna-13b-v1.3`, :code:`01-ai/Yi-6B`, :code:`01-ai/Yi-34B`, etc. - :code:`meta-llama/Meta-Llama-3-8B-Instruct`, :code:`meta-llama/Meta-Llama-3-70B-Instruct`, :code:`meta-llama/Llama-2-13b-hf`, :code:`meta-llama/Llama-2-70b-hf`, :code:`openlm-research/open_llama_13b`, :code:`lmsys/vicuna-13b-v1.3`, :code:`01-ai/Yi-6B`, :code:`01-ai/Yi-34B`, etc.
- ✅︎ - ✅︎
* - :code:`LlavaForConditionalGeneration`
- LLaVA-1.5
- :code:`llava-hf/llava-1.5-7b-hf`\*, :code:`llava-hf/llava-1.5-13b-hf`\*, etc.
-
* - :code:`MiniCPMForCausalLM` * - :code:`MiniCPMForCausalLM`
- MiniCPM - MiniCPM
- :code:`openbmb/MiniCPM-2B-sft-bf16`, :code:`openbmb/MiniCPM-2B-dpo-bf16`, etc. - :code:`openbmb/MiniCPM-2B-sft-bf16`, :code:`openbmb/MiniCPM-2B-dpo-bf16`, etc.
......
.. _vlm:
Using VLMs
==========
This document shows you how to run and serve Vision Language Models (VLMs) using vLLM.
Engine Arguments
----------------
The following :ref:`engine arguments <engine_args>` are specific to VLMs:
.. argparse::
    :module: vllm.engine.arg_utils
    :func: _vlm_engine_args_parser
    :prog: -m vllm.entrypoints.openai.api_server
    :nodefaultconst:
Offline Batched Inference
-------------------------
To initialize a VLM, the aforementioned arguments must be passed to the ``LLM`` class for instantiating the engine.
.. code-block:: python

    llm = LLM(
        model="llava-hf/llava-1.5-7b-hf",
        image_input_type="pixel_values",
        image_token_id=32000,
        image_input_shape="1,3,336,336",
        image_feature_size=576,
    )

For now, we only support a single image per text prompt. To pass an image to the model, note the following in :class:`vllm.inputs.PromptStrictInputs`:
* ``prompt``: The prompt should have a number of ``<image>`` tokens equal to ``image_feature_size``.
* ``multi_modal_data``: This should be an instance of :class:`~vllm.multimodal.image.ImagePixelData` or :class:`~vllm.multimodal.image.ImageFeatureData`.
.. code-block:: python

    prompt = "<image>" * 576 + (
        "\nUSER: What is the content of this image?\nASSISTANT:")

    # Load the image using PIL.Image
    image = ...

    outputs = llm.generate({
        "prompt": prompt,
        "multi_modal_data": ImagePixelData(image),
    })

    for o in outputs:
        generated_text = o.outputs[0].text
        print(generated_text)

A code example can be found in `examples/llava_example.py <https://github.com/vllm-project/vllm/blob/main/examples/llava_example.py>`_.
...@@ -3,33 +3,36 @@ import os ...@@ -3,33 +3,36 @@ import os
import subprocess import subprocess
import torch import torch
from PIL import Image
from vllm import LLM from vllm import LLM
from vllm.sequence import MultiModalData from vllm.multimodal.image import ImageFeatureData, ImagePixelData
# The assets are located at `s3://air-example-data-2/vllm_opensource_llava/`. # The assets are located at `s3://air-example-data-2/vllm_opensource_llava/`.
# You can use `.buildkite/download-images.sh` to download them
def run_llava_pixel_values(): def run_llava_pixel_values(*, disable_image_processor: bool = False):
llm = LLM( llm = LLM(
model="llava-hf/llava-1.5-7b-hf", model="llava-hf/llava-1.5-7b-hf",
image_input_type="pixel_values", image_input_type="pixel_values",
image_token_id=32000, image_token_id=32000,
image_input_shape="1,3,336,336", image_input_shape="1,3,336,336",
image_feature_size=576, image_feature_size=576,
disable_image_processor=disable_image_processor,
) )
prompt = "<image>" * 576 + ( prompt = "<image>" * 576 + (
"\nUSER: What is the content of this image?\nASSISTANT:") "\nUSER: What is the content of this image?\nASSISTANT:")
# This should be provided by another online or offline component. if disable_image_processor:
image = torch.load("images/stop_sign_pixel_values.pt") image = torch.load("images/stop_sign_pixel_values.pt")
else:
image = Image.open("images/stop_sign.jpg")
outputs = llm.generate({ outputs = llm.generate({
"prompt": "prompt": prompt,
prompt, "multi_modal_data": ImagePixelData(image),
"multi_modal_data":
MultiModalData(type=MultiModalData.Type.IMAGE, data=image),
}) })
for o in outputs: for o in outputs:
...@@ -49,15 +52,13 @@ def run_llava_image_features(): ...@@ -49,15 +52,13 @@ def run_llava_image_features():
prompt = "<image>" * 576 + ( prompt = "<image>" * 576 + (
"\nUSER: What is the content of this image?\nASSISTANT:") "\nUSER: What is the content of this image?\nASSISTANT:")
# This should be provided by another online or offline component. image: torch.Tensor = torch.load("images/stop_sign_image_features.pt")
image = torch.load("images/stop_sign_image_features.pt")
outputs = llm.generate({ outputs = llm.generate({
"prompt": "prompt": prompt,
prompt, "multi_modal_data": ImageFeatureData(image),
"multi_modal_data":
MultiModalData(type=MultiModalData.Type.IMAGE, data=image),
}) })
for o in outputs: for o in outputs:
generated_text = o.outputs[0].text generated_text = o.outputs[0].text
print(generated_text) print(generated_text)
......
...@@ -101,6 +101,7 @@ mypy vllm/core --config-file pyproject.toml ...@@ -101,6 +101,7 @@ mypy vllm/core --config-file pyproject.toml
mypy vllm/distributed --config-file pyproject.toml mypy vllm/distributed --config-file pyproject.toml
mypy vllm/entrypoints --config-file pyproject.toml mypy vllm/entrypoints --config-file pyproject.toml
mypy vllm/executor --config-file pyproject.toml mypy vllm/executor --config-file pyproject.toml
mypy vllm/multimodal --config-file pyproject.toml
mypy vllm/usage --config-file pyproject.toml mypy vllm/usage --config-file pyproject.toml
mypy vllm/*.py --config-file pyproject.toml mypy vllm/*.py --config-file pyproject.toml
mypy vllm/transformers_utils --config-file pyproject.toml mypy vllm/transformers_utils --config-file pyproject.toml
......
...@@ -12,6 +12,7 @@ aiohttp ...@@ -12,6 +12,7 @@ aiohttp
openai openai
uvicorn[standard] uvicorn[standard]
pydantic >= 2.0 # Required for OpenAI server. pydantic >= 2.0 # Required for OpenAI server.
pillow # Required for image processing
prometheus_client >= 0.18.0 prometheus_client >= 0.18.0
prometheus-fastapi-instrumentator >= 7.0.0 prometheus-fastapi-instrumentator >= 7.0.0
tiktoken >= 0.6.0 # Required for DBRX tokenizer tiktoken >= 0.6.0 # Required for DBRX tokenizer
......
...@@ -33,8 +33,5 @@ sentence-transformers # required for embedding ...@@ -33,8 +33,5 @@ sentence-transformers # required for embedding
# Benchmarking # Benchmarking
aiohttp aiohttp
# Multimodal
pillow
# quantization # quantization
bitsandbytes==0.42.0 bitsandbytes==0.42.0
...@@ -15,7 +15,9 @@ from vllm.config import TokenizerPoolConfig, VisionLanguageConfig ...@@ -15,7 +15,9 @@ from vllm.config import TokenizerPoolConfig, VisionLanguageConfig
from vllm.distributed import destroy_model_parallel from vllm.distributed import destroy_model_parallel
from vllm.inputs import TextPrompt from vllm.inputs import TextPrompt
from vllm.logger import init_logger from vllm.logger import init_logger
from vllm.sequence import MultiModalData, SampleLogprobs from vllm.multimodal import MultiModalData
from vllm.multimodal.image import ImageFeatureData, ImagePixelData
from vllm.sequence import SampleLogprobs
logger = init_logger(__name__) logger = init_logger(__name__)
...@@ -24,6 +26,7 @@ _TEST_PROMPTS = [os.path.join(_TEST_DIR, "prompts", "example.txt")] ...@@ -24,6 +26,7 @@ _TEST_PROMPTS = [os.path.join(_TEST_DIR, "prompts", "example.txt")]
_LONG_PROMPTS = [os.path.join(_TEST_DIR, "prompts", "summary.txt")] _LONG_PROMPTS = [os.path.join(_TEST_DIR, "prompts", "summary.txt")]
# Multi modal related # Multi modal related
# You can use `.buildkite/download-images.sh` to download the assets
_PIXEL_VALUES_FILES = [ _PIXEL_VALUES_FILES = [
os.path.join(_TEST_DIR, "images", filename) for filename in os.path.join(_TEST_DIR, "images", filename) for filename in
["stop_sign_pixel_values.pt", "cherry_blossom_pixel_values.pt"] ["stop_sign_pixel_values.pt", "cherry_blossom_pixel_values.pt"]
...@@ -89,17 +92,23 @@ def hf_images() -> List[Image.Image]: ...@@ -89,17 +92,23 @@ def hf_images() -> List[Image.Image]:
@pytest.fixture() @pytest.fixture()
def vllm_images(request) -> "torch.Tensor": def vllm_images(request) -> List[MultiModalData]:
vision_language_config = request.getfixturevalue("model_and_config")[1] vision_language_config = request.getfixturevalue("model_and_config")[1]
all_images = []
if vision_language_config.image_input_type == ( if vision_language_config.image_input_type == (
VisionLanguageConfig.ImageInputType.IMAGE_FEATURES): VisionLanguageConfig.ImageInputType.IMAGE_FEATURES):
filenames = _IMAGE_FEATURES_FILES return [
ImageFeatureData(torch.load(filename))
for filename in _IMAGE_FEATURES_FILES
]
else: else:
filenames = _PIXEL_VALUES_FILES return [
for filename in filenames: ImagePixelData(Image.open(filename)) for filename in _IMAGE_FILES
all_images.append(torch.load(filename)) ]
return torch.concat(all_images, dim=0)
@pytest.fixture()
def vllm_image_tensors(request) -> List[torch.Tensor]:
    """Load one tensor per path listed in ``_PIXEL_VALUES_FILES``.

    Returns:
        The objects returned by ``torch.load``, in the same order as the
        file list.
    """
    tensors: List[torch.Tensor] = []
    for path in _PIXEL_VALUES_FILES:
        tensors.append(torch.load(path))
    return tensors
@pytest.fixture() @pytest.fixture()
...@@ -392,23 +401,17 @@ class VllmRunner: ...@@ -392,23 +401,17 @@ class VllmRunner:
self, self,
prompts: List[str], prompts: List[str],
sampling_params: SamplingParams, sampling_params: SamplingParams,
images: Optional[torch.Tensor] = None, images: Optional[List[MultiModalData]] = None,
) -> List[Tuple[List[List[int]], List[str]]]: ) -> List[Tuple[List[List[int]], List[str]]]:
if images is not None: if images is not None:
assert len(prompts) == len(images) assert len(prompts) == len(images)
prompt_inputs: List[TextPrompt] = [] inputs = [TextPrompt(prompt=prompt) for prompt in prompts]
for i, prompt in enumerate(prompts): if images is not None:
prompt = TextPrompt(prompt=prompt) for i, image in enumerate(images):
if images is not None: inputs[i]["multi_modal_data"] = image
prompt["multi_modal_data"] = MultiModalData(
type=MultiModalData.Type.IMAGE,
data=images[i:i + 1],
)
prompt_inputs.append(prompt)
req_outputs = self.model.generate(prompt_inputs, req_outputs = self.model.generate(inputs,
sampling_params=sampling_params) sampling_params=sampling_params)
outputs: List[Tuple[List[List[int]], List[str]]] = [] outputs: List[Tuple[List[List[int]], List[str]]] = []
...@@ -447,7 +450,7 @@ class VllmRunner: ...@@ -447,7 +450,7 @@ class VllmRunner:
self, self,
prompts: List[str], prompts: List[str],
max_tokens: int, max_tokens: int,
images: Optional[torch.Tensor] = None, images: Optional[List[MultiModalData]] = None,
) -> List[Tuple[List[int], str]]: ) -> List[Tuple[List[int], str]]:
greedy_params = SamplingParams(temperature=0.0, max_tokens=max_tokens) greedy_params = SamplingParams(temperature=0.0, max_tokens=max_tokens)
outputs = self.generate(prompts, greedy_params, images=images) outputs = self.generate(prompts, greedy_params, images=images)
......
import gc import gc
from dataclasses import fields from dataclasses import fields
from enum import Enum from enum import Enum
from typing import Dict, List, Tuple from typing import Any, Dict, List, Tuple
import pytest import pytest
import torch import torch
...@@ -9,36 +9,50 @@ from transformers import AutoTokenizer ...@@ -9,36 +9,50 @@ from transformers import AutoTokenizer
from vllm.config import VisionLanguageConfig from vllm.config import VisionLanguageConfig
def iter_llava_configs(model_name: str):
    """Yield ``(model_name, VisionLanguageConfig)`` pairs for ``model_name``.

    For every supported image resolution, two configs are produced: one for
    the ``PIXEL_VALUES`` input type and one for the ``IMAGE_FEATURES`` input
    type. ``model_name`` is also used as the image processor name.

    Args:
        model_name: Model identifier placed in each yielded pair and passed
            as ``image_processor``.
    """
    # Maps image (height, width) to the image feature size.
    feature_size_by_hw = {
        (336, 336): 576,
    }

    for (height, width), feature_size in feature_size_by_hw.items():
        shape_by_input_type = [
            (VisionLanguageConfig.ImageInputType.PIXEL_VALUES,
             (1, 3, height, width)),
            (VisionLanguageConfig.ImageInputType.IMAGE_FEATURES,
             (1, feature_size, 1024)),
        ]
        for input_type, input_shape in shape_by_input_type:
            config = VisionLanguageConfig(
                image_input_type=input_type,
                image_feature_size=feature_size,
                image_token_id=32000,
                image_input_shape=input_shape,
                image_processor=model_name,
                image_processor_revision=None,
            )
            yield model_name, config
model_and_vl_config = [ model_and_vl_config = [
("llava-hf/llava-1.5-7b-hf", *iter_llava_configs("llava-hf/llava-1.5-7b-hf"),
VisionLanguageConfig( # Not enough memory
image_input_type=VisionLanguageConfig.ImageInputType.PIXEL_VALUES, # *iter_llava_configs("llava-hf/llava-1.5-13b-hf"),
image_feature_size=576,
image_token_id=32000,
image_input_shape=(1, 3, 336, 336))),
("llava-hf/llava-1.5-7b-hf",
VisionLanguageConfig(
image_input_type=VisionLanguageConfig.ImageInputType.IMAGE_FEATURES,
image_feature_size=576,
image_token_id=32000,
image_input_shape=(1, 576, 1024)))
] ]
def as_dict(vision_language_config: VisionLanguageConfig) -> Dict: def as_dict(vlm_config: VisionLanguageConfig) -> Dict[str, Any]:
"""Flatten vision language config to pure args. """Flatten vision language config to pure args.
Compatible with what llm entrypoint expects. Compatible with what llm entrypoint expects.
""" """
result = {} result = {}
for field in fields(vision_language_config): for field in fields(vlm_config):
value = getattr(vision_language_config, field.name) value = getattr(vlm_config, field.name)
if isinstance(value, Enum): if isinstance(value, Enum):
result[field.name] = value.name.lower() result[field.name] = value.name.lower()
elif isinstance(value, tuple): elif isinstance(value, tuple):
result[field.name] = ",".join([str(item) for item in value]) result[field.name] = ",".join([str(item) for item in value])
else: else:
result[field.name] = value result[field.name] = value
result["disable_image_processor"] = vlm_config.image_processor is None
return result return result
...@@ -67,18 +81,19 @@ def sanitize_vllm_output(vllm_output: Tuple[List[int], str], ...@@ -67,18 +81,19 @@ def sanitize_vllm_output(vllm_output: Tuple[List[int], str],
@pytest.mark.parametrize("dtype", ["half"]) @pytest.mark.parametrize("dtype", ["half"])
@pytest.mark.parametrize("max_tokens", [128]) @pytest.mark.parametrize("max_tokens", [128])
def test_models(hf_runner, vllm_runner, hf_image_prompts, hf_images, def test_models(hf_runner, vllm_runner, hf_image_prompts, hf_images,
vllm_image_prompts, vllm_images, model_and_config: tuple, vllm_image_prompts, vllm_images, model_and_config, dtype: str,
dtype: str, max_tokens: int, worker_use_ray: bool) -> None: max_tokens: int, worker_use_ray: bool) -> None:
"""Inference result should be the same between hf and vllm. """Inference result should be the same between hf and vllm.
All the image fixtures for the test is under tests/images. All the image fixtures for the test is under tests/images.
For huggingface runner, we provide the raw images as input. For huggingface runner, we provide the PIL images as input.
For vllm runner, we provide image tensors and corresponding For vllm runner, we provide MultiModalData objects and corresponding
vision language config as input. vision language config as input.
Note, the text input is also adjusted to abide by vllm contract. Note, the text input is also adjusted to abide by vllm contract.
The text output is sanitized to be able to compare with hf. The text output is sanitized to be able to compare with hf.
""" """
model_id, vision_language_config = model_and_config model_id, vision_language_config = model_and_config
hf_model = hf_runner(model_id, dtype=dtype) hf_model = hf_runner(model_id, dtype=dtype)
hf_outputs = hf_model.generate_greedy(hf_image_prompts, hf_outputs = hf_model.generate_greedy(hf_image_prompts,
max_tokens, max_tokens,
...@@ -88,6 +103,7 @@ def test_models(hf_runner, vllm_runner, hf_image_prompts, hf_images, ...@@ -88,6 +103,7 @@ def test_models(hf_runner, vllm_runner, hf_image_prompts, hf_images,
vllm_model = vllm_runner(model_id, vllm_model = vllm_runner(model_id,
dtype=dtype, dtype=dtype,
worker_use_ray=worker_use_ray, worker_use_ray=worker_use_ray,
enforce_eager=True,
**as_dict(vision_language_config)) **as_dict(vision_language_config))
vllm_outputs = vllm_model.generate_greedy(vllm_image_prompts, vllm_outputs = vllm_model.generate_greedy(vllm_image_prompts,
max_tokens, max_tokens,
...@@ -105,3 +121,7 @@ def test_models(hf_runner, vllm_runner, hf_image_prompts, hf_images, ...@@ -105,3 +121,7 @@ def test_models(hf_runner, vllm_runner, hf_image_prompts, hf_images,
f"Test{i}:\nHF: {hf_output_str!r}\nvLLM: {vllm_output_str!r}") f"Test{i}:\nHF: {hf_output_str!r}\nvLLM: {vllm_output_str!r}")
assert hf_output_ids == vllm_output_ids, ( assert hf_output_ids == vllm_output_ids, (
f"Test{i}:\nHF: {hf_output_ids}\nvLLM: {vllm_output_ids}") f"Test{i}:\nHF: {hf_output_ids}\nvLLM: {vllm_output_ids}")
# TODO: Add test for `tensor_parallel_size` [ref: PR #3883]
# (Requires multiple GPUs)
import numpy as np
import pytest
from transformers import CLIPImageProcessor
from vllm.config import ModelConfig, VisionLanguageConfig
from vllm.multimodal import MULTIMODAL_REGISTRY
from vllm.multimodal.image import ImagePixelData
@pytest.mark.parametrize("dtype", ["half", "bfloat16", "float"])
def test_clip_image_processor(hf_images, dtype):
    """Check vLLM's image processing against HF's ``CLIPImageProcessor``.

    Each image in ``hf_images`` is preprocessed by the HuggingFace processor
    and by ``MULTIMODAL_REGISTRY.process_input``. Both outputs must expose
    the same keys and, per key, arrays of equal shape with close values.
    """
    model_name = "llava-hf/llava-1.5-7b-hf"
    image_height = image_width = 33

    hf_processor = CLIPImageProcessor.from_pretrained(model_name)
    assert isinstance(hf_processor, CLIPImageProcessor)

    model_config = ModelConfig(model=model_name,
                               tokenizer=model_name,
                               tokenizer_mode="auto",
                               trust_remote_code=False,
                               seed=0,
                               dtype=dtype,
                               revision=None)
    pixel_values_type = VisionLanguageConfig.ImageInputType.PIXEL_VALUES
    vlm_config = VisionLanguageConfig(
        image_input_type=pixel_values_type,
        image_token_id=32000,
        image_input_shape=(1, 3, image_height, image_width),
        image_feature_size=576,
        image_processor=model_name,
        image_processor_revision=None,
    )

    for image in hf_images:
        expected = hf_processor.preprocess(image, return_tensors="np")
        actual = MULTIMODAL_REGISTRY.process_input(
            ImagePixelData(image),
            model_config=model_config,
            vlm_config=vlm_config,
        )

        assert expected.keys() == actual.keys()
        for key, expected_arr in expected.items():
            actual_arr: np.ndarray = actual[key].numpy()
            assert expected_arr.shape == actual_arr.shape, (
                f"Failed for key={key}")
            assert np.allclose(expected_arr, actual_arr), (
                f"Failed for key={key}")
@pytest.mark.parametrize("dtype", ["float"])
def test_image_pixel_types(hf_images, vllm_image_tensors, dtype):
    """Compare processing an image with processing its paired tensor.

    Images from ``hf_images`` are zipped with ``vllm_image_tensors``; both
    members of each pair are wrapped in ``ImagePixelData`` and passed to
    ``MULTIMODAL_REGISTRY.process_input``. The two outputs must have the
    same keys and, per key, the same array shape. Values are not compared
    (see the note at the end of the loop body).
    """
    model_name = "llava-hf/llava-1.5-7b-hf"
    image_height = image_width = 33

    model_config = ModelConfig(model=model_name,
                               tokenizer=model_name,
                               tokenizer_mode="auto",
                               trust_remote_code=False,
                               seed=0,
                               dtype=dtype,
                               revision=None)
    pixel_values_type = VisionLanguageConfig.ImageInputType.PIXEL_VALUES
    vlm_config = VisionLanguageConfig(
        image_input_type=pixel_values_type,
        image_token_id=32000,
        image_input_shape=(1, 3, image_height, image_width),
        image_feature_size=576,
        image_processor=model_name,
        image_processor_revision=None,
    )

    def run_processor(pixel_data):
        # Same configs for both input kinds; only the wrapped data differs.
        return MULTIMODAL_REGISTRY.process_input(
            pixel_data,
            model_config=model_config,
            vlm_config=vlm_config,
        )

    for image, tensor in zip(hf_images, vllm_image_tensors):
        image_result = run_processor(ImagePixelData(image))
        tensor_result = run_processor(ImagePixelData(tensor))

        assert image_result.keys() == tensor_result.keys()
        for key, image_arr in image_result.items():
            tensor_arr: np.ndarray = tensor_result[key].numpy()
            assert image_arr.shape == tensor_arr.shape, (
                f"Failed for key={key}")

            # The examples in PR#3042 have slightly different preprocessing
            # from HuggingFace's LlavaProcessor, causing the test to fail.
            # assert np.allclose(image_arr, tensor_arr), f"Failed for key={key}"
...@@ -18,9 +18,10 @@ from vllm.engine.arg_utils import AsyncEngineArgs ...@@ -18,9 +18,10 @@ from vllm.engine.arg_utils import AsyncEngineArgs
from vllm.engine.async_llm_engine import AsyncLLMEngine from vllm.engine.async_llm_engine import AsyncLLMEngine
from vllm.lora.request import LoRARequest from vllm.lora.request import LoRARequest
from vllm.model_executor.utils import set_random_seed from vllm.model_executor.utils import set_random_seed
from vllm.multimodal import MultiModalData
from vllm.outputs import RequestOutput from vllm.outputs import RequestOutput
from vllm.sampling_params import SamplingParams from vllm.sampling_params import SamplingParams
from vllm.sequence import Logprob, MultiModalData from vllm.sequence import Logprob
from vllm.usage.usage_lib import UsageContext from vllm.usage.usage_lib import UsageContext
from vllm.utils import Counter, random_uuid from vllm.utils import Counter, random_uuid
......
import pytest
from transformers.image_processing_utils import BaseImageProcessor
from vllm.transformers_utils.image_processor import get_image_processor
IMAGE_PROCESSOR_NAMES = [
"llava-hf/llava-1.5-7b-hf",
"llava-hf/llava-v1.6-34b-hf",
]
@pytest.mark.parametrize("processor_name", IMAGE_PROCESSOR_NAMES)
def test_image_processor_revision(processor_name: str):
    """Check ``get_image_processor`` with a valid and an invalid revision.

    Loading with ``revision="main"`` must return a ``BaseImageProcessor``;
    loading with ``revision="never"`` must raise ``OSError``.
    """
    # The "main" branch is assumed to exist for every listed processor.
    processor = get_image_processor(processor_name, revision="main")
    assert isinstance(processor, BaseImageProcessor)

    # The "never" branch is assumed not to exist, so loading must fail.
    expected_error = 'not a valid git identifier'
    with pytest.raises(OSError, match=expected_error):
        get_image_processor(processor_name, revision="never")
...@@ -1094,10 +1094,12 @@ class VisionLanguageConfig: ...@@ -1094,10 +1094,12 @@ class VisionLanguageConfig:
# worst case scenario (biggest supported resolution). # worst case scenario (biggest supported resolution).
image_input_shape: tuple image_input_shape: tuple
image_feature_size: int image_feature_size: int
# The image processor to load from HuggingFace
image_processor: Optional[str]
image_processor_revision: Optional[str]
@classmethod @classmethod
def get_image_input_enum_type( def get_image_input_enum_type(cls, value: str) -> ImageInputType:
cls, value: str) -> "VisionLanguageConfig.ImageInputType":
"""Get the image input type from a string.""" """Get the image input type from a string."""
try: try:
return cls.ImageInputType[value.upper()] return cls.ImageInputType[value.upper()]
......
import argparse import argparse
import dataclasses import dataclasses
import json import json
import warnings
from dataclasses import dataclass from dataclasses import dataclass
from typing import List, Optional, Tuple, Union from typing import List, Optional, Tuple, Union
...@@ -80,6 +81,10 @@ class EngineArgs: ...@@ -80,6 +81,10 @@ class EngineArgs:
image_token_id: Optional[int] = None image_token_id: Optional[int] = None
image_input_shape: Optional[str] = None image_input_shape: Optional[str] = None
image_feature_size: Optional[int] = None image_feature_size: Optional[int] = None
image_processor: Optional[str] = None
image_processor_revision: Optional[str] = None
disable_image_processor: bool = False
scheduler_delay_factor: float = 0.0 scheduler_delay_factor: float = 0.0
enable_chunked_prefill: bool = False enable_chunked_prefill: bool = False
...@@ -98,6 +103,53 @@ class EngineArgs: ...@@ -98,6 +103,53 @@ class EngineArgs:
if self.tokenizer is None: if self.tokenizer is None:
self.tokenizer = self.model self.tokenizer = self.model
@staticmethod
def add_cli_args_for_vlm(
        parser: argparse.ArgumentParser) -> argparse.ArgumentParser:
    """Register the vision-language-model CLI options on ``parser``.

    Args:
        parser: The argument parser to extend in place.

    Returns:
        The same ``parser`` instance that was passed in.
    """
    # Valid choices are the lowercased member names of the input type enum.
    input_type_choices = [
        input_type.name.lower()
        for input_type in VisionLanguageConfig.ImageInputType
    ]
    parser.add_argument(
        '--image-input-type',
        type=nullable_str,
        default=None,
        choices=input_type_choices,
        help='The image input type passed into vLLM.',
    )
    parser.add_argument(
        '--image-token-id',
        type=int,
        default=None,
        help='Input id for image token.',
    )
    parser.add_argument(
        '--image-input-shape',
        type=nullable_str,
        default=None,
        help=('The biggest image input shape (worst for memory footprint) '
              'given an input type. Only used for vLLM\'s profile_run.'),
    )
    parser.add_argument(
        '--image-feature-size',
        type=int,
        default=None,
        help='The image feature size along the context dimension.',
    )
    parser.add_argument(
        '--image-processor',
        type=str,
        default=EngineArgs.image_processor,
        help=('Name or path of the huggingface image processor to use. '
              'If unspecified, model name or path will be used.'),
    )
    parser.add_argument(
        '--image-processor-revision',
        type=str,
        default=None,
        help=('Revision of the huggingface image processor version to use. '
              'It can be a branch name, a tag name, or a commit id. '
              'If unspecified, will use the default version.'),
    )
    parser.add_argument(
        '--disable-image-processor',
        action='store_true',
        help=('Disables the use of image processor, even if one is defined '
              'for the model on huggingface.'),
    )

    return parser
@staticmethod @staticmethod
def add_cli_args( def add_cli_args(
parser: argparse.ArgumentParser) -> argparse.ArgumentParser: parser: argparse.ArgumentParser) -> argparse.ArgumentParser:
...@@ -113,7 +165,8 @@ class EngineArgs: ...@@ -113,7 +165,8 @@ class EngineArgs:
'--tokenizer', '--tokenizer',
type=nullable_str, type=nullable_str,
default=EngineArgs.tokenizer, default=EngineArgs.tokenizer,
help='Name or path of the huggingface tokenizer to use.') help='Name or path of the huggingface tokenizer to use. '
'If unspecified, model name or path will be used.')
parser.add_argument( parser.add_argument(
'--skip-tokenizer-init', '--skip-tokenizer-init',
action='store_true', action='store_true',
...@@ -136,9 +189,9 @@ class EngineArgs: ...@@ -136,9 +189,9 @@ class EngineArgs:
'--tokenizer-revision', '--tokenizer-revision',
type=nullable_str, type=nullable_str,
default=None, default=None,
help='The specific tokenizer version to use. It can be a branch ' help='Revision of the huggingface tokenizer to use. '
'name, a tag name, or a commit id. If unspecified, will use ' 'It can be a branch name, a tag name, or a commit id. '
'the default version.') 'If unspecified, will use the default version.')
parser.add_argument( parser.add_argument(
'--tokenizer-mode', '--tokenizer-mode',
type=str, type=str,
...@@ -445,31 +498,10 @@ class EngineArgs: ...@@ -445,31 +498,10 @@ class EngineArgs:
default=EngineArgs.device, default=EngineArgs.device,
choices=["auto", "cuda", "neuron", "cpu"], choices=["auto", "cuda", "neuron", "cpu"],
help='Device type for vLLM execution.') help='Device type for vLLM execution.')
# Related to Vision-language models such as llava # Related to Vision-language models such as llava
parser.add_argument( parser = EngineArgs.add_cli_args_for_vlm(parser)
'--image-input-type',
type=nullable_str,
default=None,
choices=[
t.name.lower() for t in VisionLanguageConfig.ImageInputType
],
help=('The image input type passed into vLLM. '
'Should be one of "pixel_values" or "image_features".'))
parser.add_argument('--image-token-id',
type=int,
default=None,
help=('Input id for image token.'))
parser.add_argument(
'--image-input-shape',
type=nullable_str,
default=None,
help=('The biggest image input shape (worst for memory footprint) '
'given an input type. Only used for vLLM\'s profile_run.'))
parser.add_argument(
'--image-feature-size',
type=int,
default=None,
help=('The image feature size along the context dimension.'))
parser.add_argument( parser.add_argument(
'--scheduler-delay-factor', '--scheduler-delay-factor',
type=float, type=float,
...@@ -488,7 +520,6 @@ class EngineArgs: ...@@ -488,7 +520,6 @@ class EngineArgs:
default=EngineArgs.speculative_model, default=EngineArgs.speculative_model,
help= help=
'The name of the draft model to be used in speculative decoding.') 'The name of the draft model to be used in speculative decoding.')
parser.add_argument( parser.add_argument(
'--num-speculative-tokens', '--num-speculative-tokens',
type=int, type=int,
...@@ -666,12 +697,27 @@ class EngineArgs: ...@@ -666,12 +697,27 @@ class EngineArgs:
raise ValueError( raise ValueError(
'Specify `image_token_id`, `image_input_shape` and ' 'Specify `image_token_id`, `image_input_shape` and '
'`image_feature_size` together with `image_input_type`.') '`image_feature_size` together with `image_input_type`.')
if self.image_processor is None:
self.image_processor = self.model
if self.disable_image_processor:
if self.image_processor != self.model:
warnings.warn(
"You've specified an image processor "
f"({self.image_processor}) but also disabled "
"it via `--disable-image-processor`.",
stacklevel=2)
self.image_processor = None
vision_language_config = VisionLanguageConfig( vision_language_config = VisionLanguageConfig(
image_input_type=VisionLanguageConfig. image_input_type=VisionLanguageConfig.
get_image_input_enum_type(self.image_input_type), get_image_input_enum_type(self.image_input_type),
image_token_id=self.image_token_id, image_token_id=self.image_token_id,
image_input_shape=str_to_int_tuple(self.image_input_shape), image_input_shape=str_to_int_tuple(self.image_input_shape),
image_feature_size=self.image_feature_size, image_feature_size=self.image_feature_size,
image_processor=self.image_processor,
image_processor_revision=self.image_processor_revision,
) )
else: else:
vision_language_config = None vision_language_config = None
...@@ -734,3 +780,7 @@ def _engine_args_parser(): ...@@ -734,3 +780,7 @@ def _engine_args_parser():
def _async_engine_args_parser(): def _async_engine_args_parser():
return AsyncEngineArgs.add_cli_args(argparse.ArgumentParser(), return AsyncEngineArgs.add_cli_args(argparse.ArgumentParser(),
async_args_only=True) async_args_only=True)
def _vlm_engine_args_parser():
return EngineArgs.add_cli_args_for_vlm(argparse.ArgumentParser())
...@@ -14,7 +14,6 @@ from vllm.lora.request import LoRARequest ...@@ -14,7 +14,6 @@ from vllm.lora.request import LoRARequest
from vllm.outputs import EmbeddingRequestOutput, RequestOutput from vllm.outputs import EmbeddingRequestOutput, RequestOutput
from vllm.pooling_params import PoolingParams from vllm.pooling_params import PoolingParams
from vllm.sampling_params import SamplingParams from vllm.sampling_params import SamplingParams
from vllm.sequence import MultiModalData
from vllm.usage.usage_lib import UsageContext from vllm.usage.usage_lib import UsageContext
from vllm.utils import Counter, deprecate_kwargs from vllm.utils import Counter, deprecate_kwargs
...@@ -164,7 +163,6 @@ class LLM: ...@@ -164,7 +163,6 @@ class LLM:
prompt_token_ids: Optional[List[int]] = None, prompt_token_ids: Optional[List[int]] = None,
use_tqdm: bool = True, use_tqdm: bool = True,
lora_request: Optional[LoRARequest] = None, lora_request: Optional[LoRARequest] = None,
multi_modal_data: Optional[MultiModalData] = None,
) -> List[RequestOutput]: ) -> List[RequestOutput]:
... ...
...@@ -177,7 +175,6 @@ class LLM: ...@@ -177,7 +175,6 @@ class LLM:
prompt_token_ids: Optional[List[List[int]]] = None, prompt_token_ids: Optional[List[List[int]]] = None,
use_tqdm: bool = True, use_tqdm: bool = True,
lora_request: Optional[LoRARequest] = None, lora_request: Optional[LoRARequest] = None,
multi_modal_data: Optional[MultiModalData] = None,
) -> List[RequestOutput]: ) -> List[RequestOutput]:
... ...
...@@ -191,7 +188,6 @@ class LLM: ...@@ -191,7 +188,6 @@ class LLM:
prompt_token_ids: List[int], prompt_token_ids: List[int],
use_tqdm: bool = True, use_tqdm: bool = True,
lora_request: Optional[LoRARequest] = None, lora_request: Optional[LoRARequest] = None,
multi_modal_data: Optional[MultiModalData] = None,
) -> List[RequestOutput]: ) -> List[RequestOutput]:
... ...
...@@ -205,7 +201,6 @@ class LLM: ...@@ -205,7 +201,6 @@ class LLM:
prompt_token_ids: List[List[int]], prompt_token_ids: List[List[int]],
use_tqdm: bool = True, use_tqdm: bool = True,
lora_request: Optional[LoRARequest] = None, lora_request: Optional[LoRARequest] = None,
multi_modal_data: Optional[MultiModalData] = None,
) -> List[RequestOutput]: ) -> List[RequestOutput]:
... ...
...@@ -217,7 +212,6 @@ class LLM: ...@@ -217,7 +212,6 @@ class LLM:
prompt_token_ids: Union[List[int], List[List[int]]], prompt_token_ids: Union[List[int], List[List[int]]],
use_tqdm: bool = True, use_tqdm: bool = True,
lora_request: Optional[LoRARequest] = None, lora_request: Optional[LoRARequest] = None,
multi_modal_data: Optional[MultiModalData] = None,
) -> List[RequestOutput]: ) -> List[RequestOutput]:
... ...
...@@ -236,7 +230,6 @@ class LLM: ...@@ -236,7 +230,6 @@ class LLM:
@deprecate_kwargs("prompts", @deprecate_kwargs("prompts",
"prompt_token_ids", "prompt_token_ids",
"multi_modal_data",
is_deprecated=lambda: LLM.DEPRECATE_LEGACY, is_deprecated=lambda: LLM.DEPRECATE_LEGACY,
additional_message="Please use the 'inputs' parameter " additional_message="Please use the 'inputs' parameter "
"instead.") "instead.")
...@@ -249,7 +242,6 @@ class LLM: ...@@ -249,7 +242,6 @@ class LLM:
prompt_token_ids: Optional[Union[List[int], List[List[int]]]] = None, prompt_token_ids: Optional[Union[List[int], List[List[int]]]] = None,
use_tqdm: bool = True, use_tqdm: bool = True,
lora_request: Optional[LoRARequest] = None, lora_request: Optional[LoRARequest] = None,
multi_modal_data: Optional[MultiModalData] = None,
) -> List[RequestOutput]: ) -> List[RequestOutput]:
"""Generates the completions for the input prompts. """Generates the completions for the input prompts.
...@@ -281,11 +273,10 @@ class LLM: ...@@ -281,11 +273,10 @@ class LLM:
"LLM.generate() is only supported for generation models " "LLM.generate() is only supported for generation models "
"(XForCausalLM).") "(XForCausalLM).")
if prompt_token_ids is not None or multi_modal_data is not None: if prompt_token_ids is not None:
inputs = self._convert_v1_inputs( inputs = self._convert_v1_inputs(
prompts=cast(Optional[Union[str, List[str]]], prompts), prompts=cast(Optional[Union[str, List[str]]], prompts),
prompt_token_ids=prompt_token_ids, prompt_token_ids=prompt_token_ids,
multi_modal_data=multi_modal_data,
) )
else: else:
inputs = cast( inputs = cast(
...@@ -314,7 +305,6 @@ class LLM: ...@@ -314,7 +305,6 @@ class LLM:
prompt_token_ids: Optional[List[int]] = None, prompt_token_ids: Optional[List[int]] = None,
use_tqdm: bool = True, use_tqdm: bool = True,
lora_request: Optional[LoRARequest] = None, lora_request: Optional[LoRARequest] = None,
multi_modal_data: Optional[MultiModalData] = None,
) -> List[EmbeddingRequestOutput]: ) -> List[EmbeddingRequestOutput]:
... ...
...@@ -327,7 +317,6 @@ class LLM: ...@@ -327,7 +317,6 @@ class LLM:
prompt_token_ids: Optional[List[List[int]]] = None, prompt_token_ids: Optional[List[List[int]]] = None,
use_tqdm: bool = True, use_tqdm: bool = True,
lora_request: Optional[LoRARequest] = None, lora_request: Optional[LoRARequest] = None,
multi_modal_data: Optional[MultiModalData] = None,
) -> List[EmbeddingRequestOutput]: ) -> List[EmbeddingRequestOutput]:
... ...
...@@ -341,7 +330,6 @@ class LLM: ...@@ -341,7 +330,6 @@ class LLM:
prompt_token_ids: List[int], prompt_token_ids: List[int],
use_tqdm: bool = True, use_tqdm: bool = True,
lora_request: Optional[LoRARequest] = None, lora_request: Optional[LoRARequest] = None,
multi_modal_data: Optional[MultiModalData] = None,
) -> List[EmbeddingRequestOutput]: ) -> List[EmbeddingRequestOutput]:
... ...
...@@ -355,7 +343,6 @@ class LLM: ...@@ -355,7 +343,6 @@ class LLM:
prompt_token_ids: List[List[int]], prompt_token_ids: List[List[int]],
use_tqdm: bool = True, use_tqdm: bool = True,
lora_request: Optional[LoRARequest] = None, lora_request: Optional[LoRARequest] = None,
multi_modal_data: Optional[MultiModalData] = None,
) -> List[EmbeddingRequestOutput]: ) -> List[EmbeddingRequestOutput]:
... ...
...@@ -367,7 +354,6 @@ class LLM: ...@@ -367,7 +354,6 @@ class LLM:
prompt_token_ids: Union[List[int], List[List[int]]], prompt_token_ids: Union[List[int], List[List[int]]],
use_tqdm: bool = True, use_tqdm: bool = True,
lora_request: Optional[LoRARequest] = None, lora_request: Optional[LoRARequest] = None,
multi_modal_data: Optional[MultiModalData] = None,
) -> List[EmbeddingRequestOutput]: ) -> List[EmbeddingRequestOutput]:
... ...
...@@ -386,7 +372,6 @@ class LLM: ...@@ -386,7 +372,6 @@ class LLM:
@deprecate_kwargs("prompts", @deprecate_kwargs("prompts",
"prompt_token_ids", "prompt_token_ids",
"multi_modal_data",
is_deprecated=lambda: LLM.DEPRECATE_LEGACY, is_deprecated=lambda: LLM.DEPRECATE_LEGACY,
additional_message="Please use the 'inputs' parameter " additional_message="Please use the 'inputs' parameter "
"instead.") "instead.")
...@@ -399,7 +384,6 @@ class LLM: ...@@ -399,7 +384,6 @@ class LLM:
prompt_token_ids: Optional[Union[List[int], List[List[int]]]] = None, prompt_token_ids: Optional[Union[List[int], List[List[int]]]] = None,
use_tqdm: bool = True, use_tqdm: bool = True,
lora_request: Optional[LoRARequest] = None, lora_request: Optional[LoRARequest] = None,
multi_modal_data: Optional[MultiModalData] = None,
) -> List[EmbeddingRequestOutput]: ) -> List[EmbeddingRequestOutput]:
"""Generates the completions for the input prompts. """Generates the completions for the input prompts.
...@@ -430,11 +414,10 @@ class LLM: ...@@ -430,11 +414,10 @@ class LLM:
"LLM.encode() is only supported for embedding models (XModel)." "LLM.encode() is only supported for embedding models (XModel)."
) )
if prompt_token_ids is not None or multi_modal_data is not None: if prompt_token_ids is not None:
inputs = self._convert_v1_inputs( inputs = self._convert_v1_inputs(
prompts=cast(Optional[Union[str, List[str]]], prompts), prompts=cast(Optional[Union[str, List[str]]], prompts),
prompt_token_ids=prompt_token_ids, prompt_token_ids=prompt_token_ids,
multi_modal_data=multi_modal_data,
) )
else: else:
inputs = cast( inputs = cast(
...@@ -459,7 +442,6 @@ class LLM: ...@@ -459,7 +442,6 @@ class LLM:
self, self,
prompts: Optional[Union[str, List[str]]], prompts: Optional[Union[str, List[str]]],
prompt_token_ids: Optional[Union[List[int], List[List[int]]]], prompt_token_ids: Optional[Union[List[int], List[List[int]]]],
multi_modal_data: Optional[MultiModalData],
): ):
# skip_tokenizer_init is now checked in engine # skip_tokenizer_init is now checked in engine
...@@ -499,9 +481,6 @@ class LLM: ...@@ -499,9 +481,6 @@ class LLM:
else: else:
raise AssertionError raise AssertionError
if multi_modal_data is not None:
item["multi_modal_data"] = multi_modal_data
inputs.append(item) inputs.append(item)
return inputs return inputs
......
...@@ -17,6 +17,8 @@ from vllm.model_executor.layers.vocab_parallel_embedding import ParallelLMHead ...@@ -17,6 +17,8 @@ from vllm.model_executor.layers.vocab_parallel_embedding import ParallelLMHead
from vllm.model_executor.model_loader.weight_utils import default_weight_loader from vllm.model_executor.model_loader.weight_utils import default_weight_loader
from vllm.model_executor.models.llama import LlamaModel from vllm.model_executor.models.llama import LlamaModel
from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.model_executor.sampling_metadata import SamplingMetadata
from vllm.multimodal import MULTIMODAL_REGISTRY
from vllm.multimodal.image import get_dummy_image_data
from vllm.sequence import SamplerOutput from vllm.sequence import SamplerOutput
from .vlm_base import VisionLanguageModelBase from .vlm_base import VisionLanguageModelBase
...@@ -82,6 +84,9 @@ class LlavaImageFeatureInputs(TypedDict): ...@@ -82,6 +84,9 @@ class LlavaImageFeatureInputs(TypedDict):
LlavaImageInputs = Union[LlavaImagePixelInputs, LlavaImageFeatureInputs] LlavaImageInputs = Union[LlavaImagePixelInputs, LlavaImageFeatureInputs]
@MULTIMODAL_REGISTRY.register_image_feature_input()
@MULTIMODAL_REGISTRY.register_image_pixel_input()
@MULTIMODAL_REGISTRY.register_dummy_data(get_dummy_image_data)
class LlavaForConditionalGeneration(VisionLanguageModelBase): class LlavaForConditionalGeneration(VisionLanguageModelBase):
def __init__(self, def __init__(self,
...@@ -131,30 +136,41 @@ class LlavaForConditionalGeneration(VisionLanguageModelBase): ...@@ -131,30 +136,41 @@ class LlavaForConditionalGeneration(VisionLanguageModelBase):
return data return data
def _parse_and_validate_image_input( def _parse_and_validate_image_input(
self, data: object) -> Optional[LlavaImageInputs]: self, **kwargs: object) -> Optional[LlavaImageInputs]:
pixel_values = kwargs.pop("pixel_values", None)
image_features = kwargs.pop("image_features", None)
expected_input_type = self.vision_language_config.image_input_type expected_input_type = self.vision_language_config.image_input_type
ImageInputType = VisionLanguageConfig.ImageInputType ImageInputType = VisionLanguageConfig.ImageInputType
if data is None:
return None
if expected_input_type == ImageInputType.PIXEL_VALUES: if expected_input_type == ImageInputType.PIXEL_VALUES:
if not isinstance(data, torch.Tensor): if image_features is not None:
raise TypeError("Image pixel vector should be a tensor, " raise ValueError(
f"but received type: {type(data)}") "Expected pixel values but got image features")
if pixel_values is None:
return None
if not isinstance(pixel_values, torch.Tensor):
raise ValueError("Incorrect type of pixel values")
return LlavaImagePixelInputs( return LlavaImagePixelInputs(
type="pixel_values", type="pixel_values",
data=self._validate_image_data(data), data=self._validate_image_data(pixel_values),
) )
elif expected_input_type == ImageInputType.IMAGE_FEATURES:
if not isinstance(data, torch.Tensor): if expected_input_type == ImageInputType.IMAGE_FEATURES:
raise TypeError("Image feature vector should be a tensor, " if pixel_values is not None:
f"but received type: {type(data)}") raise ValueError(
"Expected image features but got pixel values")
if image_features is None:
return None
if not isinstance(image_features, torch.Tensor):
raise ValueError("Incorrect type of image features")
return LlavaImageFeatureInputs( return LlavaImageFeatureInputs(
type="image_features", type="image_features",
data=self._validate_image_data(data), data=self._validate_image_data(image_features),
) )
return None return None
...@@ -201,12 +217,14 @@ class LlavaForConditionalGeneration(VisionLanguageModelBase): ...@@ -201,12 +217,14 @@ class LlavaForConditionalGeneration(VisionLanguageModelBase):
return self.multi_modal_projector(image_features) return self.multi_modal_projector(image_features)
def forward(self, def forward(
input_ids: torch.Tensor, self,
positions: torch.Tensor, input_ids: torch.Tensor,
kv_caches: List[torch.Tensor], positions: torch.Tensor,
attn_metadata: AttentionMetadata, kv_caches: List[torch.Tensor],
image_input: Optional[torch.Tensor] = None) -> SamplerOutput: attn_metadata: AttentionMetadata,
**kwargs: object,
) -> SamplerOutput:
"""Run forward pass for Llava 1.5. """Run forward pass for Llava 1.5.
One key thing to understand is the `input_ids` already accounts for the One key thing to understand is the `input_ids` already accounts for the
...@@ -227,10 +245,10 @@ class LlavaForConditionalGeneration(VisionLanguageModelBase): ...@@ -227,10 +245,10 @@ class LlavaForConditionalGeneration(VisionLanguageModelBase):
This way, the `positions` and `attn_metadata` are consistent This way, the `positions` and `attn_metadata` are consistent
with the `input_ids`. with the `input_ids`.
The model takes two types of image inputs: The model takes two types of image inputs:
PIXEL_VALUES and IMAGE_FEATURES. PIXEL_VALUES and IMAGE_FEATURES.
The following shows how each maps to huggingface implementation. The following shows how each maps to huggingface implementation.
PIXEL_VALUES: PIXEL_VALUES:
- https://github.com/huggingface/transformers/blob/07bdbeb/src/transformers/models/llava/modeling_llava.py#L353 - https://github.com/huggingface/transformers/blob/07bdbeb/src/transformers/models/llava/modeling_llava.py#L353
IMAGE_FEATURES: IMAGE_FEATURES:
- https://github.com/huggingface/transformers/blob/07bdbeb/src/transformers/models/llava/modeling_llava.py#L430 - https://github.com/huggingface/transformers/blob/07bdbeb/src/transformers/models/llava/modeling_llava.py#L430
...@@ -239,14 +257,15 @@ class LlavaForConditionalGeneration(VisionLanguageModelBase): ...@@ -239,14 +257,15 @@ class LlavaForConditionalGeneration(VisionLanguageModelBase):
Args: Args:
input_ids: Flattened (concatenated) input_ids corresponding to a input_ids: Flattened (concatenated) input_ids corresponding to a
batch. batch.
image_input: A batch of image inputs. pixel_values: For PIXEL_VALUES, expects a batch with shape
For PIXEL_VALUES, expecting [1, 3, 336, 336]. [1, 3, 336, 336].
For IMAGE_FEATURES, expecting [1, 576, 1024]. image_features: For IMAGE_FEATURES, expects a batch with shape
[1, 576, 1024].
""" """
parsed_image_input = self._parse_and_validate_image_input(image_input) image_input = self._parse_and_validate_image_input(**kwargs)
if parsed_image_input is not None: if image_input is not None:
vision_embeddings = self._process_image_input(parsed_image_input) vision_embeddings = self._process_image_input(image_input)
inputs_embeds = self.language_model.get_input_embeddings(input_ids) inputs_embeds = self.language_model.get_input_embeddings(input_ids)
inputs_embeds = _merge_vision_embeddings( inputs_embeds = _merge_vision_embeddings(
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment