Unverified Commit 9e169a4c authored by Alphi's avatar Alphi Committed by GitHub
Browse files

[Model] Adding support for MiniCPM-V (#4087)

parent 5689e256
...@@ -40,6 +40,8 @@ Registry ...@@ -40,6 +40,8 @@ Registry
Base Classes Base Classes
------------ ------------
.. autodata:: vllm.multimodal.NestedTensors
.. autodata:: vllm.multimodal.BatchedTensors .. autodata:: vllm.multimodal.BatchedTensors
.. autoclass:: vllm.multimodal.MultiModalDataBuiltins .. autoclass:: vllm.multimodal.MultiModalDataBuiltins
......
...@@ -206,6 +206,10 @@ Vision Language Models ...@@ -206,6 +206,10 @@ Vision Language Models
- Phi-3-Vision - Phi-3-Vision
- :code:`microsoft/Phi-3-vision-128k-instruct`, etc. - :code:`microsoft/Phi-3-vision-128k-instruct`, etc.
- -
* - :code:`MiniCPM-V`
- MiniCPM-V
- :code:`openbmb/MiniCPM-V-2`, :code:`openbmb/MiniCPM-Llama3-V-2_5`, etc.
-
If your model uses one of the above model architectures, you can seamlessly run your model with vLLM. If your model uses one of the above model architectures, you can seamlessly run your model with vLLM.
Otherwise, please refer to :ref:`Adding a New Model <adding_a_new_model>` and :ref:`Enabling Multimodal Inputs <enabling_multimodal_inputs>` Otherwise, please refer to :ref:`Adding a New Model <adding_a_new_model>` and :ref:`Enabling Multimodal Inputs <enabling_multimodal_inputs>`
......
from transformers import AutoTokenizer
from vllm import LLM, SamplingParams
from vllm.assets.image import ImageAsset
# Checkpoint selection.
# 2.0
# MODEL_NAME = "HwwwH/MiniCPM-V-2"
# 2.5
MODEL_NAME = "openbmb/MiniCPM-Llama3-V-2_5"

# Bundled "stop_sign" asset, converted to RGB before it is sent to the model.
image = ImageAsset("stop_sign").pil_image.convert("RGB")

# The tokenizer comes from the checkpoint's own code, hence
# trust_remote_code=True.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, trust_remote_code=True)

llm = LLM(
    model=MODEL_NAME,
    gpu_memory_utilization=1,
    trust_remote_code=True,
    max_model_len=4096,
)

# Single user turn: the image placeholder followed by the question.
image_placeholder = '(<image>./</image>)\n'
question = "What's the content of the image?"
messages = [{'role': 'user', 'content': image_placeholder + question}]

# Render the conversation to a plain string (no tokenization) that ends with
# the generation prompt.
prompt = tokenizer.apply_chat_template(
    messages,
    tokenize=False,
    add_generation_prompt=True,
)

# Stop tokens depend on the checkpoint.
# 2.0
# stop_token_ids = [tokenizer.eos_id]
# 2.5
stop_token_ids = [tokenizer.eos_id, tokenizer.eot_id]

# Beam search is enabled with best_of=3 and temperature=0.
# Other knobs the author left for experimentation:
#   temperature=0.7, top_p=0.8, top_k=100, seed=3472,
#   min_tokens=150, length_penalty=1.2
sampling_params = SamplingParams(
    stop_token_ids=stop_token_ids,
    max_tokens=1024,
    temperature=0,
    use_beam_search=True,
    best_of=3,
)

# One request: the rendered prompt plus the image under "multi_modal_data".
request = {
    "prompt": prompt,
    "multi_modal_data": {
        "image": image,
    },
}
outputs = llm.generate(request, sampling_params=sampling_params)

# Print the text of the first candidate of the first (only) request.
print(outputs[0].outputs[0].text)
...@@ -11,7 +11,7 @@ import torch.nn as nn ...@@ -11,7 +11,7 @@ import torch.nn as nn
import torch.nn.functional as F import torch.nn.functional as F
from PIL import Image from PIL import Image
from transformers import (AutoModelForCausalLM, AutoModelForVision2Seq, from transformers import (AutoModelForCausalLM, AutoModelForVision2Seq,
AutoTokenizer, BatchEncoding) AutoTokenizer, BatchEncoding, BatchFeature)
from vllm import LLM, SamplingParams from vllm import LLM, SamplingParams
from vllm.assets.image import ImageAsset from vllm.assets.image import ImageAsset
...@@ -133,7 +133,7 @@ def image_assets() -> _ImageAssets: ...@@ -133,7 +133,7 @@ def image_assets() -> _ImageAssets:
return IMAGE_ASSETS return IMAGE_ASSETS
_T = TypeVar("_T", nn.Module, torch.Tensor, BatchEncoding) _T = TypeVar("_T", nn.Module, torch.Tensor, BatchEncoding, BatchFeature)
class HfRunner: class HfRunner:
...@@ -339,7 +339,6 @@ class HfRunner: ...@@ -339,7 +339,6 @@ class HfRunner:
processor_kwargs["images"] = images[i] processor_kwargs["images"] = images[i]
inputs = self.processor(**processor_kwargs) inputs = self.processor(**processor_kwargs)
input_ids = inputs.input_ids
output = self.model.generate( output = self.model.generate(
**self.wrap_device(inputs), **self.wrap_device(inputs),
...@@ -381,7 +380,7 @@ class HfRunner: ...@@ -381,7 +380,7 @@ class HfRunner:
all_logprobs.append(seq_logprobs_lst) all_logprobs.append(seq_logprobs_lst)
seq_ids = output.sequences[0] seq_ids = output.sequences[0]
output_len = seq_ids.shape[0] - input_ids.shape[1] output_len = len(seq_logprobs_lst)
output_ids = seq_ids[-output_len:] output_ids = seq_ids[-output_len:]
all_output_ids.append(output_ids.tolist()) all_output_ids.append(output_ids.tolist())
all_output_strs.append(self.tokenizer.decode(output_ids)) all_output_strs.append(self.tokenizer.decode(output_ids))
...@@ -514,10 +513,12 @@ class VllmRunner: ...@@ -514,10 +513,12 @@ class VllmRunner:
max_tokens: int, max_tokens: int,
num_logprobs: int, num_logprobs: int,
images: Optional[List[Image.Image]] = None, images: Optional[List[Image.Image]] = None,
stop_token_ids: Optional[List[int]] = None,
) -> List[Tuple[List[int], str, Optional[SampleLogprobs]]]: ) -> List[Tuple[List[int], str, Optional[SampleLogprobs]]]:
greedy_logprobs_params = SamplingParams(temperature=0.0, greedy_logprobs_params = SamplingParams(temperature=0.0,
max_tokens=max_tokens, max_tokens=max_tokens,
logprobs=num_logprobs) logprobs=num_logprobs,
stop_token_ids=stop_token_ids)
outputs = self.generate_w_logprobs(prompts, outputs = self.generate_w_logprobs(prompts,
greedy_logprobs_params, greedy_logprobs_params,
images=images) images=images)
......
from collections import UserDict
from typing import List, Optional, Tuple, Type
import pytest
import torch
import torch.types
from transformers import BatchFeature
from vllm.multimodal.utils import rescale_image_size
from vllm.sequence import SampleLogprobs
from ..conftest import IMAGE_ASSETS, HfRunner, VllmRunner, _ImageAssets
from .utils import check_logprobs_close
# Tag every test in this module with the ``vlm`` marker.
pytestmark = pytest.mark.vlm

# Fully rendered chat prompts, keyed by image asset name. In each prompt the
# ``(<image>./</image>)`` placeholder opens the user turn, directly after the
# user header and before the question text.
HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts({
    "stop_sign": (
        "<|begin_of_text|><|start_header_id|>user<|end_header_id|>\n\n"
        "(<image>./</image>)\nWhat's the content of the image?<|eot_id|>"
        "<|start_header_id|>assistant<|end_header_id|>\n\n"
    ),
    "cherry_blossom": (
        "<|begin_of_text|><|start_header_id|>user<|end_header_id|>\n\n"
        "(<image>./</image>)\nWhat is the season?<|eot_id|>"
        "<|start_header_id|>assistant<|end_header_id|>\n\n"
    ),
})

# Checkpoints exercised by ``test_models``.
models = ["openbmb/MiniCPM-Llama3-V-2_5"]
def trunc_hf_output(hf_output: Tuple[List[int], str,
                                     Optional[SampleLogprobs]]):
    """Strip the ``<|eot_id|>`` marker from the text of one HF output.

    Args:
        hf_output: ``(output_ids, output_str, out_logprobs)`` triple.

    Returns:
        A triple with the same ids and logprobs. If the text ends with
        ``<|eot_id|>``, it is cut at the first occurrence of that marker;
        otherwise the text is returned unchanged.
    """
    token_ids, text, logprobs = hf_output
    eot_marker = "<|eot_id|>"

    if not text.endswith(eot_marker):
        return token_ids, text, logprobs

    # Keep only what precedes the first marker.
    return token_ids, text.partition(eot_marker)[0], logprobs
# dtype string fed to the ``dtype`` parametrization of ``test_models``.
target_dtype = "half"
def run_test(
    hf_runner: Type[HfRunner],
    vllm_runner: Type[VllmRunner],
    image_assets: _ImageAssets,
    model: str,
    *,
    size_factors: List[float],
    dtype: str,
    max_tokens: int,
    num_logprobs: int,
    tensor_parallel_size: int,
    distributed_executor_backend: Optional[str] = None,
):
    """Check that HF and vLLM produce close greedy logprobs for ``model``.

    Each image asset is paired with its prompt from ``HF_IMAGE_PROMPTS``
    and rescaled once per entry in ``size_factors``. Both runners receive
    the same prompts and the same rescaled PIL images. The HF text output
    is passed through ``trunc_hf_output`` before the comparison.

    Args:
        hf_runner: Context-manager factory for the HuggingFace runner.
        vllm_runner: Context-manager factory for the vLLM runner.
        image_assets: Image assets; their ``pil_image`` is the model input.
        model: Model name handed to both runners.
        size_factors: Rescale factors; one request per factor and image.
        dtype: dtype string handed to both runners.
        max_tokens: Generation length limit for both runners.
        num_logprobs: Number of logprobs requested per generated token.
        tensor_parallel_size: Forwarded to the vLLM runner.
        distributed_executor_backend: Forwarded to the vLLM runner.
    """
    pil_images = [asset.pil_image for asset in image_assets]

    # One (prompts, images) pair per asset; both lists have one entry per
    # size factor.
    per_image_inputs = []
    for pil_image, prompt in zip(pil_images, HF_IMAGE_PROMPTS):
        repeated_prompts = [prompt for _ in size_factors]
        rescaled_images = [
            rescale_image_size(pil_image, factor) for factor in size_factors
        ]
        per_image_inputs.append((repeated_prompts, rescaled_images))

    # NOTE: take care of the order. run vLLM first, and then run HF.
    # vLLM needs a fresh new process without cuda initialization.
    # if we run HF first, the cuda initialization will be done and it
    # will hurt multiprocessing backend with fork method (the default method).

    # max_model_len should be greater than image_feature_size
    with vllm_runner(
            model,
            max_model_len=4096,
            max_num_seqs=1,
            dtype=dtype,
            tensor_parallel_size=tensor_parallel_size,
            distributed_executor_backend=distributed_executor_backend,
            enforce_eager=True,
    ) as vllm_model:
        tokenizer = vllm_model.model.get_tokenizer()
        stop_token_ids = [tokenizer.eos_id, tokenizer.eot_id]

        vllm_outputs_per_image = []
        for prompts, vllm_images in per_image_inputs:
            vllm_outputs_per_image.append(
                vllm_model.generate_greedy_logprobs(
                    prompts,
                    max_tokens,
                    num_logprobs=num_logprobs,
                    images=vllm_images,
                    stop_token_ids=stop_token_ids,
                ))

    with hf_runner(model, dtype=dtype) as hf_model, torch.no_grad():

        class NestedInputs(UserDict):
            """Mapping that holds processor output under ``"model_inputs"``.

            NOTE(review): presumably this makes the runner's
            ``generate(**inputs)`` call pass everything as one
            ``model_inputs`` keyword -- confirm against the HF model.
            """

            def __init__(self, model_inputs: BatchFeature):
                super().__init__({"model_inputs": model_inputs})

                self.model_inputs = model_inputs

            def to(self, device: torch.types.Device):
                # Move the wrapped features and re-wrap the result.
                return NestedInputs(self.model_inputs.to(device))

        hf_processor = hf_model.processor

        def wrapped_processor(**kw):
            # Delegate to the original processor, then nest its output.
            return NestedInputs(hf_processor(**kw))  # type: ignore

        hf_model.processor = wrapped_processor

        hf_outputs_per_image = []
        for prompts, hf_images in per_image_inputs:
            hf_outputs_per_image.append(
                hf_model.generate_greedy_logprobs_limit(
                    prompts,
                    max_tokens,
                    num_logprobs=num_logprobs,
                    images=hf_images,
                    tokenizer=tokenizer,
                ))

    # Compare the two runners asset by asset.
    paired_outputs = zip(hf_outputs_per_image, vllm_outputs_per_image)
    for hf_outputs, vllm_outputs in paired_outputs:
        truncated_hf_outputs = [
            trunc_hf_output(hf_output) for hf_output in hf_outputs
        ]
        check_logprobs_close(
            outputs_0_lst=truncated_hf_outputs,
            outputs_1_lst=vllm_outputs,
            name_0="hf",
            name_1="vllm",
        )
@pytest.mark.parametrize("model", models)
@pytest.mark.parametrize(
    "size_factors",
    [
        [],  # No image
        [1.0],  # Single-scale
        [1.0, 1.0, 1.0],  # Single-scale, batched
        [0.25, 0.5, 1.0],  # Multi-scale
    ],
)
@pytest.mark.parametrize("dtype", [target_dtype])
@pytest.mark.parametrize("max_tokens", [128])
@pytest.mark.parametrize("num_logprobs", [5])
def test_models(hf_runner, vllm_runner, image_assets, model, size_factors,
                dtype: str, max_tokens: int, num_logprobs: int) -> None:
    """Run the HF-vs-vLLM comparison with tensor parallel size 1.

    All arguments come from fixtures or the parametrizations above and are
    forwarded unchanged to ``run_test``.
    """
    comparison_options = dict(
        size_factors=size_factors,
        dtype=dtype,
        max_tokens=max_tokens,
        num_logprobs=num_logprobs,
        tensor_parallel_size=1,
    )
    run_test(hf_runner, vllm_runner, image_assets, model,
             **comparison_options)
...@@ -50,6 +50,7 @@ _GENERATION_MODELS = { ...@@ -50,6 +50,7 @@ _GENERATION_MODELS = {
"MptForCausalLM": ("mpt", "MPTForCausalLM"), "MptForCausalLM": ("mpt", "MPTForCausalLM"),
"MPTForCausalLM": ("mpt", "MPTForCausalLM"), "MPTForCausalLM": ("mpt", "MPTForCausalLM"),
"MiniCPMForCausalLM": ("minicpm", "MiniCPMForCausalLM"), "MiniCPMForCausalLM": ("minicpm", "MiniCPMForCausalLM"),
"MiniCPMV": ("minicpmv", "MiniCPMV"),
"OlmoForCausalLM": ("olmo", "OlmoForCausalLM"), "OlmoForCausalLM": ("olmo", "OlmoForCausalLM"),
"OPTForCausalLM": ("opt", "OPTForCausalLM"), "OPTForCausalLM": ("opt", "OPTForCausalLM"),
"OrionForCausalLM": ("orion", "OrionForCausalLM"), "OrionForCausalLM": ("orion", "OrionForCausalLM"),
......
...@@ -418,9 +418,11 @@ class LlamaForCausalLM(nn.Module, SupportsLoRA): ...@@ -418,9 +418,11 @@ class LlamaForCausalLM(nn.Module, SupportsLoRA):
kv_caches: List[torch.Tensor], kv_caches: List[torch.Tensor],
attn_metadata: AttentionMetadata, attn_metadata: AttentionMetadata,
intermediate_tensors: Optional[IntermediateTensors] = None, intermediate_tensors: Optional[IntermediateTensors] = None,
input_embeds: Optional[torch.Tensor] = None
) -> Union[torch.Tensor, IntermediateTensors]: ) -> Union[torch.Tensor, IntermediateTensors]:
model_output = self.model(input_ids, positions, kv_caches, model_output = self.model(input_ids, positions, kv_caches,
attn_metadata, intermediate_tensors) attn_metadata, intermediate_tensors,
input_embeds)
return model_output return model_output
def compute_logits(self, hidden_states: torch.Tensor, def compute_logits(self, hidden_states: torch.Tensor,
......
...@@ -463,10 +463,11 @@ class MiniCPMForCausalLM(nn.Module, SupportsLoRA): ...@@ -463,10 +463,11 @@ class MiniCPMForCausalLM(nn.Module, SupportsLoRA):
positions: torch.Tensor, positions: torch.Tensor,
kv_caches: List[torch.Tensor], kv_caches: List[torch.Tensor],
attn_metadata: AttentionMetadata, attn_metadata: AttentionMetadata,
input_embeds: Optional[torch.Tensor] = None,
intermediate_tensors: Optional[IntermediateTensors] = None, intermediate_tensors: Optional[IntermediateTensors] = None,
) -> torch.Tensor: ) -> torch.Tensor:
hidden_states = self.model(input_ids, positions, kv_caches, hidden_states = self.model(input_ids, positions, kv_caches,
attn_metadata) attn_metadata, input_embeds)
return hidden_states return hidden_states
def compute_logits(self, hidden_states: torch.Tensor, def compute_logits(self, hidden_states: torch.Tensor,
......
This diff is collapsed.
from .base import (BatchedTensors, MultiModalDataBuiltins, MultiModalDataDict, from .base import (BatchedTensors, MultiModalDataBuiltins, MultiModalDataDict,
MultiModalInputs, MultiModalPlugin) MultiModalInputs, MultiModalPlugin, NestedTensors)
from .registry import MultiModalRegistry from .registry import MultiModalRegistry
MULTIMODAL_REGISTRY = MultiModalRegistry() MULTIMODAL_REGISTRY = MultiModalRegistry()
...@@ -17,6 +17,7 @@ __all__ = [ ...@@ -17,6 +17,7 @@ __all__ = [
"MultiModalDataDict", "MultiModalDataDict",
"MultiModalInputs", "MultiModalInputs",
"MultiModalPlugin", "MultiModalPlugin",
"NestedTensors",
"MULTIMODAL_REGISTRY", "MULTIMODAL_REGISTRY",
"MultiModalRegistry", "MultiModalRegistry",
] ]
...@@ -2,7 +2,7 @@ import sys ...@@ -2,7 +2,7 @@ import sys
from abc import ABC, abstractmethod from abc import ABC, abstractmethod
from collections import UserDict, defaultdict from collections import UserDict, defaultdict
from typing import (Any, Callable, Dict, List, Optional, Type, TypedDict, from typing import (Any, Callable, Dict, List, Optional, Type, TypedDict,
TypeVar, Union) TypeVar, Union, cast)
import torch import torch
import torch.types import torch.types
...@@ -15,10 +15,17 @@ from vllm.logger import init_logger ...@@ -15,10 +15,17 @@ from vllm.logger import init_logger
logger = init_logger(__name__) logger = init_logger(__name__)
BatchedTensors = Union[torch.Tensor, List[torch.Tensor]] NestedTensors = Union[List[torch.Tensor], torch.Tensor]
"""
Use a list instead of a tensor if the dimensions of each element do not match.
Currently only supports up to singly nested list of tensors.
"""
BatchedTensors = Union[List[NestedTensors], NestedTensors]
""" """
If each input tensor in the batch has the same size, this is a single batched If each input tensor in the batch has the same size, this is a single batched
tensor; otherwise, this is a list of tensors with one element per batch. tensor; otherwise, this is a list of :class:`NestedTensors` with one element
per item in the batch.
""" """
if sys.version_info < (3, 9): if sys.version_info < (3, 9):
...@@ -27,7 +34,7 @@ if sys.version_info < (3, 9): ...@@ -27,7 +34,7 @@ if sys.version_info < (3, 9):
pass pass
else: else:
class _MultiModalInputsBase(UserDict[str, torch.Tensor]): class _MultiModalInputsBase(UserDict[str, NestedTensors]):
pass pass
...@@ -39,19 +46,26 @@ class MultiModalInputs(_MultiModalInputsBase): ...@@ -39,19 +46,26 @@ class MultiModalInputs(_MultiModalInputsBase):
@staticmethod @staticmethod
def try_concat( def try_concat(
tensors: List[torch.Tensor], tensors: List[NestedTensors],
*, *,
device: torch.types.Device, device: torch.types.Device,
) -> BatchedTensors: ) -> BatchedTensors:
unbatched_shape = tensors[0].shape[1:] # may be list rather than tensors
if isinstance(tensors[0], list):
return [[t.to(device=device) for t in tensor[0]]
for tensor in tensors]
tensors_ = cast(List[torch.Tensor], tensors)
unbatched_shape = tensors_[0].shape[1:]
for tensor in tensors: for tensor in tensors_:
if tensor.shape[1:] != unbatched_shape: if tensor.shape[1:] != unbatched_shape:
return [ return [
tensor.squeeze(0).to(device=device) for tensor in tensors tensor.squeeze(0).to(device=device) for tensor in tensors_
] ]
return torch.cat(tensors, dim=0).to(device=device) return torch.cat(tensors_, dim=0).to(device=device)
@staticmethod @staticmethod
def batch( def batch(
...@@ -64,7 +78,7 @@ class MultiModalInputs(_MultiModalInputsBase): ...@@ -64,7 +78,7 @@ class MultiModalInputs(_MultiModalInputsBase):
keys = inputs_list[0].keys() keys = inputs_list[0].keys()
item_lists: Dict[str, List[torch.Tensor]] = defaultdict(list) item_lists: Dict[str, List[NestedTensors]] = defaultdict(list)
for inputs in inputs_list: for inputs in inputs_list:
if inputs.keys() != keys: if inputs.keys() != keys:
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment