Commit ec5e299c authored by zhuwenwen's avatar zhuwenwen
Browse files

Merge tag 'v0.7.3' into v0.7.3-dev

parents 47bd229c ed6e9075
# SPDX-License-Identifier: Apache-2.0
import pytest
import pytest_asyncio
from huggingface_hub import snapshot_download
from tests.utils import RemoteOpenAIServer
from vllm.platforms import current_platform
from .utils import ARGS, CONFIGS, ServerConfig
# for each server config, download the model and return the config
@pytest.fixture(scope="session", params=CONFIGS.keys())
def server_config(request):
config = CONFIGS[request.param]
if current_platform.is_rocm() and not config.get("supports_rocm", True):
pytest.skip("The {} model can't be tested on the ROCm platform".format(
config["model"]))
# download model and tokenizer using transformers
snapshot_download(config["model"])
yield CONFIGS[request.param]
# run this for each server config
@pytest.fixture(scope="session")
def server(request, server_config: ServerConfig):
model = server_config["model"]
args_for_model = server_config["arguments"]
with RemoteOpenAIServer(model, ARGS + args_for_model,
max_wait_seconds=480) as server:
yield server
@pytest_asyncio.fixture
async def client(server: RemoteOpenAIServer):
async with server.get_async_client() as async_client:
yield async_client
# SPDX-License-Identifier: Apache-2.0
import openai
import pytest
from tests.tool_use.utils import MESSAGES_ASKING_FOR_TOOLS, WEATHER_TOOL
# test: a tool_choice with mistral-tokenizer results in an ID of length 9
@pytest.mark.asyncio
async def test_tool_call_with_tool_choice(client: openai.AsyncOpenAI):
models = await client.models.list()
model_name: str = models.data[0].id
chat_completion = await client.chat.completions.create(
messages=MESSAGES_ASKING_FOR_TOOLS,
temperature=0,
max_completion_tokens=100,
model=model_name,
tools=[WEATHER_TOOL],
tool_choice=WEATHER_TOOL,
logprobs=False)
choice = chat_completion.choices[0]
assert choice.finish_reason != "tool_calls" # "stop" or "length"
assert choice.message.role == "assistant"
assert choice.message.tool_calls is None \
or len(choice.message.tool_calls) == 1
assert len(choice.message.tool_calls[0].id) == 9 # length of 9 for mistral
# SPDX-License-Identifier: Apache-2.0
from typing import Dict, List, Optional
from typing_extensions import TypedDict
class ServerConfig(TypedDict, total=False):
model: str
arguments: List[str]
system_prompt: Optional[str]
supports_parallel: Optional[bool]
supports_rocm: Optional[bool]
ARGS: List[str] = ["--max-model-len", "1024"]
CONFIGS: Dict[str, ServerConfig] = {
"mistral": {
"model":
"mistralai/Mistral-7B-Instruct-v0.3",
"arguments": [
"--tokenizer-mode", "mistral",
"--ignore-patterns=\"consolidated.safetensors\""
],
"system_prompt":
"You are a helpful assistant with access to tools. If a tool"
" that you have would be helpful to answer a user query, "
"call the tool. Otherwise, answer the user's query directly "
"without calling a tool. DO NOT CALL A TOOL THAT IS IRRELEVANT "
"to the user's question - just respond to it normally."
},
}
......@@ -18,7 +18,7 @@ from ....conftest import HfRunner, VllmRunner
from ....utils import RemoteOpenAIServer, models_path_prefix
from ...utils import check_logprobs_close
MODEL_NAME = os.path.join(models_path_prefix, "fixie-ai/ultravox-v0_3")
MODEL_NAME = os.path.join(models_path_prefix, "fixie-ai/ultravox-v0_5-llama-3_2-1b")
AudioTuple = Tuple[np.ndarray, int]
......
......@@ -30,9 +30,9 @@ os.environ["TOKENIZERS_PARALLELISM"] = "true"
# Test FP16 checkpoint w. fp8_e5m2 kv-cache.
("fp8_e5m2", os.path.join(models_path_prefix, "meta-llama/Llama-3.2-1B-Instruct"),
os.path.join(models_path_prefix, "meta-llama/Llama-3.2-1B-Instruct")),
# Test FP16 checkpoint w. fp8_e4m3 kv-cache scaling factors in json.
("fp8_e4m3", os.path.join(models_path_prefix, "meta-llama/Llama-2-7b-chat-hf"),
os.path.join(models_path_prefix, "meta-llama/Llama-2-7b-chat-hf"))
# Test BF16 checkpoint w. fp8_e4m3 kv-cache scaling factors in json.
("fp8_e4m3", os.path.join(models_path_prefix, "meta-llama/Llama-3.2-1B-Instruct"),
os.path.join(models_path_prefix, "meta-llama/Llama-3.2-1B-Instruct"))
])
# Due to low-precision numerical divergence, we only test logprob of 4 tokens
@pytest.mark.parametrize("max_tokens", [4])
......
......@@ -10,7 +10,8 @@ from vllm.sampling_params import SamplingParams
from ...utils import check_outputs_equal
from ....utils import models_path_prefix
MODELS = [os.path.join(models_path_prefix, "ai21labs/Jamba-tiny-random")]
# This test is for the hybrid models
MODELS = [os.path.join(models_path_prefix, "ai21labs/Jamba-tiny-dev"), os.path.join(models_path_prefix, "ibm-ai-platform/Bamba-9B")]
@pytest.mark.parametrize("model", MODELS)
......@@ -25,6 +26,10 @@ def test_models(
max_tokens: int,
) -> None:
# numeric error produces different generation
if 'Bamba' in model:
example_prompts.pop(3)
with hf_runner(
model,
dtype=dtype,
......@@ -110,15 +115,21 @@ def test_mamba_prefill_chunking_with_parallel_sampling(
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", ["bfloat16"])
@pytest.mark.parametrize("max_tokens", [10])
@pytest.mark.parametrize("max_tokens", [7])
def test_mamba_prefill_chunking(hf_runner, vllm_runner, example_prompts,
model: str, dtype: str,
max_tokens: int) -> None:
# numeric error during prefill chucking produces different generation
# compared to w/o prefill chunking for those examples, removed them for now
example_prompts.pop(7)
example_prompts.pop(2)
example_prompts.pop(1)
if 'Jamba' in model:
example_prompts.pop(7)
example_prompts.pop(2)
example_prompts.pop(1)
elif 'Bamba' in model:
example_prompts.pop(6)
example_prompts.pop(3)
example_prompts.pop(2)
dtype = "half" # use a different dtype for Bamba
with hf_runner(
model,
......@@ -147,7 +158,7 @@ def test_mamba_prefill_chunking(hf_runner, vllm_runner, example_prompts,
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", ["bfloat16"])
@pytest.mark.parametrize("dtype", ["float"])
@pytest.mark.parametrize("max_tokens", [15])
def test_parallel_sampling(
vllm_runner,
......@@ -251,17 +262,17 @@ def test_fail_upon_inc_requests_and_finished_requests_lt_available_blocks(
dtype: str,
example_prompts,
) -> None:
# This test is for verifying that the Jamba inner state management doesn't
# This test is for verifying that the hybrid inner state management doesn't
# collapse in case where the number of incoming requests and
# finished_requests_ids is larger than the maximum mamba block capacity.
# This could generally happen due to the fact that Jamba does support
# This could generally happen due to the fact that hybrid does support
# statelessness mechanism where it can cleanup new incoming requests in
# a single step.
try:
with vllm_runner(model, dtype=dtype, max_num_seqs=10) as vllm_model:
vllm_model.generate_greedy([example_prompts[0]] * 100, 10)
except ValueError:
pytest.fail("Jamba inner state wasn't cleaned up properly between"
pytest.fail("Hybrid inner state wasn't cleaned up properly between"
"steps finished requests registered unnecessarily ")
......@@ -273,14 +284,14 @@ def test_state_cleanup(
dtype: str,
example_prompts,
) -> None:
# This test is for verifying that the Jamba state is cleaned up between
# This test is for verifying that the Hybrid state is cleaned up between
# steps, If its not cleaned, an error would be expected.
try:
with vllm_runner(model, dtype=dtype) as vllm_model:
for _ in range(10):
vllm_model.generate_greedy([example_prompts[0]] * 100, 1)
except ValueError:
pytest.fail("Jamba inner state wasn't cleaned up between states, "
pytest.fail("Hybrid inner state wasn't cleaned up between states, "
"could be related to finished_requests_ids")
......@@ -326,7 +337,7 @@ def test_multistep_correctness(vllm_runner, model: str, dtype: str,
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", ["float"])
@pytest.mark.parametrize("max_tokens", [64])
def test_jamba_distributed_produces_identical_generation(
def test_hybrid_distributed_produces_identical_generation(
vllm_runner, model: str, dtype: str, max_tokens: int,
example_prompts) -> None:
......
......@@ -5,6 +5,7 @@ Run `pytest tests/models/test_mamba.py`.
"""
import os
import pytest
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from vllm.engine.arg_utils import EngineArgs
......@@ -13,7 +14,14 @@ from vllm.sampling_params import SamplingParams
from ...utils import check_outputs_equal
from ....utils import models_path_prefix
MODELS = [os.path.join(models_path_prefix, "state-spaces/mamba-130m-hf"), os.path.join(models_path_prefix, "tiiuae/falcon-mamba-tiny-dev")]
MODELS = [
os.path.join(models_path_prefix, "state-spaces/mamba-130m-hf"),
os.path.join(models_path_prefix, "tiiuae/falcon-mamba-tiny-dev"),
# TODO: Compare to a Mamba2 model. The HF transformers implementation of
# Mamba2 is buggy for Codestral as it doesn't handle n_groups.
# See https://github.com/huggingface/transformers/pull/35943
# "mistralai/Mamba-Codestral-7B-v0.1",
]
# Use lower-level interfaces to create this greedy generator, as mamba will
......@@ -23,6 +31,10 @@ def generate_greedy(model_name, example_prompts, max_tokens):
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)
# Set the device (GPU if available, else CPU)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
# Generate texts from the prompts
outputs = []
for prompt in example_prompts:
......@@ -31,7 +43,9 @@ def generate_greedy(model_name, example_prompts, max_tokens):
input_ids = inputs["input_ids"].to(model.device)
# Generate text using the model's generate method directly
generated_ids = model.generate(input_ids, max_new_tokens=max_tokens)
generated_ids = model.generate(input_ids,
max_new_tokens=max_tokens,
do_sample=False)
generated_text = tokenizer.decode(generated_ids[0],
skip_special_tokens=True)
......@@ -52,7 +66,8 @@ def test_models(
) -> None:
hf_outputs = generate_greedy(model, example_prompts, max_tokens)
with vllm_runner(model, dtype=dtype) as vllm_model:
# Set max_num_seqs to keep Codestral from going OOM at fp32
with vllm_runner(model, dtype=dtype, max_num_seqs=16) as vllm_model:
vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
# This test is for verifying whether the model's extra_repr
......@@ -83,7 +98,7 @@ def test_batching(
) -> None:
# To pass the small model tests, we need full precision.
for_loop_outputs = []
with vllm_runner(model, dtype=dtype) as vllm_model:
with vllm_runner(model, dtype=dtype, max_num_seqs=16) as vllm_model:
for prompt in example_prompts:
for_loop_outputs.append(
vllm_model.generate_greedy([prompt], max_tokens)[0])
......@@ -167,20 +182,22 @@ def test_parallel_sampling(
max_tokens: int,
) -> None:
with vllm_runner(model, dtype=dtype) as vllm_model:
# Numerical differences produce slightly different output for these
if 'state-spaces' in model:
example_prompts.pop(0)
example_prompts.pop(0)
example_prompts.pop(0)
with vllm_runner(model, dtype=dtype, max_num_seqs=16) as vllm_model:
for_loop_outputs = []
for _ in range(10):
for_loop_outputs.append(
# using example_prompts index 1 instead of 0 since with 0 the
# logprobs get really close and the test doesn't pass
vllm_model.generate_greedy([example_prompts[1]], max_tokens)
[0])
vllm_model.generate_greedy(example_prompts, max_tokens)[0])
sampling_params = SamplingParams(n=10,
temperature=0.001,
seed=0,
max_tokens=max_tokens)
n_lt_1_outputs = vllm_model.generate([example_prompts[1]],
sampling_params)
n_lt_1_outputs = vllm_model.generate(example_prompts, sampling_params)
token_ids, texts = n_lt_1_outputs[0]
n_lt_1_outputs = [(token_id, text)
for token_id, text in zip(token_ids, texts)]
......@@ -234,7 +251,7 @@ def test_models_preemption_recompute(
# Tests that outputs are identical with and w/o preemtions (recompute)
assert dtype == "float"
with vllm_runner(model, dtype=dtype) as vllm_model:
with vllm_runner(model, dtype=dtype, max_num_seqs=16) as vllm_model:
vllm_model.model.llm_engine.scheduler[
0].ENABLE_ARTIFICIAL_PREEMPT = True
preempt_vllm_outputs = vllm_model.generate_greedy(
......@@ -285,7 +302,7 @@ def test_state_cleanup(
# This test is for verifying that the Mamba state is cleaned up between
# steps, If its not cleaned, an error would be expected.
try:
with vllm_runner(model, dtype=dtype) as vllm_model:
with vllm_runner(model, dtype=dtype, max_num_seqs=16) as vllm_model:
for _ in range(10):
vllm_model.generate_greedy([example_prompts[0]] * 100, 1)
except ValueError:
......
......@@ -28,6 +28,9 @@ from ....utils import models_path_prefix
os.path.join(models_path_prefix, "google/gemma-1.1-2b-it"), # gemma
marks=[pytest.mark.core_model, pytest.mark.cpu_model],
),
pytest.param(
os.path.join(models_path_prefix, "THUDM/chatglm3-6b"), # chatglm (text-only)
),
pytest.param(
os.path.join(models_path_prefix, "meta-llama/Llama-3.2-1B-Instruct"), # llama
marks=[pytest.mark.core_model, pytest.mark.cpu_model],
......@@ -45,6 +48,9 @@ from ....utils import models_path_prefix
os.path.join(models_path_prefix, "microsoft/phi-2"), # phi
marks=[pytest.mark.core_model],
),
pytest.param(
os.path.join(models_path_prefix, "Qwen/Qwen-7B"), # qwen (text-only)
),
pytest.param(
os.path.join(models_path_prefix, "Qwen/Qwen2.5-0.5B-Instruct"), # qwen2
marks=[pytest.mark.core_model],
......@@ -70,6 +76,10 @@ def test_models(
) -> None:
with hf_runner(model, dtype=dtype) as hf_model:
if model.startswith("THUDM/chatglm3"):
hf_model.model.get_output_embeddings = lambda: \
hf_model.model.transformer.output_layer
hf_outputs = hf_model.generate_greedy_logprobs_limit(
example_prompts, max_tokens, num_logprobs)
......
......@@ -157,10 +157,7 @@ VLM_TEST_SETTINGS = {
auto_cls=AutoModelForVision2Seq,
vllm_output_post_proc=model_utils.qwen2_vllm_to_hf_output,
image_size_factors=[(), (0.25,), (0.25, 0.25, 0.25), (0.25, 0.2, 0.15)],
marks=[pytest.mark.skipif(
TRANSFORMERS_VERSION < "4.49.0",
reason="HF model requires transformers>=4.49.0",
), pytest.mark.core_model, pytest.mark.cpu_model],
marks=[pytest.mark.core_model, pytest.mark.cpu_model],
),
#### Extended model tests
"aria": VLMTestInfo(
......@@ -217,7 +214,6 @@ VLM_TEST_SETTINGS = {
"cherry_blossom": "<image>\nPlease infer the season with reason in details.", # noqa: E501
}),
multi_image_prompt="image_1:<image>\nimage_2:<image>\nWhich image can we see the car and the tower?", # noqa: E501
vllm_runner_kwargs={"hf_overrides": {"architectures": ["DeepseekVLV2ForCausalLM"]}}, # noqa: E501
patch_hf_runner=model_utils.deepseekvl2_patch_hf_runner,
postprocess_inputs=model_utils.cast_dtype_post_processor("images"),
hf_output_post_proc=model_utils.deepseekvl2_trunc_hf_output,
......@@ -353,7 +349,6 @@ VLM_TEST_SETTINGS = {
postprocess_inputs=model_utils.cast_dtype_post_processor(
"pixel_values"
),
vllm_runner_kwargs={"hf_overrides": {"architectures": ["MantisForConditionalGeneration"]}}, # noqa: E501
get_stop_token_ids=lambda tok: [128009],
auto_cls=AutoModelForVision2Seq,
vllm_output_post_proc=model_utils.mantis_vllm_to_hf_output,
......@@ -406,11 +401,10 @@ VLM_TEST_SETTINGS = {
"molmo": VLMTestInfo(
models=["allenai/Molmo-7B-D-0924"],
test_type=(VLMTestType.IMAGE),
prompt_formatter=lambda img_prompt:"User: " + img_prompt + " Assistant:", # noqa: E501
prompt_formatter=identity,
max_model_len=4096,
max_num_seqs=2,
image_size_factors=[(),(1.0, 1.0, 1.0)],
patch_hf_runner=model_utils.mlomo_patch_hf_runner,
patch_hf_runner=model_utils.molmo_patch_hf_runner,
postprocess_inputs=model_utils.molmo_post_processor,
),
# Tests for phi3v currently live in another file because of a bug in
......@@ -440,7 +434,7 @@ VLM_TEST_SETTINGS = {
auto_cls=AutoModelForVision2Seq,
marks=[large_gpu_mark(min_gb=48)],
),
"qwen": VLMTestInfo(
"qwen_vl": VLMTestInfo(
models=[os.path.join(models_path_prefix, "Qwen/Qwen-VL")],
test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
prompt_formatter=identity,
......
......@@ -4,12 +4,14 @@ from typing import Any, Callable, Dict, List, Optional, Tuple, Type, Union
import torch
from PIL.Image import Image
from transformers import AutoTokenizer, BatchEncoding, PreTrainedTokenizerBase
from transformers import BatchEncoding
from transformers.models.auto.auto_factory import _BaseAutoModelClass
from vllm.config import TaskOption
from vllm.transformers_utils.tokenizer import AnyTokenizer
from .....conftest import HfRunner, VllmRunner
from ....registry import HF_EXAMPLE_MODELS
from .types import RunnerOutput
......@@ -31,10 +33,8 @@ def run_test(
use_tokenizer_eos: bool,
postprocess_inputs: Callable[[BatchEncoding], BatchEncoding],
comparator: Callable[..., None],
get_stop_token_ids: Optional[Callable[[PreTrainedTokenizerBase],
List[int]]],
get_stop_token_ids: Optional[Callable[[AnyTokenizer], list[int]]],
stop_str: Optional[List[str]],
tokenizer_mode: str,
limit_mm_per_prompt: Dict[str, int],
vllm_runner_kwargs: Optional[Dict[str, Any]],
hf_model_kwargs: Optional[Dict[str, Any]],
......@@ -48,7 +48,10 @@ def run_test(
"""Modality agnostic test test executor for comparing HF/vLLM outputs."""
# In the case of embeddings, vLLM takes separate input tensors
vllm_inputs = vllm_embeddings if vllm_embeddings is not None else inputs
tokenizer = AutoTokenizer.from_pretrained(model, trust_remote_code=True)
model_info = HF_EXAMPLE_MODELS.find_hf_info(model)
model_info.check_available_online(on_fail="skip")
model_info.check_transformers_version(on_fail="skip")
vllm_outputs_per_mm = []
hf_outputs_per_mm = []
......@@ -57,17 +60,19 @@ def run_test(
# vLLM needs a fresh new process without cuda initialization.
# if we run HF first, the cuda initialization will be done and it
# will hurt multiprocessing backend with fork method (the default method).
vllm_kwargs: Dict[str, Any] = {}
if get_stop_token_ids is not None:
vllm_kwargs["stop_token_ids"] = get_stop_token_ids(tokenizer)
if stop_str:
vllm_kwargs["stop"] = stop_str
if vllm_runner_kwargs is None:
vllm_runner_kwargs = {}
vllm_runner_kwargs_: Dict[str, Any] = {}
if model_info.tokenizer:
vllm_runner_kwargs_["tokenizer"] = model_info.tokenizer
if model_info.tokenizer_mode:
vllm_runner_kwargs_["tokenizer_mode"] = model_info.tokenizer_mode
if model_info.hf_overrides:
vllm_runner_kwargs_["hf_overrides"] = model_info.hf_overrides
if vllm_runner_kwargs:
vllm_runner_kwargs_.update(vllm_runner_kwargs)
with vllm_runner(model,
tokenizer_mode=tokenizer_mode,
max_model_len=max_model_len,
max_num_seqs=max_num_seqs,
dtype=dtype,
......@@ -76,7 +81,15 @@ def run_test(
distributed_executor_backend=distributed_executor_backend,
enforce_eager=enforce_eager,
task=task,
**vllm_runner_kwargs) as vllm_model:
**vllm_runner_kwargs_) as vllm_model:
tokenizer = vllm_model.model.get_tokenizer()
vllm_kwargs: Dict[str, Any] = {}
if get_stop_token_ids is not None:
vllm_kwargs["stop_token_ids"] = get_stop_token_ids(tokenizer)
if stop_str:
vllm_kwargs["stop"] = stop_str
for prompts, media in vllm_inputs:
vllm_kwargs[runner_mm_key] = media
vllm_output = vllm_model.generate_greedy_logprobs(
......@@ -93,16 +106,19 @@ def run_test(
if patch_hf_runner is not None:
hf_model = patch_hf_runner(hf_model)
# Some models need to explicitly pass the eos_token_id off the tokenizer or
# processor for a good comparison; currently assume processor/tokenizer
# agree on the EOS, and pull it off the tokenizer if requested.
hf_kwargs = {}
if use_tokenizer_eos:
hf_kwargs["eos_token_id"] = tokenizer.eos_token_id
if stop_str:
hf_kwargs["stop_strings"] = stop_str
with hf_model, torch.no_grad():
tokenizer = hf_model.tokenizer
# Some models need to explicitly pass the eos_token_id off the tokenizer
# or processor for a good comparison;
# currently assume processor/tokenizer agree on the EOS, and pull it off
# the tokenizer if requested.
hf_kwargs = {}
if use_tokenizer_eos:
hf_kwargs["eos_token_id"] = tokenizer.eos_token_id
if stop_str:
hf_kwargs["stop_strings"] = stop_str
for prompts, media in inputs:
hf_kwargs[runner_mm_key] = media
hf_output = hf_model.generate_greedy_logprobs_limit(
......
......@@ -6,7 +6,7 @@ typically specific to a small subset of models.
import re
import types
from pathlib import PosixPath
from typing import Any, Callable, Dict, List, Optional, Tuple, Union
from typing import Callable, List, Optional, Tuple, Union
import torch
from PIL.Image import Image
......@@ -17,9 +17,7 @@ from vllm.sequence import SampleLogprobs
from vllm.transformers_utils.tokenizer import patch_padding_side
from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE
from .....conftest import (HfRunner, ImageAsset, PromptAudioInput,
PromptImageInput, PromptVideoInput, _ImageAssets)
from ....utils import TokensTextLogprobs
from .....conftest import HfRunner, ImageAsset, _ImageAssets
from .types import RunnerOutput
......@@ -522,74 +520,7 @@ def minicpmo_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
return hf_model
def _generate_greedy_logprobs_limit(
self,
prompts: List[str],
max_tokens: int,
num_logprobs: int,
images: Optional[PromptImageInput] = None,
audios: Optional[PromptAudioInput] = None,
videos: Optional[PromptVideoInput] = None,
**kwargs: Any,
) -> List[TokensTextLogprobs]:
all_inputs = self.get_inputs(prompts,
images=images,
videos=videos,
audios=audios)
# Process in batches for inference.
if len(all_inputs):
input_ids_lst = []
images_lst = []
images_input_idx_lst = []
imges_masks_lst = []
for inputs in all_inputs:
input_ids_lst.append(inputs["input_ids"])
images_lst.append(inputs["images"])
images_input_idx_lst.append(inputs["image_input_idx"])
imges_masks_lst.append(inputs["image_masks"])
batch_inputs = {}
batch_inputs['input_ids'] = torch.cat(input_ids_lst, dim=0)
batch_inputs['images'] = torch.cat(images_lst, dim=0)
batch_inputs['image_input_idx'] = torch.cat(images_input_idx_lst,
dim=0)
batch_inputs['image_masks'] = torch.cat(imges_masks_lst, dim=0)
outputs = self.model.generate_from_batch(
batch=self.wrap_device(batch_inputs,
device=self.model.device.type),
generation_config=GenerationConfig(
max_new_tokens=max_tokens,
stop_strings="<|endoftext|>",
do_sample=False,
),
tokenizer=self.tokenizer,
output_hidden_states=True,
return_dict_in_generate=True,
)
all_logprobs: List[List[Dict[int, float]]] = []
all_output_ids: List[List[int]] = []
all_output_strs: List[str] = []
for index in range(len(all_inputs)):
(
seq_logprobs_lst,
output_len,
) = self._hidden_states_to_logprobs(outputs.hidden_states,
num_logprobs)
all_logprobs.append(seq_logprobs_lst)
seq_ids = outputs.sequences[index]
output_ids = seq_ids[-output_len:]
all_output_ids.append(output_ids.tolist())
all_output_strs.append(self.tokenizer.decode(output_ids))
outputs = zip(all_output_ids, all_output_strs, all_logprobs)
return [(output_ids, output_str, output_logprobs)
for output_ids, output_str, output_logprobs in outputs]
####### Molmo-specific HuggingFace runner patchers
def mlomo_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
def molmo_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
"""Patches and returns an instance of the HfRunner to use for Molmo."""
hf_processor = hf_model.processor
......@@ -598,10 +529,23 @@ def mlomo_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
hf_model.processor = _processor
setattr( # noqa: B010
hf_model,
"generate_greedy_logprobs_limit",
types.MethodType(_generate_greedy_logprobs_limit, hf_model),
)
def _generate(self, max_new_tokens=None, do_sample=None, **kwargs):
batch = {
k: kwargs.pop(k)
for k in ("input_ids", "images", "image_input_idx", "image_masks")
if k in kwargs
}
return self.generate_from_batch(
batch,
generation_config=GenerationConfig(
max_new_tokens=max_new_tokens,
stop_strings="<|endoftext|>",
do_sample=do_sample,
),
**kwargs,
)
hf_model.model.generate = types.MethodType(_generate, hf_model.model)
return hf_model
......@@ -8,12 +8,12 @@ from typing import (Any, Callable, Dict, Iterable, List, NamedTuple, Optional,
import torch
from PIL.Image import Image
from pytest import MarkDecorator
from transformers import (AutoModelForCausalLM, BatchEncoding,
PreTrainedTokenizerBase)
from transformers import AutoModelForCausalLM, BatchEncoding
from transformers.models.auto.auto_factory import _BaseAutoModelClass
from vllm.config import TaskOption
from vllm.sequence import SampleLogprobs
from vllm.transformers_utils.tokenizer import AnyTokenizer
from vllm.utils import identity
from .....conftest import IMAGE_ASSETS, HfRunner, ImageAsset, _ImageAssets
......@@ -100,8 +100,7 @@ class VLMTestInfo(NamedTuple):
vllm_runner_kwargs: Optional[Dict[str, Any]] = None
# Optional callable which gets a list of token IDs from the model tokenizer
get_stop_token_ids: Optional[Callable[[PreTrainedTokenizerBase],
List[int]]] = None
get_stop_token_ids: Optional[Callable[[AnyTokenizer], list[int]]] = None
# Optional list of strings to stop generation, useful when stop tokens are
# not special tokens in the tokenizer
stop_str: Optional[List[str]] = None
......@@ -156,8 +155,6 @@ class VLMTestInfo(NamedTuple):
marks: Optional[List[MarkDecorator]] = None
tokenizer_mode: str = "auto"
def get_non_parametrized_runner_kwargs(self):
"""Returns a dictionary of expandable kwargs for items that are used
in all test types, which are NOT used when creating the parametrized
......@@ -180,7 +177,6 @@ class VLMTestInfo(NamedTuple):
"hf_model_kwargs": self.hf_model_kwargs,
"stop_str": self.stop_str,
"patch_hf_runner": self.patch_hf_runner,
"tokenizer_mode": self.tokenizer_mode
}
......
......@@ -8,11 +8,11 @@ import torch
from transformers import (AutoConfig, AutoModelForVision2Seq, AutoTokenizer,
BatchEncoding)
from vllm import LLM, SamplingParams
from vllm.attention.backends.flash_attn import FlashAttentionMetadata
from vllm.attention.selector import (_Backend, _cached_get_attn_backend,
global_force_attn_backend_context_manager)
from vllm.model_executor.models.mllama import (MLLAMA_IMAGE_TOKEN_ID,
MllamaForConditionalGeneration)
from vllm.model_executor.models.mllama import MllamaForConditionalGeneration
from vllm.multimodal.image import rescale_image_size
from vllm.sequence import SampleLogprobs
......@@ -23,6 +23,7 @@ from ...utils import check_logprobs_close
from ....utils import models_path_prefix
_LIMIT_IMAGE_PER_PROMPT = 3
MLLAMA_IMAGE_TOKEN_ID = 128256
LIST_ENC_DEC_SUPPORTED_BACKENDS = [_Backend.XFORMERS, _Backend.FLASH_ATTN]
......@@ -398,6 +399,64 @@ def test_models_interleaved_images(hf_runner, vllm_runner, image_assets, model,
)
@large_gpu_test(min_gb=48)
@pytest.mark.core_model
@pytest.mark.parametrize("model", models)
@pytest.mark.parametrize("dtype", ["bfloat16"])
@pytest.mark.parametrize("max_tokens", [32])
def test_explicit_implicit_prompt(
image_assets: _ImageAssets,
model: str,
dtype: str,
max_tokens: int,
):
stop_sign = image_assets[0].pil_image
# yapf: disable
prompts = [
# explicit prompt
{
"encoder_prompt": {
"prompt": "<|image|>",
"multi_modal_data": {"image": stop_sign},
},
"decoder_prompt": {
"prompt_token_ids": [128000, 791, 2262, 315, 279, 2217, 220, 128256, 374], # noqa: E501
}
},
{
"encoder_prompt": "Not <|image|>",
"decoder_prompt": "The color of the sky is blue but sometimes it can also be", # noqa: E501
},
# implicit prompt
{
"prompt": "<|begin_of_text|>The content of the image <|image|> is", # noqa: E501
"multi_modal_data": {"image": stop_sign},
},
{
"prompt": "The color of the sky is blue but sometimes it can also be", # noqa: E501
},
]
# yapf: enable
llm = LLM(
model=model,
dtype=dtype,
max_model_len=4096,
max_num_seqs=2,
tensor_parallel_size=1,
enforce_eager=True,
)
sampling_params = SamplingParams(
temperature=0,
max_tokens=max_tokens,
)
outputs = llm.generate(prompts, sampling_params)
n_prompts = len(prompts)
explicit_outputs = outputs[:n_prompts // 2]
implicit_outputs = outputs[n_prompts // 2:]
for exp_output, imp_output in zip(explicit_outputs, implicit_outputs):
assert exp_output.outputs[0].text == imp_output.outputs[0].text
@large_gpu_test(min_gb=48)
@pytest.mark.core_model
@pytest.mark.parametrize("model", models)
......@@ -460,6 +519,10 @@ def test_regression(vllm_runner, image_assets, model, dtype, max_tokens,
images=images)
class DummyModel:
image_token_id = MLLAMA_IMAGE_TOKEN_ID
@pytest.mark.core_model
@pytest.mark.parametrize(
"input_indices_and_output",
......@@ -501,7 +564,7 @@ def test_get_cross_attention_mask(input_indices_and_output) -> None:
use_cuda_graph=False,
)
dummy: dict[str, str] = {}
dummy = DummyModel()
cross_attention_mask, kv_range_for_decode = MllamaForConditionalGeneration\
.get_cross_attention_mask(dummy,
......@@ -558,7 +621,7 @@ def test_get_full_text_row_masked_out_mask(input_indices) -> None:
use_cuda_graph=False,
)
dummy: dict[str, str] = {}
dummy = DummyModel()
full_text_row_masked_out_mask = MllamaForConditionalGeneration\
.get_full_text_row_masked_out_mask(dummy,
......
......@@ -10,7 +10,7 @@ from vllm.config import ModelConfig
from vllm.inputs import InputProcessingContext
from vllm.multimodal import MULTIMODAL_REGISTRY
from vllm.multimodal.processing import ProcessingCache
from vllm.multimodal.utils import cached_get_tokenizer
from vllm.transformers_utils.tokenizer import cached_tokenizer_from_config
from ....multimodal.utils import random_audio, random_image, random_video
from ...registry import HF_EXAMPLE_MODELS
......@@ -42,10 +42,7 @@ def _test_processing_correctness(
factories = MULTIMODAL_REGISTRY._processor_factories[model_cls]
ctx = InputProcessingContext(
model_config,
tokenizer=cached_get_tokenizer(
model_config.tokenizer,
trust_remote_code=model_info.trust_remote_code,
),
tokenizer=cached_tokenizer_from_config(model_config),
)
# Ensure that it can fit all of the data
cache = ProcessingCache(capacity=1 << 30)
......@@ -85,11 +82,19 @@ def _test_processing_correctness(
partial(random_audio, rng, min_len=512, max_len=1024, sr=16000),
}
tokenizer_encode_kwargs = {}
if model_config.hf_config.model_type == "mllama":
# For Mllama, tokenizer will always add bos_token at the beginning of
# prompt by default, causing hf_processor outputs incorrect token ids.
# So we need use `add_special_tokens=False` here to leave bos_token
# to be added by the processor.
tokenizer_encode_kwargs = {"add_special_tokens": False}
for batch_idx in range(num_batches):
mm_data = {
k:
[(input_to_hit[k] if rng.rand() < hit_rate else input_factory[k]())
for _ in range(rng.randint(limit))]
for _ in range(rng.randint(limit + 1))]
for k, limit in limit_mm_per_prompt.items()
}
......@@ -122,7 +127,7 @@ def _test_processing_correctness(
f"Failed ({batch_idx=}, {prompt=}, {mm_data=})")
baseline_tokenized_result = baseline_processor.apply(
tokenizer.encode(prompt),
tokenizer.encode(prompt, **tokenizer_encode_kwargs),
mm_data=mm_data,
hf_processor_mm_kwargs={},
)
......@@ -131,7 +136,7 @@ def _test_processing_correctness(
f"Failed ({batch_idx=}, {prompt=}, {mm_data=})")
cached_tokenized_result = cached_processor.apply(
tokenizer.encode(prompt),
tokenizer.encode(prompt, **tokenizer_encode_kwargs),
mm_data=mm_data,
hf_processor_mm_kwargs={},
)
......@@ -147,6 +152,7 @@ def _test_processing_correctness(
"facebook/chameleon-7b",
"deepseek-ai/deepseek-vl2-tiny",
"adept/fuyu-8b",
"THUDM/glm-4v-9b",
"h2oai/h2ovl-mississippi-800m",
"OpenGVLab/InternVL2-1B",
"HuggingFaceM4/Idefics3-8B-Llama3",
......@@ -154,16 +160,19 @@ def _test_processing_correctness(
"llava-hf/llava-v1.6-mistral-7b-hf",
"llava-hf/LLaVA-NeXT-Video-7B-hf",
"llava-hf/llava-onevision-qwen2-0.5b-ov-hf",
"meta-llama/Llama-3.2-11B-Vision-Instruct",
"TIGER-Lab/Mantis-8B-siglip-llama3",
"mistral-community/pixtral-12b",
"openbmb/MiniCPM-o-2_6",
"openbmb/MiniCPM-V-2_6",
"allenai/Molmo-7B-D-0924",
"allenai/Molmo-7B-O-0924",
"nvidia/NVLM-D-72B",
"Qwen/Qwen-VL-Chat",
"Qwen/Qwen2-VL-2B-Instruct",
"Qwen/Qwen2.5-VL-3B-Instruct",
"Qwen/Qwen2-Audio-7B-Instruct",
"fixie-ai/ultravox-v0_3",
"fixie-ai/ultravox-v0_5-llama-3_2-1b",
])
@pytest.mark.parametrize("hit_rate", [0.3, 0.5, 1.0])
@pytest.mark.parametrize("num_batches", [32])
......
# SPDX-License-Identifier: Apache-2.0
"""Tests for H2OVL's multimodal preprocessing kwargs."""
from typing import Optional
from typing import Mapping, Optional
import pytest
from PIL import Image
from transformers import PretrainedConfig
from vllm.multimodal import MULTIMODAL_REGISTRY
from vllm.multimodal.image import rescale_image_size
from vllm.multimodal.utils import cached_get_tokenizer
from vllm.multimodal.processing import BaseMultiModalProcessor
from vllm.transformers_utils.tokenizer import cached_tokenizer_from_config
from ....conftest import _ImageAssets
from ...utils import build_model_context
def _get_expected_num_patches(
config: PretrainedConfig,
image: Image.Image,
num_imgs: int,
min_num: int,
max_num: int,
):
from vllm.model_executor.models.h2ovl import (calculate_h2ovl_targets,
get_h2ovl_target_ratios)
width, height = image.size
# Calculate the expected number of blocks
if num_imgs == 1 and config.use_msac:
# First pass
blocks1, _, _, aspect_ratio = calculate_h2ovl_targets(
orig_width=width,
orig_height=height,
target_ratios=get_h2ovl_target_ratios(
min_num=1,
max_num=max_num,
prior_aspect_ratio=None,
),
image_size=config.vision_config.image_size,
use_thumbnail=False, # Thumbnail is handled separately
)
# Second pass
blocks2, _, _, _ = calculate_h2ovl_targets(
orig_width=width,
orig_height=height,
target_ratios=get_h2ovl_target_ratios(
min_num=3,
max_num=max_num,
prior_aspect_ratio=aspect_ratio,
),
image_size=config.vision_config.image_size,
use_thumbnail=False,
)
# Add thumbnail if use_thumbnail is True and total_blocks > 1
if config.use_thumbnail:
blocks1 += 1 if blocks1 > 1 else 0
blocks2 += 1 if blocks2 > 1 else 0
# Total blocks is the sum of blocks from both passes minus
# overlapping
total_blocks = blocks1 + blocks2 - 1
return total_blocks
blocks, _, _, _ = calculate_h2ovl_targets(
orig_width=width,
orig_height=height,
target_ratios=get_h2ovl_target_ratios(
min_num,
max_num,
prior_aspect_ratio=None,
),
image_size=config.vision_config.image_size,
use_thumbnail=False,
)
expected_num_patches = blocks
if config.use_thumbnail and expected_num_patches > 1:
expected_num_patches += 1
return expected_num_patches
def _run_check(
processor: BaseMultiModalProcessor,
images: list[Image.Image],
min_num: int,
max_num: int,
mm_processor_kwargs: Mapping[str, object],
):
tokenizer = processor.info.get_tokenizer()
config = processor.info.get_hf_config()
mm_data = {"image": images}
total_expected_num_patches = sum(
_get_expected_num_patches(config, image, len(images), min_num, max_num)
for image in images)
processed_inputs = processor.apply("<image>" * len(images), mm_data,
mm_processor_kwargs)
# Ensure we have the right number of placeholders per num_crops size
image_token_id = tokenizer.convert_tokens_to_ids("<IMG_CONTEXT>")
img_tok_count = processed_inputs["prompt_token_ids"].count(image_token_id)
pixel_shape = processed_inputs["mm_kwargs"]["pixel_values_flat"].shape
assert img_tok_count == 256 * total_expected_num_patches
assert pixel_shape[0] == total_expected_num_patches
@pytest.mark.parametrize("model_id", [
"h2oai/h2ovl-mississippi-800m",
"h2oai/h2ovl-mississippi-2b",
......@@ -25,118 +126,54 @@ from ...utils import build_model_context
[1.0, 1.0, 1.0],
# Multi-scale
[0.25, 0.5, 1.0],
[4.0, 2.0, 1.0],
],
)
@pytest.mark.parametrize("max_dynamic_patch", [1, 2, 4, 8])
@pytest.mark.parametrize(
("min_dynamic_patch", "max_dynamic_patch"),
[(1, 1), (1, 2), (1, 4), (1, 8), (2, 4), (4, 8)],
)
@pytest.mark.parametrize("dynamic_image_size", [True, False])
@pytest.mark.parametrize("num_imgs", [1, 2])
@pytest.mark.parametrize("kwargs_on_init", [True, False])
def test_processor_override(
model_id: str,
image_assets: _ImageAssets,
size_factors: list[int],
min_dynamic_patch: int,
max_dynamic_patch: int,
dynamic_image_size: Optional[bool],
num_imgs: int,
kwargs_on_init: bool,
):
from vllm.model_executor.models.h2ovl import (calculate_h2ovl_targets,
get_h2ovl_target_ratios)
mm_processor_kwargs = {
"min_dynamic_patch": min_dynamic_patch,
"max_dynamic_patch": max_dynamic_patch,
"dynamic_image_size": dynamic_image_size,
}
ctx = build_model_context(
model_name=model_id,
tokenizer_name=model_id,
trust_remote_code=True,
mm_processor_kwargs=None,
limit_mm_per_prompt={"image": num_imgs},
)
tokenizer = cached_get_tokenizer(
ctx.model_config.tokenizer,
trust_remote_code=ctx.model_config.trust_remote_code,
mm_processor_kwargs=mm_processor_kwargs if kwargs_on_init else None,
limit_mm_per_prompt={"image": len(size_factors)},
)
tokenizer = cached_tokenizer_from_config(ctx.model_config)
processor = MULTIMODAL_REGISTRY.create_processor(
ctx.model_config,
tokenizer=tokenizer,
)
hf_processor_mm_kwargs = {} if kwargs_on_init else mm_processor_kwargs
config = processor.info.get_hf_config()
use_msac = config.use_msac
mm_processor_kwargs = {
"max_dynamic_patch": max_dynamic_patch,
}
if dynamic_image_size is not None:
mm_processor_kwargs["dynamic_image_size"] = dynamic_image_size
min_num = config.min_dynamic_patch
min_num = min_dynamic_patch if dynamic_image_size else 1
max_num = max_dynamic_patch if dynamic_image_size else 1
# Build the image str / prompt based on the number of images we pass
prompt = "<image>" * num_imgs
for asset in image_assets:
for factor in size_factors:
image = rescale_image_size(asset.pil_image, factor)
mm_data = {"image": [image] * num_imgs}
width, height = image.size
# Calculate the expected number of blocks
if num_imgs == 1 and use_msac:
# First pass
blocks1, _, _, aspect_ratio = calculate_h2ovl_targets(
orig_width=width,
orig_height=height,
target_ratios=get_h2ovl_target_ratios(
min_num,
max_num,
prior_aspect_ratio=None,
),
image_size=config.vision_config.image_size,
use_thumbnail=False, # Thumbnail is handled separately
)
# Second pass
blocks2, _, _, _ = calculate_h2ovl_targets(
orig_width=width,
orig_height=height,
target_ratios=get_h2ovl_target_ratios(
min_num,
max_num,
prior_aspect_ratio=aspect_ratio,
),
image_size=config.vision_config.image_size,
use_thumbnail=False,
)
# Add thumbnail if use_thumbnail is True and total_blocks > 1
if config.use_thumbnail:
blocks1 += 1 if blocks1 > 1 else 0
blocks2 += 1 if blocks2 > 1 else 0
# Total blocks is the sum of blocks from both passes minus
# overlapping
total_blocks = blocks1 + blocks2 - 1
expected_num_patches = total_blocks
else:
blocks, _, _, _ = calculate_h2ovl_targets(
orig_width=width,
orig_height=height,
target_ratios=get_h2ovl_target_ratios(
min_num,
max_num,
prior_aspect_ratio=None,
),
image_size=config.vision_config.image_size,
use_thumbnail=False,
)
expected_num_patches = blocks
if config.use_thumbnail and expected_num_patches != 1:
expected_num_patches += 1
processed_inputs = processor.apply(prompt, mm_data,
mm_processor_kwargs)
pixel_shape = (
processed_inputs["mm_kwargs"]["pixel_values_flat"].shape)
assert pixel_shape[0] == expected_num_patches * num_imgs
_run_check(
processor,
[
rescale_image_size(image_assets[0].pil_image, f)
for f in size_factors
],
min_num,
max_num,
hf_processor_mm_kwargs,
)
......@@ -5,7 +5,7 @@ import pytest
from transformers import Idefics3Config
from vllm.multimodal import MULTIMODAL_REGISTRY
from vllm.multimodal.utils import cached_get_tokenizer
from vllm.transformers_utils.tokenizer import cached_tokenizer_from_config
from ....conftest import _ImageAssets
from ...utils import build_model_context
......@@ -24,9 +24,15 @@ models = [os.path.join(models_path_prefix, "HuggingFaceM4/Idefics3-8B-Llama3")]
])
# yapf: enable
@pytest.mark.parametrize("num_imgs", [1, 2])
def test_processor_override(image_assets: _ImageAssets, model: str,
mm_processor_kwargs: dict[str, object],
expected_toks_per_img: int, num_imgs: int):
@pytest.mark.parametrize("kwargs_on_init", [True, False])
def test_processor_override(
image_assets: _ImageAssets,
model: str,
mm_processor_kwargs: dict[str, object],
expected_toks_per_img: int,
num_imgs: int,
kwargs_on_init: bool,
):
"""Ensure input_processor_for_idefics3 handles num_crops properly."""
# Same as the previous test - don't initialize mm_processor_kwargs
# in this test and assume that the kwargs will be correctly expanded by
......@@ -35,15 +41,15 @@ def test_processor_override(image_assets: _ImageAssets, model: str,
model_name=model,
tokenizer_name=model,
trust_remote_code=True,
mm_processor_kwargs=None,
mm_processor_kwargs=mm_processor_kwargs if kwargs_on_init else None,
limit_mm_per_prompt={"image": num_imgs},
)
tokenizer = cached_get_tokenizer(ctx.model_config.tokenizer)
tokenizer = cached_tokenizer_from_config(ctx.model_config)
processor = MULTIMODAL_REGISTRY.create_processor(
ctx.model_config,
tokenizer=tokenizer,
)
hf_processor = processor.info.get_hf_processor(**mm_processor_kwargs)
hf_processor_mm_kwargs = {} if kwargs_on_init else mm_processor_kwargs
# Build the image str / prompt based on the number of images we pass
placeholders = "<image>" if num_imgs == 1 else "\n".join(
......@@ -56,8 +62,10 @@ def test_processor_override(image_assets: _ImageAssets, model: str,
dummy_image = image_assets[0].pil_image.resize(dummy_image_size)
mm_data = {"image": [dummy_image] * num_imgs}
processed_inputs = processor.apply(prompt, mm_data, mm_processor_kwargs)
processed_inputs = processor.apply(prompt, mm_data, hf_processor_mm_kwargs)
# Ensure the placeholders format are correct
hf_processor = processor.info.get_hf_processor(**hf_processor_mm_kwargs)
hf_processed_inputs = hf_processor(text=prompt, images=mm_data["image"])
assert processed_inputs["prompt_token_ids"] == hf_processed_inputs[
"input_ids"][0]
......
# SPDX-License-Identifier: Apache-2.0
"""Tests for InternVL's multimodal preprocessing kwargs."""
from typing import Optional
from typing import Mapping, Optional
import os
import pytest
from PIL import Image
from transformers import PretrainedConfig
from vllm.multimodal import MULTIMODAL_REGISTRY
from vllm.multimodal.utils import cached_get_tokenizer
from vllm.multimodal.image import rescale_image_size
from vllm.multimodal.processing import BaseMultiModalProcessor
from vllm.transformers_utils.tokenizer import cached_tokenizer_from_config
from ....conftest import _ImageAssets
from ...utils import build_model_context
from ....utils import models_path_prefix
def _get_expected_num_patches(
config: PretrainedConfig,
image: Image.Image,
num_imgs: int,
min_num: int,
max_num: int,
):
from vllm.model_executor.models.internvl import (
calculate_internvl_targets, get_internvl_target_ratios)
width, height = image.size
blocks, _, _ = calculate_internvl_targets(
orig_width=width,
orig_height=height,
target_ratios=get_internvl_target_ratios(
min_num,
max_num,
),
image_size=config.vision_config.image_size,
use_thumbnail=False,
)
expected_num_patches = blocks
if config.use_thumbnail and expected_num_patches > 1:
expected_num_patches += 1
return expected_num_patches
def _run_check(
processor: BaseMultiModalProcessor,
images: list[Image.Image],
min_num: int,
max_num: int,
mm_processor_kwargs: Mapping[str, object],
):
tokenizer = processor.info.get_tokenizer()
config = processor.info.get_hf_config()
mm_data = {"image": images}
total_expected_num_patches = sum(
_get_expected_num_patches(config, image, len(images), min_num, max_num)
for image in images)
processed_inputs = processor.apply("<image>" * len(images), mm_data,
mm_processor_kwargs)
# Ensure we have the right number of placeholders per num_crops size
image_token_id = tokenizer.convert_tokens_to_ids("<IMG_CONTEXT>")
img_tok_count = processed_inputs["prompt_token_ids"].count(image_token_id)
pixel_shape = processed_inputs["mm_kwargs"]["pixel_values_flat"].shape
assert img_tok_count == 256 * total_expected_num_patches
assert pixel_shape[0] == total_expected_num_patches
@pytest.mark.parametrize("model_id", [os.path.join(models_path_prefix, "OpenGVLab/InternVL2-2B")])
@pytest.mark.parametrize("max_dynamic_patch", [1, 4])
@pytest.mark.parametrize("dynamic_image_size", [True, False, None])
@pytest.mark.parametrize("num_imgs", [1, 2])
@pytest.mark.parametrize(
"size_factors",
[
# Single-scale
[1.0],
# Single-scale, batched
[1.0, 1.0, 1.0],
# Multi-scale
[0.25, 0.5, 1.0],
[4.0, 2.0, 1.0],
],
)
@pytest.mark.parametrize(
("min_dynamic_patch", "max_dynamic_patch"),
[(1, 1), (1, 2), (1, 4), (1, 8), (2, 4), (4, 8)],
)
@pytest.mark.parametrize("dynamic_image_size", [True, False])
@pytest.mark.parametrize("kwargs_on_init", [True, False])
def test_processor_override(
model_id: str,
image_assets: _ImageAssets,
size_factors: list[int],
min_dynamic_patch: int,
max_dynamic_patch: int,
dynamic_image_size: Optional[bool],
num_imgs: int,
kwargs_on_init: bool,
):
mm_processor_kwargs = {
"min_dynamic_patch": min_dynamic_patch,
"max_dynamic_patch": max_dynamic_patch,
"dynamic_image_size": dynamic_image_size,
}
ctx = build_model_context(
model_name=model_id,
tokenizer_name=model_id,
trust_remote_code=True,
mm_processor_kwargs=None,
limit_mm_per_prompt={"image": num_imgs},
)
tokenizer = cached_get_tokenizer(
ctx.model_config.tokenizer,
trust_remote_code=ctx.model_config.trust_remote_code,
mm_processor_kwargs=mm_processor_kwargs if kwargs_on_init else None,
limit_mm_per_prompt={"image": len(size_factors)},
)
tokenizer = cached_tokenizer_from_config(ctx.model_config)
processor = MULTIMODAL_REGISTRY.create_processor(
ctx.model_config,
tokenizer=tokenizer,
)
hf_processor_mm_kwargs = {} if kwargs_on_init else mm_processor_kwargs
mm_processor_kwargs = {
"max_dynamic_patch": max_dynamic_patch,
}
if dynamic_image_size is not None:
mm_processor_kwargs["dynamic_image_size"] = dynamic_image_size
min_num = min_dynamic_patch if dynamic_image_size else 1
max_num = max_dynamic_patch if dynamic_image_size else 1
# Build the image str / prompt based on the number of images we pass
prompt = "<image>" * num_imgs
image = image_assets[0].pil_image.resize((448 * 2, 448 * 2))
mm_data = {"image": [image] * num_imgs}
expected_num_patches = max_dynamic_patch + 1 if max_dynamic_patch > 1 else 1
if dynamic_image_size is False:
expected_num_patches = 1
processed_inputs = processor.apply(prompt, mm_data, mm_processor_kwargs)
# Ensure we have the right number of placeholders per num_crops size
image_token_id = tokenizer.convert_tokens_to_ids("<IMG_CONTEXT>")
img_tok_count = processed_inputs["prompt_token_ids"].count(image_token_id)
pixel_shape = processed_inputs["mm_kwargs"]["pixel_values_flat"].shape
assert img_tok_count == 256 * expected_num_patches * num_imgs
assert pixel_shape[0] == expected_num_patches * num_imgs
_run_check(
processor,
[
rescale_image_size(image_assets[0].pil_image, f)
for f in size_factors
],
min_num,
max_num,
hf_processor_mm_kwargs,
)
......@@ -10,7 +10,7 @@ from pqdm.threads import pqdm
from vllm.multimodal import MULTIMODAL_REGISTRY
from vllm.multimodal.parse import ImageSize
from vllm.multimodal.processing import BaseMultiModalProcessor
from vllm.multimodal.utils import cached_get_tokenizer
from vllm.transformers_utils.tokenizer import cached_tokenizer_from_config
from ...utils import build_model_context
......@@ -43,10 +43,7 @@ def test_processor_max_tokens(model_id):
)
processor = MULTIMODAL_REGISTRY.create_processor(
ctx.model_config,
tokenizer=cached_get_tokenizer(
ctx.model_config.tokenizer,
trust_remote_code=ctx.model_config.trust_remote_code,
),
tokenizer=cached_tokenizer_from_config(ctx.model_config),
)
info = processor.info
......@@ -146,10 +143,7 @@ def test_processor_prompt_replacements_regression(model_id, num_imgs):
)
processor = MULTIMODAL_REGISTRY.create_processor(
ctx.model_config,
tokenizer=cached_get_tokenizer(
ctx.model_config.tokenizer,
trust_remote_code=ctx.model_config.trust_remote_code,
),
tokenizer=cached_tokenizer_from_config(ctx.model_config),
)
image_ratios = [(171, 152), (184, 161), (198, 176), (333, 296), (369, 328),
......@@ -179,10 +173,7 @@ def test_processor_prompt_replacements_all(model_id, num_imgs):
)
processor = MULTIMODAL_REGISTRY.create_processor(
ctx.model_config,
tokenizer=cached_get_tokenizer(
ctx.model_config.tokenizer,
trust_remote_code=ctx.model_config.trust_remote_code,
),
tokenizer=cached_tokenizer_from_config(ctx.model_config),
)
seen_aspect_ratios = set[float]()
......
......@@ -10,7 +10,7 @@ from pqdm.threads import pqdm
from vllm.multimodal import MULTIMODAL_REGISTRY
from vllm.multimodal.parse import ImageSize
from vllm.multimodal.processing import BaseMultiModalProcessor
from vllm.multimodal.utils import cached_get_tokenizer
from vllm.transformers_utils.tokenizer import cached_tokenizer_from_config
from ...utils import build_model_context
......@@ -44,10 +44,7 @@ def test_processor_max_tokens(model_id):
)
processor = MULTIMODAL_REGISTRY.create_processor(
ctx.model_config,
tokenizer=cached_get_tokenizer(
ctx.model_config.tokenizer,
trust_remote_code=ctx.model_config.trust_remote_code,
),
tokenizer=cached_tokenizer_from_config(ctx.model_config),
)
info = processor.info
......@@ -146,10 +143,7 @@ def test_processor_prompt_replacements_regression(model_id, num_imgs):
)
processor = MULTIMODAL_REGISTRY.create_processor(
ctx.model_config,
tokenizer=cached_get_tokenizer(
ctx.model_config.tokenizer,
trust_remote_code=ctx.model_config.trust_remote_code,
),
tokenizer=cached_tokenizer_from_config(ctx.model_config),
)
image_ratios = [(171, 152), (184, 161), (198, 176), (333, 296), (369, 328),
......@@ -180,10 +174,7 @@ def test_processor_prompt_replacements_all(model_id, num_imgs):
)
processor = MULTIMODAL_REGISTRY.create_processor(
ctx.model_config,
tokenizer=cached_get_tokenizer(
ctx.model_config.tokenizer,
trust_remote_code=ctx.model_config.trust_remote_code,
),
tokenizer=cached_tokenizer_from_config(ctx.model_config),
)
seen_aspect_ratios = set[float]()
......
......@@ -3,7 +3,7 @@
import pytest
from vllm.multimodal import MULTIMODAL_REGISTRY
from vllm.multimodal.utils import cached_get_tokenizer
from vllm.transformers_utils.tokenizer import cached_tokenizer_from_config
from ....conftest import _ImageAssets
from ...utils import build_model_context
......@@ -21,12 +21,14 @@ from ...utils import build_model_context
])
# yapf: enable
@pytest.mark.parametrize("num_imgs", [1, 2])
@pytest.mark.parametrize("kwargs_on_init", [True, False])
def test_processor_override(
image_assets: _ImageAssets,
model_id: str,
mm_processor_kwargs: dict[str, int],
expected_toks_per_img: int,
num_imgs: int,
kwargs_on_init: bool,
):
"""Ensure input_processor_for_phi3v handles num_crops properly."""
# Avoid initializing CUDA early
......@@ -36,23 +38,22 @@ def test_processor_override(
model_name=model_id,
tokenizer_name=model_id,
trust_remote_code=True,
mm_processor_kwargs=mm_processor_kwargs if kwargs_on_init else None,
limit_mm_per_prompt={"image": num_imgs},
)
tokenizer = cached_get_tokenizer(
ctx.model_config.tokenizer,
trust_remote_code=ctx.model_config.trust_remote_code,
)
tokenizer = cached_tokenizer_from_config(ctx.model_config)
processor = MULTIMODAL_REGISTRY.create_processor(
ctx.model_config,
tokenizer=tokenizer,
)
hf_processor_mm_kwargs = {} if kwargs_on_init else mm_processor_kwargs
# Build the image str / prompt based on the number of images we pass
img_str = "".join([f"<|image_{idx}|>\n" for idx in range(1, num_imgs + 1)])
prompt = f"<|user|>\n{img_str}<|end|>\n<|assistant|>\n"
mm_data = {"image": [image_assets[0].pil_image] * num_imgs}
processed_inputs = processor.apply(prompt, mm_data, mm_processor_kwargs)
processed_inputs = processor.apply(prompt, mm_data, hf_processor_mm_kwargs)
# Ensure we have the right number of placeholders per num_crops size
img_tok_count = processed_inputs["prompt_token_ids"].count(_IMAGE_TOKEN_ID)
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment