Commit ec5e299c authored by zhuwenwen's avatar zhuwenwen
Browse files

Merge tag 'v0.7.3' into v0.7.3-dev

parents 47bd229c ed6e9075
# SPDX-License-Identifier: Apache-2.0
import pytest
import pytest_asyncio
from huggingface_hub import snapshot_download
from tests.utils import RemoteOpenAIServer
from vllm.platforms import current_platform
from .utils import ARGS, CONFIGS, ServerConfig
# Parametrized over every key of CONFIGS: fetch the model, then hand back
# the matching server config.
@pytest.fixture(scope="session", params=CONFIGS.keys())
def server_config(request):
    """Yield the ``CONFIGS`` entry selected by ``request.param``.

    The test is skipped when the current platform is ROCm and the entry
    sets ``supports_rocm`` to a falsy value (the key defaults to ``True``
    when absent). Otherwise the model snapshot is downloaded before the
    config is yielded.
    """
    selected = CONFIGS[request.param]

    if current_platform.is_rocm():
        if not selected.get("supports_rocm", True):
            pytest.skip(
                "The {} model can't be tested on the ROCm platform".format(
                    selected["model"]))

    # Fetch the model repository snapshot through huggingface_hub.
    snapshot_download(selected["model"])
    yield selected
# One server per server config, shared for the whole session.
@pytest.fixture(scope="session")
def server(request, server_config: ServerConfig):
    """Yield a ``RemoteOpenAIServer`` started for ``server_config``.

    The server receives the shared ``ARGS`` followed by the config's own
    ``arguments`` and is given up to 480 seconds to come up
    (``max_wait_seconds``). The context manager is exited once the fixture
    is finalized.
    """
    model_name = server_config["model"]
    launch_args = ARGS + server_config["arguments"]
    with RemoteOpenAIServer(model_name,
                            launch_args,
                            max_wait_seconds=480) as remote_server:
        yield remote_server
@pytest_asyncio.fixture
async def client(server: RemoteOpenAIServer):
    """Yield the async client obtained from ``server.get_async_client()``."""
    async with server.get_async_client() as openai_client:
        yield openai_client
# SPDX-License-Identifier: Apache-2.0
import openai
import pytest
from tests.tool_use.utils import MESSAGES_ASKING_FOR_TOOLS, WEATHER_TOOL
# test: a tool_choice with mistral-tokenizer results in an ID of length 9
@pytest.mark.asyncio
async def test_tool_call_with_tool_choice(client: openai.AsyncOpenAI):
    """Check the tool call returned when ``tool_choice`` names a tool.

    Sends ``MESSAGES_ASKING_FOR_TOOLS`` to the first model listed by the
    server with ``WEATHER_TOOL`` as both the only tool and the forced
    ``tool_choice``, then verifies that exactly one tool call comes back
    and that its ID is 9 characters long.
    """
    models = await client.models.list()
    model_name: str = models.data[0].id
    chat_completion = await client.chat.completions.create(
        messages=MESSAGES_ASKING_FOR_TOOLS,
        temperature=0,
        max_completion_tokens=100,
        model=model_name,
        tools=[WEATHER_TOOL],
        tool_choice=WEATHER_TOOL,
        logprobs=False)

    choice = chat_completion.choices[0]

    assert choice.finish_reason != "tool_calls"  # "stop" or "length"
    assert choice.message.role == "assistant"

    # The ID check below indexes tool_calls[0], so a missing tool call must
    # fail here as a readable assertion instead of a TypeError on None.
    tool_calls = choice.message.tool_calls
    assert tool_calls is not None and len(tool_calls) == 1, (
        f"expected exactly one tool call, got {tool_calls!r}")
    assert len(tool_calls[0].id) == 9  # length of 9 for mistral
# SPDX-License-Identifier: Apache-2.0
from typing import Dict, List, Optional
from typing_extensions import TypedDict
class ServerConfig(TypedDict, total=False):
    """Typed shape of one entry in ``CONFIGS``.

    Declared with ``total=False``, so every key may be omitted.
    """

    # Model identifier for the entry.
    model: str
    # Extra command-line arguments specific to this model.
    arguments: List[str]
    # System prompt text associated with the model, if any.
    system_prompt: Optional[str]
    # NOTE(review): presumably whether parallel tool calls are supported;
    # no consumer is visible here - confirm against the tests.
    supports_parallel: Optional[bool]
    # Whether the model can be tested on the ROCm platform.
    supports_rocm: Optional[bool]
# Command-line arguments shared by every configuration.
ARGS: List[str] = ["--max-model-len", "1024"]

# Server configurations, keyed by a short name.
CONFIGS: Dict[str, ServerConfig] = {
    "mistral": {
        "model": "mistralai/Mistral-7B-Instruct-v0.3",
        "arguments": [
            "--tokenizer-mode",
            "mistral",
            "--ignore-patterns=\"consolidated.safetensors\"",
        ],
        "system_prompt": (
            "You are a helpful assistant with access to tools. If a tool"
            " that you have would be helpful to answer a user query, "
            "call the tool. Otherwise, answer the user's query directly "
            "without calling a tool. DO NOT CALL A TOOL THAT IS IRRELEVANT "
            "to the user's question - just respond to it normally."
        ),
    },
}
...@@ -18,7 +18,7 @@ from ....conftest import HfRunner, VllmRunner ...@@ -18,7 +18,7 @@ from ....conftest import HfRunner, VllmRunner
from ....utils import RemoteOpenAIServer, models_path_prefix from ....utils import RemoteOpenAIServer, models_path_prefix
from ...utils import check_logprobs_close from ...utils import check_logprobs_close
MODEL_NAME = os.path.join(models_path_prefix, "fixie-ai/ultravox-v0_3") MODEL_NAME = os.path.join(models_path_prefix, "fixie-ai/ultravox-v0_5-llama-3_2-1b")
AudioTuple = Tuple[np.ndarray, int] AudioTuple = Tuple[np.ndarray, int]
......
...@@ -30,9 +30,9 @@ os.environ["TOKENIZERS_PARALLELISM"] = "true" ...@@ -30,9 +30,9 @@ os.environ["TOKENIZERS_PARALLELISM"] = "true"
# Test FP16 checkpoint w. fp8_e5m2 kv-cache. # Test FP16 checkpoint w. fp8_e5m2 kv-cache.
("fp8_e5m2", os.path.join(models_path_prefix, "meta-llama/Llama-3.2-1B-Instruct"), ("fp8_e5m2", os.path.join(models_path_prefix, "meta-llama/Llama-3.2-1B-Instruct"),
os.path.join(models_path_prefix, "meta-llama/Llama-3.2-1B-Instruct")), os.path.join(models_path_prefix, "meta-llama/Llama-3.2-1B-Instruct")),
# Test FP16 checkpoint w. fp8_e4m3 kv-cache scaling factors in json. # Test BF16 checkpoint w. fp8_e4m3 kv-cache scaling factors in json.
("fp8_e4m3", os.path.join(models_path_prefix, "meta-llama/Llama-2-7b-chat-hf"), ("fp8_e4m3", os.path.join(models_path_prefix, "meta-llama/Llama-3.2-1B-Instruct"),
os.path.join(models_path_prefix, "meta-llama/Llama-2-7b-chat-hf")) os.path.join(models_path_prefix, "meta-llama/Llama-3.2-1B-Instruct"))
]) ])
# Due to low-precision numerical divergence, we only test logprob of 4 tokens # Due to low-precision numerical divergence, we only test logprob of 4 tokens
@pytest.mark.parametrize("max_tokens", [4]) @pytest.mark.parametrize("max_tokens", [4])
......
...@@ -10,7 +10,8 @@ from vllm.sampling_params import SamplingParams ...@@ -10,7 +10,8 @@ from vllm.sampling_params import SamplingParams
from ...utils import check_outputs_equal from ...utils import check_outputs_equal
from ....utils import models_path_prefix from ....utils import models_path_prefix
MODELS = [os.path.join(models_path_prefix, "ai21labs/Jamba-tiny-random")] # This test is for the hybrid models
MODELS = [os.path.join(models_path_prefix, "ai21labs/Jamba-tiny-dev"), os.path.join(models_path_prefix, "ibm-ai-platform/Bamba-9B")]
@pytest.mark.parametrize("model", MODELS) @pytest.mark.parametrize("model", MODELS)
...@@ -25,6 +26,10 @@ def test_models( ...@@ -25,6 +26,10 @@ def test_models(
max_tokens: int, max_tokens: int,
) -> None: ) -> None:
# numeric error produces different generation
if 'Bamba' in model:
example_prompts.pop(3)
with hf_runner( with hf_runner(
model, model,
dtype=dtype, dtype=dtype,
...@@ -110,15 +115,21 @@ def test_mamba_prefill_chunking_with_parallel_sampling( ...@@ -110,15 +115,21 @@ def test_mamba_prefill_chunking_with_parallel_sampling(
@pytest.mark.parametrize("model", MODELS) @pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", ["bfloat16"]) @pytest.mark.parametrize("dtype", ["bfloat16"])
@pytest.mark.parametrize("max_tokens", [10]) @pytest.mark.parametrize("max_tokens", [7])
def test_mamba_prefill_chunking(hf_runner, vllm_runner, example_prompts, def test_mamba_prefill_chunking(hf_runner, vllm_runner, example_prompts,
model: str, dtype: str, model: str, dtype: str,
max_tokens: int) -> None: max_tokens: int) -> None:
# numeric error during prefill chunking produces different generation # numeric error during prefill chunking produces different generation
# compared to w/o prefill chunking for those examples, removed them for now # compared to w/o prefill chunking for those examples, removed them for now
example_prompts.pop(7) if 'Jamba' in model:
example_prompts.pop(2) example_prompts.pop(7)
example_prompts.pop(1) example_prompts.pop(2)
example_prompts.pop(1)
elif 'Bamba' in model:
example_prompts.pop(6)
example_prompts.pop(3)
example_prompts.pop(2)
dtype = "half" # use a different dtype for Bamba
with hf_runner( with hf_runner(
model, model,
...@@ -147,7 +158,7 @@ def test_mamba_prefill_chunking(hf_runner, vllm_runner, example_prompts, ...@@ -147,7 +158,7 @@ def test_mamba_prefill_chunking(hf_runner, vllm_runner, example_prompts,
@pytest.mark.parametrize("model", MODELS) @pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", ["bfloat16"]) @pytest.mark.parametrize("dtype", ["float"])
@pytest.mark.parametrize("max_tokens", [15]) @pytest.mark.parametrize("max_tokens", [15])
def test_parallel_sampling( def test_parallel_sampling(
vllm_runner, vllm_runner,
...@@ -251,17 +262,17 @@ def test_fail_upon_inc_requests_and_finished_requests_lt_available_blocks( ...@@ -251,17 +262,17 @@ def test_fail_upon_inc_requests_and_finished_requests_lt_available_blocks(
dtype: str, dtype: str,
example_prompts, example_prompts,
) -> None: ) -> None:
# This test is for verifying that the Jamba inner state management doesn't # This test is for verifying that the hybrid inner state management doesn't
# collapse in case where the number of incoming requests and # collapse in case where the number of incoming requests and
# finished_requests_ids is larger than the maximum mamba block capacity. # finished_requests_ids is larger than the maximum mamba block capacity.
# This could generally happen due to the fact that Jamba does support # This could generally happen due to the fact that hybrid does support
# statelessness mechanism where it can cleanup new incoming requests in # statelessness mechanism where it can cleanup new incoming requests in
# a single step. # a single step.
try: try:
with vllm_runner(model, dtype=dtype, max_num_seqs=10) as vllm_model: with vllm_runner(model, dtype=dtype, max_num_seqs=10) as vllm_model:
vllm_model.generate_greedy([example_prompts[0]] * 100, 10) vllm_model.generate_greedy([example_prompts[0]] * 100, 10)
except ValueError: except ValueError:
pytest.fail("Jamba inner state wasn't cleaned up properly between" pytest.fail("Hybrid inner state wasn't cleaned up properly between"
"steps finished requests registered unnecessarily ") "steps finished requests registered unnecessarily ")
...@@ -273,14 +284,14 @@ def test_state_cleanup( ...@@ -273,14 +284,14 @@ def test_state_cleanup(
dtype: str, dtype: str,
example_prompts, example_prompts,
) -> None: ) -> None:
# This test is for verifying that the Jamba state is cleaned up between # This test is for verifying that the Hybrid state is cleaned up between
# steps, If its not cleaned, an error would be expected. # steps, If its not cleaned, an error would be expected.
try: try:
with vllm_runner(model, dtype=dtype) as vllm_model: with vllm_runner(model, dtype=dtype) as vllm_model:
for _ in range(10): for _ in range(10):
vllm_model.generate_greedy([example_prompts[0]] * 100, 1) vllm_model.generate_greedy([example_prompts[0]] * 100, 1)
except ValueError: except ValueError:
pytest.fail("Jamba inner state wasn't cleaned up between states, " pytest.fail("Hybrid inner state wasn't cleaned up between states, "
"could be related to finished_requests_ids") "could be related to finished_requests_ids")
...@@ -326,7 +337,7 @@ def test_multistep_correctness(vllm_runner, model: str, dtype: str, ...@@ -326,7 +337,7 @@ def test_multistep_correctness(vllm_runner, model: str, dtype: str,
@pytest.mark.parametrize("model", MODELS) @pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", ["float"]) @pytest.mark.parametrize("dtype", ["float"])
@pytest.mark.parametrize("max_tokens", [64]) @pytest.mark.parametrize("max_tokens", [64])
def test_jamba_distributed_produces_identical_generation( def test_hybrid_distributed_produces_identical_generation(
vllm_runner, model: str, dtype: str, max_tokens: int, vllm_runner, model: str, dtype: str, max_tokens: int,
example_prompts) -> None: example_prompts) -> None:
......
...@@ -5,6 +5,7 @@ Run `pytest tests/models/test_mamba.py`. ...@@ -5,6 +5,7 @@ Run `pytest tests/models/test_mamba.py`.
""" """
import os import os
import pytest import pytest
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer from transformers import AutoModelForCausalLM, AutoTokenizer
from vllm.engine.arg_utils import EngineArgs from vllm.engine.arg_utils import EngineArgs
...@@ -13,7 +14,14 @@ from vllm.sampling_params import SamplingParams ...@@ -13,7 +14,14 @@ from vllm.sampling_params import SamplingParams
from ...utils import check_outputs_equal from ...utils import check_outputs_equal
from ....utils import models_path_prefix from ....utils import models_path_prefix
MODELS = [os.path.join(models_path_prefix, "state-spaces/mamba-130m-hf"), os.path.join(models_path_prefix, "tiiuae/falcon-mamba-tiny-dev")] MODELS = [
os.path.join(models_path_prefix, "state-spaces/mamba-130m-hf"),
os.path.join(models_path_prefix, "tiiuae/falcon-mamba-tiny-dev"),
# TODO: Compare to a Mamba2 model. The HF transformers implementation of
# Mamba2 is buggy for Codestral as it doesn't handle n_groups.
# See https://github.com/huggingface/transformers/pull/35943
# "mistralai/Mamba-Codestral-7B-v0.1",
]
# Use lower-level interfaces to create this greedy generator, as mamba will # Use lower-level interfaces to create this greedy generator, as mamba will
...@@ -23,6 +31,10 @@ def generate_greedy(model_name, example_prompts, max_tokens): ...@@ -23,6 +31,10 @@ def generate_greedy(model_name, example_prompts, max_tokens):
tokenizer = AutoTokenizer.from_pretrained(model_name) tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name) model = AutoModelForCausalLM.from_pretrained(model_name)
# Set the device (GPU if available, else CPU)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
# Generate texts from the prompts # Generate texts from the prompts
outputs = [] outputs = []
for prompt in example_prompts: for prompt in example_prompts:
...@@ -31,7 +43,9 @@ def generate_greedy(model_name, example_prompts, max_tokens): ...@@ -31,7 +43,9 @@ def generate_greedy(model_name, example_prompts, max_tokens):
input_ids = inputs["input_ids"].to(model.device) input_ids = inputs["input_ids"].to(model.device)
# Generate text using the model's generate method directly # Generate text using the model's generate method directly
generated_ids = model.generate(input_ids, max_new_tokens=max_tokens) generated_ids = model.generate(input_ids,
max_new_tokens=max_tokens,
do_sample=False)
generated_text = tokenizer.decode(generated_ids[0], generated_text = tokenizer.decode(generated_ids[0],
skip_special_tokens=True) skip_special_tokens=True)
...@@ -52,7 +66,8 @@ def test_models( ...@@ -52,7 +66,8 @@ def test_models(
) -> None: ) -> None:
hf_outputs = generate_greedy(model, example_prompts, max_tokens) hf_outputs = generate_greedy(model, example_prompts, max_tokens)
with vllm_runner(model, dtype=dtype) as vllm_model: # Set max_num_seqs to keep Codestral from going OOM at fp32
with vllm_runner(model, dtype=dtype, max_num_seqs=16) as vllm_model:
vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens) vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
# This test is for verifying whether the model's extra_repr # This test is for verifying whether the model's extra_repr
...@@ -83,7 +98,7 @@ def test_batching( ...@@ -83,7 +98,7 @@ def test_batching(
) -> None: ) -> None:
# To pass the small model tests, we need full precision. # To pass the small model tests, we need full precision.
for_loop_outputs = [] for_loop_outputs = []
with vllm_runner(model, dtype=dtype) as vllm_model: with vllm_runner(model, dtype=dtype, max_num_seqs=16) as vllm_model:
for prompt in example_prompts: for prompt in example_prompts:
for_loop_outputs.append( for_loop_outputs.append(
vllm_model.generate_greedy([prompt], max_tokens)[0]) vllm_model.generate_greedy([prompt], max_tokens)[0])
...@@ -167,20 +182,22 @@ def test_parallel_sampling( ...@@ -167,20 +182,22 @@ def test_parallel_sampling(
max_tokens: int, max_tokens: int,
) -> None: ) -> None:
with vllm_runner(model, dtype=dtype) as vllm_model: # Numerical differences produce slightly different output for these
if 'state-spaces' in model:
example_prompts.pop(0)
example_prompts.pop(0)
example_prompts.pop(0)
with vllm_runner(model, dtype=dtype, max_num_seqs=16) as vllm_model:
for_loop_outputs = [] for_loop_outputs = []
for _ in range(10): for _ in range(10):
for_loop_outputs.append( for_loop_outputs.append(
# using example_prompts index 1 instead of 0 since with 0 the vllm_model.generate_greedy(example_prompts, max_tokens)[0])
# logprobs get really close and the test doesn't pass
vllm_model.generate_greedy([example_prompts[1]], max_tokens)
[0])
sampling_params = SamplingParams(n=10, sampling_params = SamplingParams(n=10,
temperature=0.001, temperature=0.001,
seed=0, seed=0,
max_tokens=max_tokens) max_tokens=max_tokens)
n_lt_1_outputs = vllm_model.generate([example_prompts[1]], n_lt_1_outputs = vllm_model.generate(example_prompts, sampling_params)
sampling_params)
token_ids, texts = n_lt_1_outputs[0] token_ids, texts = n_lt_1_outputs[0]
n_lt_1_outputs = [(token_id, text) n_lt_1_outputs = [(token_id, text)
for token_id, text in zip(token_ids, texts)] for token_id, text in zip(token_ids, texts)]
...@@ -234,7 +251,7 @@ def test_models_preemption_recompute( ...@@ -234,7 +251,7 @@ def test_models_preemption_recompute(
# Tests that outputs are identical with and w/o preemptions (recompute) # Tests that outputs are identical with and w/o preemptions (recompute)
assert dtype == "float" assert dtype == "float"
with vllm_runner(model, dtype=dtype) as vllm_model: with vllm_runner(model, dtype=dtype, max_num_seqs=16) as vllm_model:
vllm_model.model.llm_engine.scheduler[ vllm_model.model.llm_engine.scheduler[
0].ENABLE_ARTIFICIAL_PREEMPT = True 0].ENABLE_ARTIFICIAL_PREEMPT = True
preempt_vllm_outputs = vllm_model.generate_greedy( preempt_vllm_outputs = vllm_model.generate_greedy(
...@@ -285,7 +302,7 @@ def test_state_cleanup( ...@@ -285,7 +302,7 @@ def test_state_cleanup(
# This test is for verifying that the Mamba state is cleaned up between # This test is for verifying that the Mamba state is cleaned up between
# steps, If its not cleaned, an error would be expected. # steps, If its not cleaned, an error would be expected.
try: try:
with vllm_runner(model, dtype=dtype) as vllm_model: with vllm_runner(model, dtype=dtype, max_num_seqs=16) as vllm_model:
for _ in range(10): for _ in range(10):
vllm_model.generate_greedy([example_prompts[0]] * 100, 1) vllm_model.generate_greedy([example_prompts[0]] * 100, 1)
except ValueError: except ValueError:
......
...@@ -28,6 +28,9 @@ from ....utils import models_path_prefix ...@@ -28,6 +28,9 @@ from ....utils import models_path_prefix
os.path.join(models_path_prefix, "google/gemma-1.1-2b-it"), # gemma os.path.join(models_path_prefix, "google/gemma-1.1-2b-it"), # gemma
marks=[pytest.mark.core_model, pytest.mark.cpu_model], marks=[pytest.mark.core_model, pytest.mark.cpu_model],
), ),
pytest.param(
os.path.join(models_path_prefix, "THUDM/chatglm3-6b"), # chatglm (text-only)
),
pytest.param( pytest.param(
os.path.join(models_path_prefix, "meta-llama/Llama-3.2-1B-Instruct"), # llama os.path.join(models_path_prefix, "meta-llama/Llama-3.2-1B-Instruct"), # llama
marks=[pytest.mark.core_model, pytest.mark.cpu_model], marks=[pytest.mark.core_model, pytest.mark.cpu_model],
...@@ -45,6 +48,9 @@ from ....utils import models_path_prefix ...@@ -45,6 +48,9 @@ from ....utils import models_path_prefix
os.path.join(models_path_prefix, "microsoft/phi-2"), # phi os.path.join(models_path_prefix, "microsoft/phi-2"), # phi
marks=[pytest.mark.core_model], marks=[pytest.mark.core_model],
), ),
pytest.param(
os.path.join(models_path_prefix, "Qwen/Qwen-7B"), # qwen (text-only)
),
pytest.param( pytest.param(
os.path.join(models_path_prefix, "Qwen/Qwen2.5-0.5B-Instruct"), # qwen2 os.path.join(models_path_prefix, "Qwen/Qwen2.5-0.5B-Instruct"), # qwen2
marks=[pytest.mark.core_model], marks=[pytest.mark.core_model],
...@@ -70,6 +76,10 @@ def test_models( ...@@ -70,6 +76,10 @@ def test_models(
) -> None: ) -> None:
with hf_runner(model, dtype=dtype) as hf_model: with hf_runner(model, dtype=dtype) as hf_model:
if model.startswith("THUDM/chatglm3"):
hf_model.model.get_output_embeddings = lambda: \
hf_model.model.transformer.output_layer
hf_outputs = hf_model.generate_greedy_logprobs_limit( hf_outputs = hf_model.generate_greedy_logprobs_limit(
example_prompts, max_tokens, num_logprobs) example_prompts, max_tokens, num_logprobs)
......
...@@ -157,10 +157,7 @@ VLM_TEST_SETTINGS = { ...@@ -157,10 +157,7 @@ VLM_TEST_SETTINGS = {
auto_cls=AutoModelForVision2Seq, auto_cls=AutoModelForVision2Seq,
vllm_output_post_proc=model_utils.qwen2_vllm_to_hf_output, vllm_output_post_proc=model_utils.qwen2_vllm_to_hf_output,
image_size_factors=[(), (0.25,), (0.25, 0.25, 0.25), (0.25, 0.2, 0.15)], image_size_factors=[(), (0.25,), (0.25, 0.25, 0.25), (0.25, 0.2, 0.15)],
marks=[pytest.mark.skipif( marks=[pytest.mark.core_model, pytest.mark.cpu_model],
TRANSFORMERS_VERSION < "4.49.0",
reason="HF model requires transformers>=4.49.0",
), pytest.mark.core_model, pytest.mark.cpu_model],
), ),
#### Extended model tests #### Extended model tests
"aria": VLMTestInfo( "aria": VLMTestInfo(
...@@ -217,7 +214,6 @@ VLM_TEST_SETTINGS = { ...@@ -217,7 +214,6 @@ VLM_TEST_SETTINGS = {
"cherry_blossom": "<image>\nPlease infer the season with reason in details.", # noqa: E501 "cherry_blossom": "<image>\nPlease infer the season with reason in details.", # noqa: E501
}), }),
multi_image_prompt="image_1:<image>\nimage_2:<image>\nWhich image can we see the car and the tower?", # noqa: E501 multi_image_prompt="image_1:<image>\nimage_2:<image>\nWhich image can we see the car and the tower?", # noqa: E501
vllm_runner_kwargs={"hf_overrides": {"architectures": ["DeepseekVLV2ForCausalLM"]}}, # noqa: E501
patch_hf_runner=model_utils.deepseekvl2_patch_hf_runner, patch_hf_runner=model_utils.deepseekvl2_patch_hf_runner,
postprocess_inputs=model_utils.cast_dtype_post_processor("images"), postprocess_inputs=model_utils.cast_dtype_post_processor("images"),
hf_output_post_proc=model_utils.deepseekvl2_trunc_hf_output, hf_output_post_proc=model_utils.deepseekvl2_trunc_hf_output,
...@@ -353,7 +349,6 @@ VLM_TEST_SETTINGS = { ...@@ -353,7 +349,6 @@ VLM_TEST_SETTINGS = {
postprocess_inputs=model_utils.cast_dtype_post_processor( postprocess_inputs=model_utils.cast_dtype_post_processor(
"pixel_values" "pixel_values"
), ),
vllm_runner_kwargs={"hf_overrides": {"architectures": ["MantisForConditionalGeneration"]}}, # noqa: E501
get_stop_token_ids=lambda tok: [128009], get_stop_token_ids=lambda tok: [128009],
auto_cls=AutoModelForVision2Seq, auto_cls=AutoModelForVision2Seq,
vllm_output_post_proc=model_utils.mantis_vllm_to_hf_output, vllm_output_post_proc=model_utils.mantis_vllm_to_hf_output,
...@@ -406,11 +401,10 @@ VLM_TEST_SETTINGS = { ...@@ -406,11 +401,10 @@ VLM_TEST_SETTINGS = {
"molmo": VLMTestInfo( "molmo": VLMTestInfo(
models=["allenai/Molmo-7B-D-0924"], models=["allenai/Molmo-7B-D-0924"],
test_type=(VLMTestType.IMAGE), test_type=(VLMTestType.IMAGE),
prompt_formatter=lambda img_prompt:"User: " + img_prompt + " Assistant:", # noqa: E501 prompt_formatter=identity,
max_model_len=4096, max_model_len=4096,
max_num_seqs=2, max_num_seqs=2,
image_size_factors=[(),(1.0, 1.0, 1.0)], patch_hf_runner=model_utils.molmo_patch_hf_runner,
patch_hf_runner=model_utils.mlomo_patch_hf_runner,
postprocess_inputs=model_utils.molmo_post_processor, postprocess_inputs=model_utils.molmo_post_processor,
), ),
# Tests for phi3v currently live in another file because of a bug in # Tests for phi3v currently live in another file because of a bug in
...@@ -440,7 +434,7 @@ VLM_TEST_SETTINGS = { ...@@ -440,7 +434,7 @@ VLM_TEST_SETTINGS = {
auto_cls=AutoModelForVision2Seq, auto_cls=AutoModelForVision2Seq,
marks=[large_gpu_mark(min_gb=48)], marks=[large_gpu_mark(min_gb=48)],
), ),
"qwen": VLMTestInfo( "qwen_vl": VLMTestInfo(
models=[os.path.join(models_path_prefix, "Qwen/Qwen-VL")], models=[os.path.join(models_path_prefix, "Qwen/Qwen-VL")],
test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE), test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
prompt_formatter=identity, prompt_formatter=identity,
......
...@@ -4,12 +4,14 @@ from typing import Any, Callable, Dict, List, Optional, Tuple, Type, Union ...@@ -4,12 +4,14 @@ from typing import Any, Callable, Dict, List, Optional, Tuple, Type, Union
import torch import torch
from PIL.Image import Image from PIL.Image import Image
from transformers import AutoTokenizer, BatchEncoding, PreTrainedTokenizerBase from transformers import BatchEncoding
from transformers.models.auto.auto_factory import _BaseAutoModelClass from transformers.models.auto.auto_factory import _BaseAutoModelClass
from vllm.config import TaskOption from vllm.config import TaskOption
from vllm.transformers_utils.tokenizer import AnyTokenizer
from .....conftest import HfRunner, VllmRunner from .....conftest import HfRunner, VllmRunner
from ....registry import HF_EXAMPLE_MODELS
from .types import RunnerOutput from .types import RunnerOutput
...@@ -31,10 +33,8 @@ def run_test( ...@@ -31,10 +33,8 @@ def run_test(
use_tokenizer_eos: bool, use_tokenizer_eos: bool,
postprocess_inputs: Callable[[BatchEncoding], BatchEncoding], postprocess_inputs: Callable[[BatchEncoding], BatchEncoding],
comparator: Callable[..., None], comparator: Callable[..., None],
get_stop_token_ids: Optional[Callable[[PreTrainedTokenizerBase], get_stop_token_ids: Optional[Callable[[AnyTokenizer], list[int]]],
List[int]]],
stop_str: Optional[List[str]], stop_str: Optional[List[str]],
tokenizer_mode: str,
limit_mm_per_prompt: Dict[str, int], limit_mm_per_prompt: Dict[str, int],
vllm_runner_kwargs: Optional[Dict[str, Any]], vllm_runner_kwargs: Optional[Dict[str, Any]],
hf_model_kwargs: Optional[Dict[str, Any]], hf_model_kwargs: Optional[Dict[str, Any]],
...@@ -48,7 +48,10 @@ def run_test( ...@@ -48,7 +48,10 @@ def run_test(
"""Modality agnostic test test executor for comparing HF/vLLM outputs.""" """Modality agnostic test test executor for comparing HF/vLLM outputs."""
# In the case of embeddings, vLLM takes separate input tensors # In the case of embeddings, vLLM takes separate input tensors
vllm_inputs = vllm_embeddings if vllm_embeddings is not None else inputs vllm_inputs = vllm_embeddings if vllm_embeddings is not None else inputs
tokenizer = AutoTokenizer.from_pretrained(model, trust_remote_code=True)
model_info = HF_EXAMPLE_MODELS.find_hf_info(model)
model_info.check_available_online(on_fail="skip")
model_info.check_transformers_version(on_fail="skip")
vllm_outputs_per_mm = [] vllm_outputs_per_mm = []
hf_outputs_per_mm = [] hf_outputs_per_mm = []
...@@ -57,17 +60,19 @@ def run_test( ...@@ -57,17 +60,19 @@ def run_test(
# vLLM needs a fresh new process without cuda initialization. # vLLM needs a fresh new process without cuda initialization.
# if we run HF first, the cuda initialization will be done and it # if we run HF first, the cuda initialization will be done and it
# will hurt multiprocessing backend with fork method (the default method). # will hurt multiprocessing backend with fork method (the default method).
vllm_kwargs: Dict[str, Any] = {}
if get_stop_token_ids is not None:
vllm_kwargs["stop_token_ids"] = get_stop_token_ids(tokenizer)
if stop_str:
vllm_kwargs["stop"] = stop_str
if vllm_runner_kwargs is None: vllm_runner_kwargs_: Dict[str, Any] = {}
vllm_runner_kwargs = {} if model_info.tokenizer:
vllm_runner_kwargs_["tokenizer"] = model_info.tokenizer
if model_info.tokenizer_mode:
vllm_runner_kwargs_["tokenizer_mode"] = model_info.tokenizer_mode
if model_info.hf_overrides:
vllm_runner_kwargs_["hf_overrides"] = model_info.hf_overrides
if vllm_runner_kwargs:
vllm_runner_kwargs_.update(vllm_runner_kwargs)
with vllm_runner(model, with vllm_runner(model,
tokenizer_mode=tokenizer_mode,
max_model_len=max_model_len, max_model_len=max_model_len,
max_num_seqs=max_num_seqs, max_num_seqs=max_num_seqs,
dtype=dtype, dtype=dtype,
...@@ -76,7 +81,15 @@ def run_test( ...@@ -76,7 +81,15 @@ def run_test(
distributed_executor_backend=distributed_executor_backend, distributed_executor_backend=distributed_executor_backend,
enforce_eager=enforce_eager, enforce_eager=enforce_eager,
task=task, task=task,
**vllm_runner_kwargs) as vllm_model: **vllm_runner_kwargs_) as vllm_model:
tokenizer = vllm_model.model.get_tokenizer()
vllm_kwargs: Dict[str, Any] = {}
if get_stop_token_ids is not None:
vllm_kwargs["stop_token_ids"] = get_stop_token_ids(tokenizer)
if stop_str:
vllm_kwargs["stop"] = stop_str
for prompts, media in vllm_inputs: for prompts, media in vllm_inputs:
vllm_kwargs[runner_mm_key] = media vllm_kwargs[runner_mm_key] = media
vllm_output = vllm_model.generate_greedy_logprobs( vllm_output = vllm_model.generate_greedy_logprobs(
...@@ -93,16 +106,19 @@ def run_test( ...@@ -93,16 +106,19 @@ def run_test(
if patch_hf_runner is not None: if patch_hf_runner is not None:
hf_model = patch_hf_runner(hf_model) hf_model = patch_hf_runner(hf_model)
# Some models need to explicitly pass the eos_token_id off the tokenizer or
# processor for a good comparison; currently assume processor/tokenizer
# agree on the EOS, and pull it off the tokenizer if requested.
hf_kwargs = {}
if use_tokenizer_eos:
hf_kwargs["eos_token_id"] = tokenizer.eos_token_id
if stop_str:
hf_kwargs["stop_strings"] = stop_str
with hf_model, torch.no_grad(): with hf_model, torch.no_grad():
tokenizer = hf_model.tokenizer
# Some models need to explicitly pass the eos_token_id off the tokenizer
# or processor for a good comparison;
# currently assume processor/tokenizer agree on the EOS, and pull it off
# the tokenizer if requested.
hf_kwargs = {}
if use_tokenizer_eos:
hf_kwargs["eos_token_id"] = tokenizer.eos_token_id
if stop_str:
hf_kwargs["stop_strings"] = stop_str
for prompts, media in inputs: for prompts, media in inputs:
hf_kwargs[runner_mm_key] = media hf_kwargs[runner_mm_key] = media
hf_output = hf_model.generate_greedy_logprobs_limit( hf_output = hf_model.generate_greedy_logprobs_limit(
......
...@@ -6,7 +6,7 @@ typically specific to a small subset of models. ...@@ -6,7 +6,7 @@ typically specific to a small subset of models.
import re import re
import types import types
from pathlib import PosixPath from pathlib import PosixPath
from typing import Any, Callable, Dict, List, Optional, Tuple, Union from typing import Callable, List, Optional, Tuple, Union
import torch import torch
from PIL.Image import Image from PIL.Image import Image
...@@ -17,9 +17,7 @@ from vllm.sequence import SampleLogprobs ...@@ -17,9 +17,7 @@ from vllm.sequence import SampleLogprobs
from vllm.transformers_utils.tokenizer import patch_padding_side from vllm.transformers_utils.tokenizer import patch_padding_side
from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE
from .....conftest import (HfRunner, ImageAsset, PromptAudioInput, from .....conftest import HfRunner, ImageAsset, _ImageAssets
PromptImageInput, PromptVideoInput, _ImageAssets)
from ....utils import TokensTextLogprobs
from .types import RunnerOutput from .types import RunnerOutput
...@@ -522,74 +520,7 @@ def minicpmo_patch_hf_runner(hf_model: HfRunner) -> HfRunner: ...@@ -522,74 +520,7 @@ def minicpmo_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
return hf_model return hf_model
def _generate_greedy_logprobs_limit( def molmo_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
self,
prompts: List[str],
max_tokens: int,
num_logprobs: int,
images: Optional[PromptImageInput] = None,
audios: Optional[PromptAudioInput] = None,
videos: Optional[PromptVideoInput] = None,
**kwargs: Any,
) -> List[TokensTextLogprobs]:
all_inputs = self.get_inputs(prompts,
images=images,
videos=videos,
audios=audios)
# Process in batches for inference.
if len(all_inputs):
input_ids_lst = []
images_lst = []
images_input_idx_lst = []
imges_masks_lst = []
for inputs in all_inputs:
input_ids_lst.append(inputs["input_ids"])
images_lst.append(inputs["images"])
images_input_idx_lst.append(inputs["image_input_idx"])
imges_masks_lst.append(inputs["image_masks"])
batch_inputs = {}
batch_inputs['input_ids'] = torch.cat(input_ids_lst, dim=0)
batch_inputs['images'] = torch.cat(images_lst, dim=0)
batch_inputs['image_input_idx'] = torch.cat(images_input_idx_lst,
dim=0)
batch_inputs['image_masks'] = torch.cat(imges_masks_lst, dim=0)
outputs = self.model.generate_from_batch(
batch=self.wrap_device(batch_inputs,
device=self.model.device.type),
generation_config=GenerationConfig(
max_new_tokens=max_tokens,
stop_strings="<|endoftext|>",
do_sample=False,
),
tokenizer=self.tokenizer,
output_hidden_states=True,
return_dict_in_generate=True,
)
all_logprobs: List[List[Dict[int, float]]] = []
all_output_ids: List[List[int]] = []
all_output_strs: List[str] = []
for index in range(len(all_inputs)):
(
seq_logprobs_lst,
output_len,
) = self._hidden_states_to_logprobs(outputs.hidden_states,
num_logprobs)
all_logprobs.append(seq_logprobs_lst)
seq_ids = outputs.sequences[index]
output_ids = seq_ids[-output_len:]
all_output_ids.append(output_ids.tolist())
all_output_strs.append(self.tokenizer.decode(output_ids))
outputs = zip(all_output_ids, all_output_strs, all_logprobs)
return [(output_ids, output_str, output_logprobs)
for output_ids, output_str, output_logprobs in outputs]
####### Molmo-specific HuggingFace runner patchers
def mlomo_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
"""Patches and returns an instance of the HfRunner to use for Molmo.""" """Patches and returns an instance of the HfRunner to use for Molmo."""
hf_processor = hf_model.processor hf_processor = hf_model.processor
...@@ -598,10 +529,23 @@ def mlomo_patch_hf_runner(hf_model: HfRunner) -> HfRunner: ...@@ -598,10 +529,23 @@ def mlomo_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
hf_model.processor = _processor hf_model.processor = _processor
setattr( # noqa: B010 def _generate(self, max_new_tokens=None, do_sample=None, **kwargs):
hf_model, batch = {
"generate_greedy_logprobs_limit", k: kwargs.pop(k)
types.MethodType(_generate_greedy_logprobs_limit, hf_model), for k in ("input_ids", "images", "image_input_idx", "image_masks")
) if k in kwargs
}
return self.generate_from_batch(
batch,
generation_config=GenerationConfig(
max_new_tokens=max_new_tokens,
stop_strings="<|endoftext|>",
do_sample=do_sample,
),
**kwargs,
)
hf_model.model.generate = types.MethodType(_generate, hf_model.model)
return hf_model return hf_model
...@@ -8,12 +8,12 @@ from typing import (Any, Callable, Dict, Iterable, List, NamedTuple, Optional, ...@@ -8,12 +8,12 @@ from typing import (Any, Callable, Dict, Iterable, List, NamedTuple, Optional,
import torch import torch
from PIL.Image import Image from PIL.Image import Image
from pytest import MarkDecorator from pytest import MarkDecorator
from transformers import (AutoModelForCausalLM, BatchEncoding, from transformers import AutoModelForCausalLM, BatchEncoding
PreTrainedTokenizerBase)
from transformers.models.auto.auto_factory import _BaseAutoModelClass from transformers.models.auto.auto_factory import _BaseAutoModelClass
from vllm.config import TaskOption from vllm.config import TaskOption
from vllm.sequence import SampleLogprobs from vllm.sequence import SampleLogprobs
from vllm.transformers_utils.tokenizer import AnyTokenizer
from vllm.utils import identity from vllm.utils import identity
from .....conftest import IMAGE_ASSETS, HfRunner, ImageAsset, _ImageAssets from .....conftest import IMAGE_ASSETS, HfRunner, ImageAsset, _ImageAssets
...@@ -100,8 +100,7 @@ class VLMTestInfo(NamedTuple): ...@@ -100,8 +100,7 @@ class VLMTestInfo(NamedTuple):
vllm_runner_kwargs: Optional[Dict[str, Any]] = None vllm_runner_kwargs: Optional[Dict[str, Any]] = None
# Optional callable which gets a list of token IDs from the model tokenizer # Optional callable which gets a list of token IDs from the model tokenizer
get_stop_token_ids: Optional[Callable[[PreTrainedTokenizerBase], get_stop_token_ids: Optional[Callable[[AnyTokenizer], list[int]]] = None
List[int]]] = None
# Optional list of strings to stop generation, useful when stop tokens are # Optional list of strings to stop generation, useful when stop tokens are
# not special tokens in the tokenizer # not special tokens in the tokenizer
stop_str: Optional[List[str]] = None stop_str: Optional[List[str]] = None
...@@ -156,8 +155,6 @@ class VLMTestInfo(NamedTuple): ...@@ -156,8 +155,6 @@ class VLMTestInfo(NamedTuple):
marks: Optional[List[MarkDecorator]] = None marks: Optional[List[MarkDecorator]] = None
tokenizer_mode: str = "auto"
def get_non_parametrized_runner_kwargs(self): def get_non_parametrized_runner_kwargs(self):
"""Returns a dictionary of expandable kwargs for items that are used """Returns a dictionary of expandable kwargs for items that are used
in all test types, which are NOT used when creating the parametrized in all test types, which are NOT used when creating the parametrized
...@@ -180,7 +177,6 @@ class VLMTestInfo(NamedTuple): ...@@ -180,7 +177,6 @@ class VLMTestInfo(NamedTuple):
"hf_model_kwargs": self.hf_model_kwargs, "hf_model_kwargs": self.hf_model_kwargs,
"stop_str": self.stop_str, "stop_str": self.stop_str,
"patch_hf_runner": self.patch_hf_runner, "patch_hf_runner": self.patch_hf_runner,
"tokenizer_mode": self.tokenizer_mode
} }
......
...@@ -8,11 +8,11 @@ import torch ...@@ -8,11 +8,11 @@ import torch
from transformers import (AutoConfig, AutoModelForVision2Seq, AutoTokenizer, from transformers import (AutoConfig, AutoModelForVision2Seq, AutoTokenizer,
BatchEncoding) BatchEncoding)
from vllm import LLM, SamplingParams
from vllm.attention.backends.flash_attn import FlashAttentionMetadata from vllm.attention.backends.flash_attn import FlashAttentionMetadata
from vllm.attention.selector import (_Backend, _cached_get_attn_backend, from vllm.attention.selector import (_Backend, _cached_get_attn_backend,
global_force_attn_backend_context_manager) global_force_attn_backend_context_manager)
from vllm.model_executor.models.mllama import (MLLAMA_IMAGE_TOKEN_ID, from vllm.model_executor.models.mllama import MllamaForConditionalGeneration
MllamaForConditionalGeneration)
from vllm.multimodal.image import rescale_image_size from vllm.multimodal.image import rescale_image_size
from vllm.sequence import SampleLogprobs from vllm.sequence import SampleLogprobs
...@@ -23,6 +23,7 @@ from ...utils import check_logprobs_close ...@@ -23,6 +23,7 @@ from ...utils import check_logprobs_close
from ....utils import models_path_prefix from ....utils import models_path_prefix
_LIMIT_IMAGE_PER_PROMPT = 3 _LIMIT_IMAGE_PER_PROMPT = 3
MLLAMA_IMAGE_TOKEN_ID = 128256
LIST_ENC_DEC_SUPPORTED_BACKENDS = [_Backend.XFORMERS, _Backend.FLASH_ATTN] LIST_ENC_DEC_SUPPORTED_BACKENDS = [_Backend.XFORMERS, _Backend.FLASH_ATTN]
...@@ -398,6 +399,64 @@ def test_models_interleaved_images(hf_runner, vllm_runner, image_assets, model, ...@@ -398,6 +399,64 @@ def test_models_interleaved_images(hf_runner, vllm_runner, image_assets, model,
) )
@large_gpu_test(min_gb=48)
@pytest.mark.core_model
@pytest.mark.parametrize("model", models)
@pytest.mark.parametrize("dtype", ["bfloat16"])
@pytest.mark.parametrize("max_tokens", [32])
def test_explicit_implicit_prompt(
    image_assets: _ImageAssets,
    model: str,
    dtype: str,
    max_tokens: int,
):
    """Compare explicit encoder/decoder prompts against implicit prompts.

    Two prompts are written in the explicit ``encoder_prompt`` /
    ``decoder_prompt`` form and two in the implicit ``prompt`` form. All four
    are generated in one call with ``temperature=0``, and the text of the
    i-th explicit output must equal the text of the i-th implicit output.
    """
    stop_sign = image_assets[0].pil_image
    sky_text = "The color of the sky is blue but sometimes it can also be"

    # yapf: disable
    explicit_prompts = [
        # image prompt, decoder side given as token ids
        {
            "encoder_prompt": {
                "prompt": "<|image|>",
                "multi_modal_data": {"image": stop_sign},
            },
            "decoder_prompt": {
                "prompt_token_ids": [128000, 791, 2262, 315, 279, 2217, 220, 128256, 374],  # noqa: E501
            }
        },
        # text-only prompt
        {
            "encoder_prompt": "Not <|image|>",
            "decoder_prompt": sky_text,
        },
    ]
    implicit_prompts = [
        # image prompt
        {
            "prompt": "<|begin_of_text|>The content of the image <|image|> is",  # noqa: E501
            "multi_modal_data": {"image": stop_sign},
        },
        # text-only prompt
        {
            "prompt": sky_text,
        },
    ]
    # yapf: enable

    llm = LLM(
        model=model,
        dtype=dtype,
        max_model_len=4096,
        max_num_seqs=2,
        tensor_parallel_size=1,
        enforce_eager=True,
    )
    sampling_params = SamplingParams(
        temperature=0,
        max_tokens=max_tokens,
    )

    # Explicit prompts come first so the outputs can be split at a known index.
    outputs = llm.generate(explicit_prompts + implicit_prompts,
                           sampling_params)
    split_at = len(explicit_prompts)

    for explicit_out, implicit_out in zip(outputs[:split_at],
                                          outputs[split_at:]):
        assert explicit_out.outputs[0].text == implicit_out.outputs[0].text
@large_gpu_test(min_gb=48) @large_gpu_test(min_gb=48)
@pytest.mark.core_model @pytest.mark.core_model
@pytest.mark.parametrize("model", models) @pytest.mark.parametrize("model", models)
...@@ -460,6 +519,10 @@ def test_regression(vllm_runner, image_assets, model, dtype, max_tokens, ...@@ -460,6 +519,10 @@ def test_regression(vllm_runner, image_assets, model, dtype, max_tokens,
images=images) images=images)
class DummyModel:
    """Minimal stand-in object that exposes only ``image_token_id``."""

    # Reuses the module-level Mllama image token id constant.
    image_token_id = MLLAMA_IMAGE_TOKEN_ID
@pytest.mark.core_model @pytest.mark.core_model
@pytest.mark.parametrize( @pytest.mark.parametrize(
"input_indices_and_output", "input_indices_and_output",
...@@ -501,7 +564,7 @@ def test_get_cross_attention_mask(input_indices_and_output) -> None: ...@@ -501,7 +564,7 @@ def test_get_cross_attention_mask(input_indices_and_output) -> None:
use_cuda_graph=False, use_cuda_graph=False,
) )
dummy: dict[str, str] = {} dummy = DummyModel()
cross_attention_mask, kv_range_for_decode = MllamaForConditionalGeneration\ cross_attention_mask, kv_range_for_decode = MllamaForConditionalGeneration\
.get_cross_attention_mask(dummy, .get_cross_attention_mask(dummy,
...@@ -558,7 +621,7 @@ def test_get_full_text_row_masked_out_mask(input_indices) -> None: ...@@ -558,7 +621,7 @@ def test_get_full_text_row_masked_out_mask(input_indices) -> None:
use_cuda_graph=False, use_cuda_graph=False,
) )
dummy: dict[str, str] = {} dummy = DummyModel()
full_text_row_masked_out_mask = MllamaForConditionalGeneration\ full_text_row_masked_out_mask = MllamaForConditionalGeneration\
.get_full_text_row_masked_out_mask(dummy, .get_full_text_row_masked_out_mask(dummy,
......
...@@ -10,7 +10,7 @@ from vllm.config import ModelConfig ...@@ -10,7 +10,7 @@ from vllm.config import ModelConfig
from vllm.inputs import InputProcessingContext from vllm.inputs import InputProcessingContext
from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal import MULTIMODAL_REGISTRY
from vllm.multimodal.processing import ProcessingCache from vllm.multimodal.processing import ProcessingCache
from vllm.multimodal.utils import cached_get_tokenizer from vllm.transformers_utils.tokenizer import cached_tokenizer_from_config
from ....multimodal.utils import random_audio, random_image, random_video from ....multimodal.utils import random_audio, random_image, random_video
from ...registry import HF_EXAMPLE_MODELS from ...registry import HF_EXAMPLE_MODELS
...@@ -42,10 +42,7 @@ def _test_processing_correctness( ...@@ -42,10 +42,7 @@ def _test_processing_correctness(
factories = MULTIMODAL_REGISTRY._processor_factories[model_cls] factories = MULTIMODAL_REGISTRY._processor_factories[model_cls]
ctx = InputProcessingContext( ctx = InputProcessingContext(
model_config, model_config,
tokenizer=cached_get_tokenizer( tokenizer=cached_tokenizer_from_config(model_config),
model_config.tokenizer,
trust_remote_code=model_info.trust_remote_code,
),
) )
# Ensure that it can fit all of the data # Ensure that it can fit all of the data
cache = ProcessingCache(capacity=1 << 30) cache = ProcessingCache(capacity=1 << 30)
...@@ -85,11 +82,19 @@ def _test_processing_correctness( ...@@ -85,11 +82,19 @@ def _test_processing_correctness(
partial(random_audio, rng, min_len=512, max_len=1024, sr=16000), partial(random_audio, rng, min_len=512, max_len=1024, sr=16000),
} }
tokenizer_encode_kwargs = {}
if model_config.hf_config.model_type == "mllama":
# For Mllama, the tokenizer always adds bos_token at the beginning of the
# prompt by default, causing hf_processor to output incorrect token ids.
# So we need to use `add_special_tokens=False` here to leave bos_token
# to be added by the processor.
tokenizer_encode_kwargs = {"add_special_tokens": False}
for batch_idx in range(num_batches): for batch_idx in range(num_batches):
mm_data = { mm_data = {
k: k:
[(input_to_hit[k] if rng.rand() < hit_rate else input_factory[k]()) [(input_to_hit[k] if rng.rand() < hit_rate else input_factory[k]())
for _ in range(rng.randint(limit))] for _ in range(rng.randint(limit + 1))]
for k, limit in limit_mm_per_prompt.items() for k, limit in limit_mm_per_prompt.items()
} }
...@@ -122,7 +127,7 @@ def _test_processing_correctness( ...@@ -122,7 +127,7 @@ def _test_processing_correctness(
f"Failed ({batch_idx=}, {prompt=}, {mm_data=})") f"Failed ({batch_idx=}, {prompt=}, {mm_data=})")
baseline_tokenized_result = baseline_processor.apply( baseline_tokenized_result = baseline_processor.apply(
tokenizer.encode(prompt), tokenizer.encode(prompt, **tokenizer_encode_kwargs),
mm_data=mm_data, mm_data=mm_data,
hf_processor_mm_kwargs={}, hf_processor_mm_kwargs={},
) )
...@@ -131,7 +136,7 @@ def _test_processing_correctness( ...@@ -131,7 +136,7 @@ def _test_processing_correctness(
f"Failed ({batch_idx=}, {prompt=}, {mm_data=})") f"Failed ({batch_idx=}, {prompt=}, {mm_data=})")
cached_tokenized_result = cached_processor.apply( cached_tokenized_result = cached_processor.apply(
tokenizer.encode(prompt), tokenizer.encode(prompt, **tokenizer_encode_kwargs),
mm_data=mm_data, mm_data=mm_data,
hf_processor_mm_kwargs={}, hf_processor_mm_kwargs={},
) )
...@@ -147,6 +152,7 @@ def _test_processing_correctness( ...@@ -147,6 +152,7 @@ def _test_processing_correctness(
"facebook/chameleon-7b", "facebook/chameleon-7b",
"deepseek-ai/deepseek-vl2-tiny", "deepseek-ai/deepseek-vl2-tiny",
"adept/fuyu-8b", "adept/fuyu-8b",
"THUDM/glm-4v-9b",
"h2oai/h2ovl-mississippi-800m", "h2oai/h2ovl-mississippi-800m",
"OpenGVLab/InternVL2-1B", "OpenGVLab/InternVL2-1B",
"HuggingFaceM4/Idefics3-8B-Llama3", "HuggingFaceM4/Idefics3-8B-Llama3",
...@@ -154,16 +160,19 @@ def _test_processing_correctness( ...@@ -154,16 +160,19 @@ def _test_processing_correctness(
"llava-hf/llava-v1.6-mistral-7b-hf", "llava-hf/llava-v1.6-mistral-7b-hf",
"llava-hf/LLaVA-NeXT-Video-7B-hf", "llava-hf/LLaVA-NeXT-Video-7B-hf",
"llava-hf/llava-onevision-qwen2-0.5b-ov-hf", "llava-hf/llava-onevision-qwen2-0.5b-ov-hf",
"meta-llama/Llama-3.2-11B-Vision-Instruct",
"TIGER-Lab/Mantis-8B-siglip-llama3", "TIGER-Lab/Mantis-8B-siglip-llama3",
"mistral-community/pixtral-12b", "mistral-community/pixtral-12b",
"openbmb/MiniCPM-o-2_6", "openbmb/MiniCPM-o-2_6",
"openbmb/MiniCPM-V-2_6", "openbmb/MiniCPM-V-2_6",
"allenai/Molmo-7B-D-0924",
"allenai/Molmo-7B-O-0924",
"nvidia/NVLM-D-72B", "nvidia/NVLM-D-72B",
"Qwen/Qwen-VL-Chat", "Qwen/Qwen-VL-Chat",
"Qwen/Qwen2-VL-2B-Instruct", "Qwen/Qwen2-VL-2B-Instruct",
"Qwen/Qwen2.5-VL-3B-Instruct", "Qwen/Qwen2.5-VL-3B-Instruct",
"Qwen/Qwen2-Audio-7B-Instruct", "Qwen/Qwen2-Audio-7B-Instruct",
"fixie-ai/ultravox-v0_3", "fixie-ai/ultravox-v0_5-llama-3_2-1b",
]) ])
@pytest.mark.parametrize("hit_rate", [0.3, 0.5, 1.0]) @pytest.mark.parametrize("hit_rate", [0.3, 0.5, 1.0])
@pytest.mark.parametrize("num_batches", [32]) @pytest.mark.parametrize("num_batches", [32])
......
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
"""Tests for H2OVL's multimodal preprocessing kwargs.""" """Tests for H2OVL's multimodal preprocessing kwargs."""
from typing import Optional from typing import Mapping, Optional
import pytest import pytest
from PIL import Image
from transformers import PretrainedConfig
from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal import MULTIMODAL_REGISTRY
from vllm.multimodal.image import rescale_image_size from vllm.multimodal.image import rescale_image_size
from vllm.multimodal.utils import cached_get_tokenizer from vllm.multimodal.processing import BaseMultiModalProcessor
from vllm.transformers_utils.tokenizer import cached_tokenizer_from_config
from ....conftest import _ImageAssets from ....conftest import _ImageAssets
from ...utils import build_model_context from ...utils import build_model_context
def _get_expected_num_patches(
    config: PretrainedConfig,
    image: Image.Image,
    num_imgs: int,
    min_num: int,
    max_num: int,
):
    """Return the number of patches expected for ``image`` under H2OVL.

    When there is exactly one image and ``config.use_msac`` is set, two
    passes are computed (minimum block counts 1 and 3, the second using the
    aspect ratio found by the first) and combined. Otherwise a single pass
    with ``min_num``/``max_num`` is used. In both cases a thumbnail patch is
    counted only if ``config.use_thumbnail`` is set and the pass produced
    more than one block.
    """
    from vllm.model_executor.models.h2ovl import (calculate_h2ovl_targets,
                                                  get_h2ovl_target_ratios)

    orig_width, orig_height = image.size

    def _targets(lower_bound, prior_ratio):
        # Thumbnails are added by the caller, so they are never requested here.
        return calculate_h2ovl_targets(
            orig_width=orig_width,
            orig_height=orig_height,
            target_ratios=get_h2ovl_target_ratios(
                min_num=lower_bound,
                max_num=max_num,
                prior_aspect_ratio=prior_ratio,
            ),
            image_size=config.vision_config.image_size,
            use_thumbnail=False,
        )

    if not (num_imgs == 1 and config.use_msac):
        # Single-pass path.
        single_pass_blocks, _, _, _ = _targets(min_num, None)
        if config.use_thumbnail and single_pass_blocks > 1:
            return single_pass_blocks + 1
        return single_pass_blocks

    # Two-pass path: the second pass reuses the first pass's aspect ratio.
    first_blocks, _, _, first_ratio = _targets(1, None)
    second_blocks, _, _, _ = _targets(3, first_ratio)

    # Each pass gets its own thumbnail, but only if it has several blocks.
    if config.use_thumbnail:
        if first_blocks > 1:
            first_blocks += 1
        if second_blocks > 1:
            second_blocks += 1

    # Sum of both passes minus the overlapping block.
    return first_blocks + second_blocks - 1
def _run_check(
    processor: BaseMultiModalProcessor,
    images: list[Image.Image],
    min_num: int,
    max_num: int,
    mm_processor_kwargs: Mapping[str, object],
):
    """Run ``processor`` on ``images`` and check its patch accounting.

    The expected patch count is summed over all images via
    ``_get_expected_num_patches``. The processed prompt must contain 256
    ``<IMG_CONTEXT>`` tokens per expected patch, and the first dimension of
    ``pixel_values_flat`` must equal the expected patch count.
    """
    tokenizer = processor.info.get_tokenizer()
    hf_config = processor.info.get_hf_config()

    num_imgs = len(images)
    expected_total = 0
    for img in images:
        expected_total += _get_expected_num_patches(hf_config, img, num_imgs,
                                                    min_num, max_num)

    # One "<image>" placeholder per input image.
    result = processor.apply(
        "<image>" * num_imgs,
        {"image": images},
        mm_processor_kwargs,
    )

    # Count the image context tokens that ended up in the prompt.
    context_token_id = tokenizer.convert_tokens_to_ids("<IMG_CONTEXT>")
    placeholder_count = result["prompt_token_ids"].count(context_token_id)
    pixel_shape = result["mm_kwargs"]["pixel_values_flat"].shape

    assert placeholder_count == 256 * expected_total
    assert pixel_shape[0] == expected_total
@pytest.mark.parametrize("model_id", [ @pytest.mark.parametrize("model_id", [
"h2oai/h2ovl-mississippi-800m", "h2oai/h2ovl-mississippi-800m",
"h2oai/h2ovl-mississippi-2b", "h2oai/h2ovl-mississippi-2b",
...@@ -25,118 +126,54 @@ from ...utils import build_model_context ...@@ -25,118 +126,54 @@ from ...utils import build_model_context
[1.0, 1.0, 1.0], [1.0, 1.0, 1.0],
# Multi-scale # Multi-scale
[0.25, 0.5, 1.0], [0.25, 0.5, 1.0],
[4.0, 2.0, 1.0],
], ],
) )
@pytest.mark.parametrize("max_dynamic_patch", [1, 2, 4, 8]) @pytest.mark.parametrize(
("min_dynamic_patch", "max_dynamic_patch"),
[(1, 1), (1, 2), (1, 4), (1, 8), (2, 4), (4, 8)],
)
@pytest.mark.parametrize("dynamic_image_size", [True, False]) @pytest.mark.parametrize("dynamic_image_size", [True, False])
@pytest.mark.parametrize("num_imgs", [1, 2]) @pytest.mark.parametrize("kwargs_on_init", [True, False])
def test_processor_override( def test_processor_override(
model_id: str, model_id: str,
image_assets: _ImageAssets, image_assets: _ImageAssets,
size_factors: list[int], size_factors: list[int],
min_dynamic_patch: int,
max_dynamic_patch: int, max_dynamic_patch: int,
dynamic_image_size: Optional[bool], dynamic_image_size: Optional[bool],
num_imgs: int, kwargs_on_init: bool,
): ):
from vllm.model_executor.models.h2ovl import (calculate_h2ovl_targets, mm_processor_kwargs = {
get_h2ovl_target_ratios) "min_dynamic_patch": min_dynamic_patch,
"max_dynamic_patch": max_dynamic_patch,
"dynamic_image_size": dynamic_image_size,
}
ctx = build_model_context( ctx = build_model_context(
model_name=model_id, model_name=model_id,
tokenizer_name=model_id, tokenizer_name=model_id,
trust_remote_code=True, trust_remote_code=True,
mm_processor_kwargs=None, mm_processor_kwargs=mm_processor_kwargs if kwargs_on_init else None,
limit_mm_per_prompt={"image": num_imgs}, limit_mm_per_prompt={"image": len(size_factors)},
)
tokenizer = cached_get_tokenizer(
ctx.model_config.tokenizer,
trust_remote_code=ctx.model_config.trust_remote_code,
) )
tokenizer = cached_tokenizer_from_config(ctx.model_config)
processor = MULTIMODAL_REGISTRY.create_processor( processor = MULTIMODAL_REGISTRY.create_processor(
ctx.model_config, ctx.model_config,
tokenizer=tokenizer, tokenizer=tokenizer,
) )
hf_processor_mm_kwargs = {} if kwargs_on_init else mm_processor_kwargs
config = processor.info.get_hf_config() min_num = min_dynamic_patch if dynamic_image_size else 1
use_msac = config.use_msac
mm_processor_kwargs = {
"max_dynamic_patch": max_dynamic_patch,
}
if dynamic_image_size is not None:
mm_processor_kwargs["dynamic_image_size"] = dynamic_image_size
min_num = config.min_dynamic_patch
max_num = max_dynamic_patch if dynamic_image_size else 1 max_num = max_dynamic_patch if dynamic_image_size else 1
# Build the image str / prompt based on the number of images we pass _run_check(
prompt = "<image>" * num_imgs processor,
[
for asset in image_assets: rescale_image_size(image_assets[0].pil_image, f)
for factor in size_factors: for f in size_factors
image = rescale_image_size(asset.pil_image, factor) ],
mm_data = {"image": [image] * num_imgs} min_num,
max_num,
width, height = image.size hf_processor_mm_kwargs,
)
# Calculate the expected number of blocks
if num_imgs == 1 and use_msac:
# First pass
blocks1, _, _, aspect_ratio = calculate_h2ovl_targets(
orig_width=width,
orig_height=height,
target_ratios=get_h2ovl_target_ratios(
min_num,
max_num,
prior_aspect_ratio=None,
),
image_size=config.vision_config.image_size,
use_thumbnail=False, # Thumbnail is handled separately
)
# Second pass
blocks2, _, _, _ = calculate_h2ovl_targets(
orig_width=width,
orig_height=height,
target_ratios=get_h2ovl_target_ratios(
min_num,
max_num,
prior_aspect_ratio=aspect_ratio,
),
image_size=config.vision_config.image_size,
use_thumbnail=False,
)
# Add thumbnail if use_thumbnail is True and total_blocks > 1
if config.use_thumbnail:
blocks1 += 1 if blocks1 > 1 else 0
blocks2 += 1 if blocks2 > 1 else 0
# Total blocks is the sum of blocks from both passes minus
# overlapping
total_blocks = blocks1 + blocks2 - 1
expected_num_patches = total_blocks
else:
blocks, _, _, _ = calculate_h2ovl_targets(
orig_width=width,
orig_height=height,
target_ratios=get_h2ovl_target_ratios(
min_num,
max_num,
prior_aspect_ratio=None,
),
image_size=config.vision_config.image_size,
use_thumbnail=False,
)
expected_num_patches = blocks
if config.use_thumbnail and expected_num_patches != 1:
expected_num_patches += 1
processed_inputs = processor.apply(prompt, mm_data,
mm_processor_kwargs)
pixel_shape = (
processed_inputs["mm_kwargs"]["pixel_values_flat"].shape)
assert pixel_shape[0] == expected_num_patches * num_imgs
...@@ -5,7 +5,7 @@ import pytest ...@@ -5,7 +5,7 @@ import pytest
from transformers import Idefics3Config from transformers import Idefics3Config
from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal import MULTIMODAL_REGISTRY
from vllm.multimodal.utils import cached_get_tokenizer from vllm.transformers_utils.tokenizer import cached_tokenizer_from_config
from ....conftest import _ImageAssets from ....conftest import _ImageAssets
from ...utils import build_model_context from ...utils import build_model_context
...@@ -24,9 +24,15 @@ models = [os.path.join(models_path_prefix, "HuggingFaceM4/Idefics3-8B-Llama3")] ...@@ -24,9 +24,15 @@ models = [os.path.join(models_path_prefix, "HuggingFaceM4/Idefics3-8B-Llama3")]
]) ])
# yapf: enable # yapf: enable
@pytest.mark.parametrize("num_imgs", [1, 2]) @pytest.mark.parametrize("num_imgs", [1, 2])
def test_processor_override(image_assets: _ImageAssets, model: str, @pytest.mark.parametrize("kwargs_on_init", [True, False])
mm_processor_kwargs: dict[str, object], def test_processor_override(
expected_toks_per_img: int, num_imgs: int): image_assets: _ImageAssets,
model: str,
mm_processor_kwargs: dict[str, object],
expected_toks_per_img: int,
num_imgs: int,
kwargs_on_init: bool,
):
"""Ensure input_processor_for_idefics3 handles num_crops properly.""" """Ensure input_processor_for_idefics3 handles num_crops properly."""
# Same as the previous test - don't initialize mm_processor_kwargs # Same as the previous test - don't initialize mm_processor_kwargs
# in this test and assume that the kwargs will be correctly expanded by # in this test and assume that the kwargs will be correctly expanded by
...@@ -35,15 +41,15 @@ def test_processor_override(image_assets: _ImageAssets, model: str, ...@@ -35,15 +41,15 @@ def test_processor_override(image_assets: _ImageAssets, model: str,
model_name=model, model_name=model,
tokenizer_name=model, tokenizer_name=model,
trust_remote_code=True, trust_remote_code=True,
mm_processor_kwargs=None, mm_processor_kwargs=mm_processor_kwargs if kwargs_on_init else None,
limit_mm_per_prompt={"image": num_imgs}, limit_mm_per_prompt={"image": num_imgs},
) )
tokenizer = cached_get_tokenizer(ctx.model_config.tokenizer) tokenizer = cached_tokenizer_from_config(ctx.model_config)
processor = MULTIMODAL_REGISTRY.create_processor( processor = MULTIMODAL_REGISTRY.create_processor(
ctx.model_config, ctx.model_config,
tokenizer=tokenizer, tokenizer=tokenizer,
) )
hf_processor = processor.info.get_hf_processor(**mm_processor_kwargs) hf_processor_mm_kwargs = {} if kwargs_on_init else mm_processor_kwargs
# Build the image str / prompt based on the number of images we pass # Build the image str / prompt based on the number of images we pass
placeholders = "<image>" if num_imgs == 1 else "\n".join( placeholders = "<image>" if num_imgs == 1 else "\n".join(
...@@ -56,8 +62,10 @@ def test_processor_override(image_assets: _ImageAssets, model: str, ...@@ -56,8 +62,10 @@ def test_processor_override(image_assets: _ImageAssets, model: str,
dummy_image = image_assets[0].pil_image.resize(dummy_image_size) dummy_image = image_assets[0].pil_image.resize(dummy_image_size)
mm_data = {"image": [dummy_image] * num_imgs} mm_data = {"image": [dummy_image] * num_imgs}
processed_inputs = processor.apply(prompt, mm_data, mm_processor_kwargs) processed_inputs = processor.apply(prompt, mm_data, hf_processor_mm_kwargs)
# Ensure the placeholders format are correct # Ensure the placeholders format are correct
hf_processor = processor.info.get_hf_processor(**hf_processor_mm_kwargs)
hf_processed_inputs = hf_processor(text=prompt, images=mm_data["image"]) hf_processed_inputs = hf_processor(text=prompt, images=mm_data["image"])
assert processed_inputs["prompt_token_ids"] == hf_processed_inputs[ assert processed_inputs["prompt_token_ids"] == hf_processed_inputs[
"input_ids"][0] "input_ids"][0]
......
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
"""Tests for InternVL's multimodal preprocessing kwargs.""" """Tests for InternVL's multimodal preprocessing kwargs."""
from typing import Optional from typing import Mapping, Optional
import os import os
import pytest import pytest
from PIL import Image
from transformers import PretrainedConfig
from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal import MULTIMODAL_REGISTRY
from vllm.multimodal.utils import cached_get_tokenizer from vllm.multimodal.image import rescale_image_size
from vllm.multimodal.processing import BaseMultiModalProcessor
from vllm.transformers_utils.tokenizer import cached_tokenizer_from_config
from ....conftest import _ImageAssets from ....conftest import _ImageAssets
from ...utils import build_model_context from ...utils import build_model_context
from ....utils import models_path_prefix from ....utils import models_path_prefix
def _get_expected_num_patches(
    config: PretrainedConfig,
    image: Image.Image,
    num_imgs: int,
    min_num: int,
    max_num: int,
):
    """Return the number of patches expected for ``image`` under InternVL.

    The block count comes from ``calculate_internvl_targets`` using the
    target ratios for ``min_num``/``max_num``. One extra thumbnail patch is
    added when ``config.use_thumbnail`` is set and there is more than one
    block. ``num_imgs`` is accepted but not used in this computation.
    """
    from vllm.model_executor.models.internvl import (
        calculate_internvl_targets, get_internvl_target_ratios)

    img_width, img_height = image.size

    # Thumbnail is accounted for below, so it is not requested here.
    num_blocks, _, _ = calculate_internvl_targets(
        orig_width=img_width,
        orig_height=img_height,
        target_ratios=get_internvl_target_ratios(min_num, max_num),
        image_size=config.vision_config.image_size,
        use_thumbnail=False,
    )

    if config.use_thumbnail and num_blocks > 1:
        return num_blocks + 1
    return num_blocks
def _run_check(
    processor: BaseMultiModalProcessor,
    images: list[Image.Image],
    min_num: int,
    max_num: int,
    mm_processor_kwargs: Mapping[str, object],
):
    """Apply ``processor`` to ``images`` and verify the patch counts.

    The expected total is the sum of ``_get_expected_num_patches`` over all
    images. The prompt token ids must contain 256 ``<IMG_CONTEXT>`` tokens
    per expected patch, and ``pixel_values_flat`` must have one leading
    entry per expected patch.
    """
    tokenizer = processor.info.get_tokenizer()
    hf_config = processor.info.get_hf_config()

    image_count = len(images)
    per_image_patches = [
        _get_expected_num_patches(hf_config, img, image_count, min_num,
                                  max_num) for img in images
    ]
    expected_total = sum(per_image_patches)

    # The prompt carries one "<image>" placeholder per input image.
    prompt = "<image>" * image_count
    result = processor.apply(prompt, {"image": images}, mm_processor_kwargs)

    # Count how many image context tokens the processor emitted.
    context_token_id = tokenizer.convert_tokens_to_ids("<IMG_CONTEXT>")
    placeholder_count = result["prompt_token_ids"].count(context_token_id)
    pixel_shape = result["mm_kwargs"]["pixel_values_flat"].shape

    assert placeholder_count == 256 * expected_total
    assert pixel_shape[0] == expected_total
@pytest.mark.parametrize("model_id", [os.path.join(models_path_prefix, "OpenGVLab/InternVL2-2B")]) @pytest.mark.parametrize("model_id", [os.path.join(models_path_prefix, "OpenGVLab/InternVL2-2B")])
@pytest.mark.parametrize("max_dynamic_patch", [1, 4]) @pytest.mark.parametrize(
@pytest.mark.parametrize("dynamic_image_size", [True, False, None]) "size_factors",
@pytest.mark.parametrize("num_imgs", [1, 2]) [
# Single-scale
[1.0],
# Single-scale, batched
[1.0, 1.0, 1.0],
# Multi-scale
[0.25, 0.5, 1.0],
[4.0, 2.0, 1.0],
],
)
@pytest.mark.parametrize(
("min_dynamic_patch", "max_dynamic_patch"),
[(1, 1), (1, 2), (1, 4), (1, 8), (2, 4), (4, 8)],
)
@pytest.mark.parametrize("dynamic_image_size", [True, False])
@pytest.mark.parametrize("kwargs_on_init", [True, False])
def test_processor_override( def test_processor_override(
model_id: str, model_id: str,
image_assets: _ImageAssets, image_assets: _ImageAssets,
size_factors: list[int],
min_dynamic_patch: int,
max_dynamic_patch: int, max_dynamic_patch: int,
dynamic_image_size: Optional[bool], dynamic_image_size: Optional[bool],
num_imgs: int, kwargs_on_init: bool,
): ):
mm_processor_kwargs = {
"min_dynamic_patch": min_dynamic_patch,
"max_dynamic_patch": max_dynamic_patch,
"dynamic_image_size": dynamic_image_size,
}
ctx = build_model_context( ctx = build_model_context(
model_name=model_id, model_name=model_id,
tokenizer_name=model_id, tokenizer_name=model_id,
trust_remote_code=True, trust_remote_code=True,
mm_processor_kwargs=None, mm_processor_kwargs=mm_processor_kwargs if kwargs_on_init else None,
limit_mm_per_prompt={"image": num_imgs}, limit_mm_per_prompt={"image": len(size_factors)},
)
tokenizer = cached_get_tokenizer(
ctx.model_config.tokenizer,
trust_remote_code=ctx.model_config.trust_remote_code,
) )
tokenizer = cached_tokenizer_from_config(ctx.model_config)
processor = MULTIMODAL_REGISTRY.create_processor( processor = MULTIMODAL_REGISTRY.create_processor(
ctx.model_config, ctx.model_config,
tokenizer=tokenizer, tokenizer=tokenizer,
) )
hf_processor_mm_kwargs = {} if kwargs_on_init else mm_processor_kwargs
mm_processor_kwargs = { min_num = min_dynamic_patch if dynamic_image_size else 1
"max_dynamic_patch": max_dynamic_patch, max_num = max_dynamic_patch if dynamic_image_size else 1
}
if dynamic_image_size is not None:
mm_processor_kwargs["dynamic_image_size"] = dynamic_image_size
# Build the image str / prompt based on the number of images we pass _run_check(
prompt = "<image>" * num_imgs processor,
image = image_assets[0].pil_image.resize((448 * 2, 448 * 2)) [
mm_data = {"image": [image] * num_imgs} rescale_image_size(image_assets[0].pil_image, f)
for f in size_factors
expected_num_patches = max_dynamic_patch + 1 if max_dynamic_patch > 1 else 1 ],
if dynamic_image_size is False: min_num,
expected_num_patches = 1 max_num,
hf_processor_mm_kwargs,
processed_inputs = processor.apply(prompt, mm_data, mm_processor_kwargs) )
# Ensure we have the right number of placeholders per num_crops size
image_token_id = tokenizer.convert_tokens_to_ids("<IMG_CONTEXT>")
img_tok_count = processed_inputs["prompt_token_ids"].count(image_token_id)
pixel_shape = processed_inputs["mm_kwargs"]["pixel_values_flat"].shape
assert img_tok_count == 256 * expected_num_patches * num_imgs
assert pixel_shape[0] == expected_num_patches * num_imgs
...@@ -10,7 +10,7 @@ from pqdm.threads import pqdm ...@@ -10,7 +10,7 @@ from pqdm.threads import pqdm
from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal import MULTIMODAL_REGISTRY
from vllm.multimodal.parse import ImageSize from vllm.multimodal.parse import ImageSize
from vllm.multimodal.processing import BaseMultiModalProcessor from vllm.multimodal.processing import BaseMultiModalProcessor
from vllm.multimodal.utils import cached_get_tokenizer from vllm.transformers_utils.tokenizer import cached_tokenizer_from_config
from ...utils import build_model_context from ...utils import build_model_context
...@@ -43,10 +43,7 @@ def test_processor_max_tokens(model_id): ...@@ -43,10 +43,7 @@ def test_processor_max_tokens(model_id):
) )
processor = MULTIMODAL_REGISTRY.create_processor( processor = MULTIMODAL_REGISTRY.create_processor(
ctx.model_config, ctx.model_config,
tokenizer=cached_get_tokenizer( tokenizer=cached_tokenizer_from_config(ctx.model_config),
ctx.model_config.tokenizer,
trust_remote_code=ctx.model_config.trust_remote_code,
),
) )
info = processor.info info = processor.info
...@@ -146,10 +143,7 @@ def test_processor_prompt_replacements_regression(model_id, num_imgs): ...@@ -146,10 +143,7 @@ def test_processor_prompt_replacements_regression(model_id, num_imgs):
) )
processor = MULTIMODAL_REGISTRY.create_processor( processor = MULTIMODAL_REGISTRY.create_processor(
ctx.model_config, ctx.model_config,
tokenizer=cached_get_tokenizer( tokenizer=cached_tokenizer_from_config(ctx.model_config),
ctx.model_config.tokenizer,
trust_remote_code=ctx.model_config.trust_remote_code,
),
) )
image_ratios = [(171, 152), (184, 161), (198, 176), (333, 296), (369, 328), image_ratios = [(171, 152), (184, 161), (198, 176), (333, 296), (369, 328),
...@@ -179,10 +173,7 @@ def test_processor_prompt_replacements_all(model_id, num_imgs): ...@@ -179,10 +173,7 @@ def test_processor_prompt_replacements_all(model_id, num_imgs):
) )
processor = MULTIMODAL_REGISTRY.create_processor( processor = MULTIMODAL_REGISTRY.create_processor(
ctx.model_config, ctx.model_config,
tokenizer=cached_get_tokenizer( tokenizer=cached_tokenizer_from_config(ctx.model_config),
ctx.model_config.tokenizer,
trust_remote_code=ctx.model_config.trust_remote_code,
),
) )
seen_aspect_ratios = set[float]() seen_aspect_ratios = set[float]()
......
...@@ -10,7 +10,7 @@ from pqdm.threads import pqdm ...@@ -10,7 +10,7 @@ from pqdm.threads import pqdm
from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal import MULTIMODAL_REGISTRY
from vllm.multimodal.parse import ImageSize from vllm.multimodal.parse import ImageSize
from vllm.multimodal.processing import BaseMultiModalProcessor from vllm.multimodal.processing import BaseMultiModalProcessor
from vllm.multimodal.utils import cached_get_tokenizer from vllm.transformers_utils.tokenizer import cached_tokenizer_from_config
from ...utils import build_model_context from ...utils import build_model_context
...@@ -44,10 +44,7 @@ def test_processor_max_tokens(model_id): ...@@ -44,10 +44,7 @@ def test_processor_max_tokens(model_id):
) )
processor = MULTIMODAL_REGISTRY.create_processor( processor = MULTIMODAL_REGISTRY.create_processor(
ctx.model_config, ctx.model_config,
tokenizer=cached_get_tokenizer( tokenizer=cached_tokenizer_from_config(ctx.model_config),
ctx.model_config.tokenizer,
trust_remote_code=ctx.model_config.trust_remote_code,
),
) )
info = processor.info info = processor.info
...@@ -146,10 +143,7 @@ def test_processor_prompt_replacements_regression(model_id, num_imgs): ...@@ -146,10 +143,7 @@ def test_processor_prompt_replacements_regression(model_id, num_imgs):
) )
processor = MULTIMODAL_REGISTRY.create_processor( processor = MULTIMODAL_REGISTRY.create_processor(
ctx.model_config, ctx.model_config,
tokenizer=cached_get_tokenizer( tokenizer=cached_tokenizer_from_config(ctx.model_config),
ctx.model_config.tokenizer,
trust_remote_code=ctx.model_config.trust_remote_code,
),
) )
image_ratios = [(171, 152), (184, 161), (198, 176), (333, 296), (369, 328), image_ratios = [(171, 152), (184, 161), (198, 176), (333, 296), (369, 328),
...@@ -180,10 +174,7 @@ def test_processor_prompt_replacements_all(model_id, num_imgs): ...@@ -180,10 +174,7 @@ def test_processor_prompt_replacements_all(model_id, num_imgs):
) )
processor = MULTIMODAL_REGISTRY.create_processor( processor = MULTIMODAL_REGISTRY.create_processor(
ctx.model_config, ctx.model_config,
tokenizer=cached_get_tokenizer( tokenizer=cached_tokenizer_from_config(ctx.model_config),
ctx.model_config.tokenizer,
trust_remote_code=ctx.model_config.trust_remote_code,
),
) )
seen_aspect_ratios = set[float]() seen_aspect_ratios = set[float]()
......
...@@ -3,7 +3,7 @@ ...@@ -3,7 +3,7 @@
import pytest import pytest
from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal import MULTIMODAL_REGISTRY
from vllm.multimodal.utils import cached_get_tokenizer from vllm.transformers_utils.tokenizer import cached_tokenizer_from_config
from ....conftest import _ImageAssets from ....conftest import _ImageAssets
from ...utils import build_model_context from ...utils import build_model_context
...@@ -21,12 +21,14 @@ from ...utils import build_model_context ...@@ -21,12 +21,14 @@ from ...utils import build_model_context
]) ])
# yapf: enable # yapf: enable
@pytest.mark.parametrize("num_imgs", [1, 2]) @pytest.mark.parametrize("num_imgs", [1, 2])
@pytest.mark.parametrize("kwargs_on_init", [True, False])
def test_processor_override( def test_processor_override(
image_assets: _ImageAssets, image_assets: _ImageAssets,
model_id: str, model_id: str,
mm_processor_kwargs: dict[str, int], mm_processor_kwargs: dict[str, int],
expected_toks_per_img: int, expected_toks_per_img: int,
num_imgs: int, num_imgs: int,
kwargs_on_init: bool,
): ):
"""Ensure input_processor_for_phi3v handles num_crops properly.""" """Ensure input_processor_for_phi3v handles num_crops properly."""
# Avoid initializing CUDA early # Avoid initializing CUDA early
...@@ -36,23 +38,22 @@ def test_processor_override( ...@@ -36,23 +38,22 @@ def test_processor_override(
model_name=model_id, model_name=model_id,
tokenizer_name=model_id, tokenizer_name=model_id,
trust_remote_code=True, trust_remote_code=True,
mm_processor_kwargs=mm_processor_kwargs if kwargs_on_init else None,
limit_mm_per_prompt={"image": num_imgs}, limit_mm_per_prompt={"image": num_imgs},
) )
tokenizer = cached_get_tokenizer( tokenizer = cached_tokenizer_from_config(ctx.model_config)
ctx.model_config.tokenizer,
trust_remote_code=ctx.model_config.trust_remote_code,
)
processor = MULTIMODAL_REGISTRY.create_processor( processor = MULTIMODAL_REGISTRY.create_processor(
ctx.model_config, ctx.model_config,
tokenizer=tokenizer, tokenizer=tokenizer,
) )
hf_processor_mm_kwargs = {} if kwargs_on_init else mm_processor_kwargs
# Build the image str / prompt based on the number of images we pass # Build the image str / prompt based on the number of images we pass
img_str = "".join([f"<|image_{idx}|>\n" for idx in range(1, num_imgs + 1)]) img_str = "".join([f"<|image_{idx}|>\n" for idx in range(1, num_imgs + 1)])
prompt = f"<|user|>\n{img_str}<|end|>\n<|assistant|>\n" prompt = f"<|user|>\n{img_str}<|end|>\n<|assistant|>\n"
mm_data = {"image": [image_assets[0].pil_image] * num_imgs} mm_data = {"image": [image_assets[0].pil_image] * num_imgs}
processed_inputs = processor.apply(prompt, mm_data, mm_processor_kwargs) processed_inputs = processor.apply(prompt, mm_data, hf_processor_mm_kwargs)
# Ensure we have the right number of placeholders per num_crops size # Ensure we have the right number of placeholders per num_crops size
img_tok_count = processed_inputs["prompt_token_ids"].count(_IMAGE_TOKEN_ID) img_tok_count = processed_inputs["prompt_token_ids"].count(_IMAGE_TOKEN_ID)
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment