Merge tag 'v0.7.3' into v0.7.3-dev

ec5e299c · zhuwenwen · 47bd229c · ed6e9075 · ec5e299c · ec5e299c
Commit ec5e299c authored Feb 21, 2025 by zhuwenwen
20 changed files
--- a/tests/mistral_tool_use/conftest.py
+++ b/tests/mistral_tool_use/conftest.py
+# SPDX-License-Identifier: Apache-2.0
+import pytest
+import pytest_asyncio
+from huggingface_hub import snapshot_download
+from tests.utils import RemoteOpenAIServer
+from vllm.platforms import current_platform
+from .utils import ARGS, CONFIGS, ServerConfig
+# for each server config, download the model and return the config
+@pytest.fixture(scope="session", params=CONFIGS.keys())
+def server_config(request):
+    """Yield the CONFIGS entry for the current fixture param.
+
+    The test is skipped on ROCm when the entry sets ``supports_rocm`` to a
+    falsy value (a missing key counts as supported). Otherwise the model
+    snapshot is fetched before the config is handed to the test.
+    """
+    selected = CONFIGS[request.param]
+    if current_platform.is_rocm():
+        if not selected.get("supports_rocm", True):
+            skip_reason = "The {} model can't be tested on the ROCm platform"
+            pytest.skip(skip_reason.format(selected["model"]))
+    # fetch the model repository snapshot via huggingface_hub
+    snapshot_download(selected["model"])
+    yield selected
+# run this for each server config
+@pytest.fixture(scope="session")
+def server(request, server_config: ServerConfig):
+    """Yield a RemoteOpenAIServer started for the given server config.
+
+    The server receives the shared ARGS followed by the config's own
+    arguments; its context manager is exited when the fixture is finalized.
+    """
+    model_name = server_config["model"]
+    launch_args = ARGS + server_config["arguments"]
+    with RemoteOpenAIServer(model_name, launch_args,
+                            max_wait_seconds=480) as remote_server:
+        yield remote_server
+@pytest_asyncio.fixture
+async def client(server: RemoteOpenAIServer):
+    """Yield an async client obtained from the ``server`` fixture."""
+    async with server.get_async_client() as openai_client:
+        yield openai_client
--- a/tests/mistral_tool_use/test_mistral_tool_calls.py
+++ b/tests/mistral_tool_use/test_mistral_tool_calls.py
+# SPDX-License-Identifier: Apache-2.0
+import openai
+import pytest
+from tests.tool_use.utils import MESSAGES_ASKING_FOR_TOOLS, WEATHER_TOOL
+# test: a tool_choice with mistral-tokenizer results in an ID of length 9
+@pytest.mark.asyncio
+async def test_tool_call_with_tool_choice(client: openai.AsyncOpenAI):
+    """Check that a forced tool call made with the mistral tokenizer mode
+    returns exactly one tool call whose ID is 9 characters long.
+
+    The request names WEATHER_TOOL as ``tool_choice`` and uses the first model
+    reported by the server.
+    """
+    models = await client.models.list()
+    model_name: str = models.data[0].id
+    chat_completion = await client.chat.completions.create(
+        messages=MESSAGES_ASKING_FOR_TOOLS,
+        temperature=0,
+        max_completion_tokens=100,
+        model=model_name,
+        tools=[WEATHER_TOOL],
+        tool_choice=WEATHER_TOOL,
+        logprobs=False)
+    choice = chat_completion.choices[0]
+    assert choice.finish_reason != "tool_calls"  # "stop" or "length"
+    assert choice.message.role == "assistant"
+    # The ID check below indexes into tool_calls, so a missing tool call must
+    # fail here as an assertion instead of as a TypeError on None.
+    tool_calls = choice.message.tool_calls
+    assert tool_calls is not None and len(tool_calls) == 1
+    assert len(tool_calls[0].id) == 9  # length of 9 for mistral
--- a/tests/mistral_tool_use/utils.py
+++ b/tests/mistral_tool_use/utils.py
+# SPDX-License-Identifier: Apache-2.0
+from typing import Dict, List, Optional
+from typing_extensions import TypedDict
+class ServerConfig(TypedDict, total=False):
+    """Settings describing one model server used by these tests.
+
+    Declared with ``total=False``, so every key may be omitted.
+    """
+    # name of the model to serve
+    model: str
+    # extra command-line arguments for this model's server
+    arguments: List[str]
+    # presumably a system prompt for test conversations - TODO confirm usage
+    system_prompt: Optional[str]
+    # presumably whether parallel tool calls are supported - TODO confirm
+    supports_parallel: Optional[bool]
+    # whether the model can be tested on ROCm; the server_config fixture in
+    # conftest.py treats a missing key as True
+    supports_rocm: Optional[bool]
+# Arguments shared by every server; conftest.py places them ahead of the
+# per-model "arguments" list.
+ARGS: List[str] = ["--max-model-len", "1024"]
+# Server configurations keyed by a short name; conftest.py parametrizes the
+# server_config fixture over these keys.
+CONFIGS: Dict[str, ServerConfig] = {
+    "mistral": {
+        "model":
+        "mistralai/Mistral-7B-Instruct-v0.3",
+        "arguments": [
+            "--tokenizer-mode", "mistral",
+            # NOTE(review): the escaped quotes are part of the argument string
+            # itself; if the server is launched without a shell they would
+            # reach the CLI literally - confirm this is intended.
+            "--ignore-patterns=\"consolidated.safetensors\""
+        ],
+        "system_prompt":
+        "You are a helpful assistant with access to tools. If a tool"
+        " that you have would be helpful to answer a user query, "
+        "call the tool. Otherwise, answer the user's query directly "
+        "without calling a tool. DO NOT CALL A TOOL THAT IS IRRELEVANT "
+        "to the user's question - just respond to it normally."
+    },
+}
--- a/tests/models/decoder_only/audio_language/test_ultravox.py
+++ b/tests/models/decoder_only/audio_language/test_ultravox.py
@@ -18,7 +18,7 @@ from ....conftest import HfRunner, VllmRunner
 from ....utils import RemoteOpenAIServer, models_path_prefix
 from ...utils import check_logprobs_close
-MODEL_NAME = os.path.join(models_path_prefix, "fixie-ai/ultravox-v0_3")
+MODEL_NAME = os.path.join(models_path_prefix, "fixie-ai/ultravox-v0_5-llama-3_2-1b")
 AudioTuple = Tuple[np.ndarray, int]

--- a/tests/models/decoder_only/language/test_fp8.py
+++ b/tests/models/decoder_only/language/test_fp8.py
@@ -30,9 +30,9 @@ os.environ["TOKENIZERS_PARALLELISM"] = "true"
        # Test FP16 checkpoint w. fp8_e5m2 kv-cache.
        ("fp8_e5m2", os.path.join(models_path_prefix, "meta-llama/Llama-3.2-1B-Instruct"),
         os.path.join(models_path_prefix, "meta-llama/Llama-3.2-1B-Instruct")),
-        # Test FP16 checkpoint w. fp8_e4m3 kv-cache scaling factors in json.
+        # Test BF16 checkpoint w. fp8_e4m3 kv-cache scaling factors in json.
-        ("fp8_e4m3", os.path.join(models_path_prefix, "meta-llama/Llama-2-7b-chat-hf"),
+        ("fp8_e4m3", os.path.join(models_path_prefix, "meta-llama/Llama-3.2-1B-Instruct"),
-         os.path.join(models_path_prefix, "meta-llama/Llama-2-7b-chat-hf"))
+         os.path.join(models_path_prefix, "meta-llama/Llama-3.2-1B-Instruct"))
    ])
 # Due to low-precision numerical divergence, we only test logprob of 4 tokens
 @pytest.mark.parametrize("max_tokens", [4])

--- a/tests/models/decoder_only/language/test_jamba.py
+++ b/tests/models/decoder_only/language/test_jamba.py
@@ -10,7 +10,8 @@ from vllm.sampling_params import SamplingParams
 from ...utils import check_outputs_equal
 from ....utils import models_path_prefix
-MODELS = [os.path.join(models_path_prefix, "ai21labs/Jamba-tiny-random")]
+# This test is for the hybrid models
+MODELS = [os.path.join(models_path_prefix, "ai21labs/Jamba-tiny-dev"), os.path.join(models_path_prefix, "ibm-ai-platform/Bamba-9B")]
 @pytest.mark.parametrize("model", MODELS)
@@ -25,6 +26,10 @@ def test_models(
    max_tokens: int,
 ) -> None:
+    # numeric error produces different generation
+    if 'Bamba' in model:
+        example_prompts.pop(3)
    with hf_runner(
            model,
            dtype=dtype,
@@ -110,15 +115,21 @@ def test_mamba_prefill_chunking_with_parallel_sampling(
 @pytest.mark.parametrize("model", MODELS)
 @pytest.mark.parametrize("dtype", ["bfloat16"])
-@pytest.mark.parametrize("max_tokens", [10])
+@pytest.mark.parametrize("max_tokens", [7])
 def test_mamba_prefill_chunking(hf_runner, vllm_runner, example_prompts,
                                model: str, dtype: str,
                                max_tokens: int) -> None:
    # numeric error during prefill chunking produces different generation
    # compared to w/o prefill chunking for those examples, removed them for now
-    example_prompts.pop(7)
+    if 'Jamba' in model:
-    example_prompts.pop(2)
+        example_prompts.pop(7)
-    example_prompts.pop(1)
+        example_prompts.pop(2)
+        example_prompts.pop(1)
+    elif 'Bamba' in model:
+        example_prompts.pop(6)
+        example_prompts.pop(3)
+        example_prompts.pop(2)
+        dtype = "half"  # use a different dtype for Bamba
    with hf_runner(
            model,
@@ -147,7 +158,7 @@ def test_mamba_prefill_chunking(hf_runner, vllm_runner, example_prompts,
 @pytest.mark.parametrize("model", MODELS)
-@pytest.mark.parametrize("dtype", ["bfloat16"])
+@pytest.mark.parametrize("dtype", ["float"])
 @pytest.mark.parametrize("max_tokens", [15])
 def test_parallel_sampling(
    vllm_runner,
@@ -251,17 +262,17 @@ def test_fail_upon_inc_requests_and_finished_requests_lt_available_blocks(
    dtype: str,
    example_prompts,
 ) -> None:
-    # This test is for verifying that the Jamba inner state management doesn't
+    # This test is for verifying that the hybrid inner state management doesn't
    # collapse in case where the number of incoming requests and
    # finished_requests_ids is larger than the maximum mamba block capacity.
-    # This could generally happen due to the fact that Jamba does support
+    # This could generally happen due to the fact that hybrid does support
    # statelessness mechanism where it can cleanup new incoming requests in
    # a single step.
    try:
        with vllm_runner(model, dtype=dtype, max_num_seqs=10) as vllm_model:
            vllm_model.generate_greedy([example_prompts[0]] * 100, 10)
    except ValueError:
-        pytest.fail("Jamba inner state wasn't cleaned up properly between"
+        pytest.fail("Hybrid inner state wasn't cleaned up properly between"
                    "steps finished requests registered unnecessarily ")
@@ -273,14 +284,14 @@ def test_state_cleanup(
    dtype: str,
    example_prompts,
 ) -> None:
-    # This test is for verifying that the Jamba state is cleaned up between
+    # This test is for verifying that the Hybrid state is cleaned up between
    # steps. If it's not cleaned, an error would be expected.
    try:
        with vllm_runner(model, dtype=dtype) as vllm_model:
            for _ in range(10):
                vllm_model.generate_greedy([example_prompts[0]] * 100, 1)
    except ValueError:
-        pytest.fail("Jamba inner state wasn't cleaned up between states, "
+        pytest.fail("Hybrid inner state wasn't cleaned up between states, "
                    "could be related to finished_requests_ids")
@@ -326,7 +337,7 @@ def test_multistep_correctness(vllm_runner, model: str, dtype: str,
 @pytest.mark.parametrize("model", MODELS)
 @pytest.mark.parametrize("dtype", ["float"])
 @pytest.mark.parametrize("max_tokens", [64])
-def test_jamba_distributed_produces_identical_generation(
+def test_hybrid_distributed_produces_identical_generation(
        vllm_runner, model: str, dtype: str, max_tokens: int,
        example_prompts) -> None:

--- a/tests/models/decoder_only/language/test_mamba.py
+++ b/tests/models/decoder_only/language/test_mamba.py
@@ -5,6 +5,7 @@ Run `pytest tests/models/test_mamba.py`.
 """
 import os
 import pytest
+import torch
 from transformers import AutoModelForCausalLM, AutoTokenizer
 from vllm.engine.arg_utils import EngineArgs
@@ -13,7 +14,14 @@ from vllm.sampling_params import SamplingParams
 from ...utils import check_outputs_equal
 from ....utils import models_path_prefix
-MODELS = [os.path.join(models_path_prefix, "state-spaces/mamba-130m-hf"), os.path.join(models_path_prefix, "tiiuae/falcon-mamba-tiny-dev")]
+MODELS = [
+    os.path.join(models_path_prefix, "state-spaces/mamba-130m-hf"),
+    os.path.join(models_path_prefix, "tiiuae/falcon-mamba-tiny-dev"),
+    # TODO: Compare to a Mamba2 model. The HF transformers implementation of
+    # Mamba2 is buggy for Codestral as it doesn't handle n_groups.
+    # See https://github.com/huggingface/transformers/pull/35943
+    # "mistralai/Mamba-Codestral-7B-v0.1",
+]
 # Use lower-level interfaces to create this greedy generator, as mamba will
@@ -23,6 +31,10 @@ def generate_greedy(model_name, example_prompts, max_tokens):
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForCausalLM.from_pretrained(model_name)
+    # Set the device (GPU if available, else CPU)
+    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+    model.to(device)
    # Generate texts from the prompts
    outputs = []
    for prompt in example_prompts:
@@ -31,7 +43,9 @@ def generate_greedy(model_name, example_prompts, max_tokens):
        input_ids = inputs["input_ids"].to(model.device)
        # Generate text using the model's generate method directly
-        generated_ids = model.generate(input_ids, max_new_tokens=max_tokens)
+        generated_ids = model.generate(input_ids,
+                                       max_new_tokens=max_tokens,
+                                       do_sample=False)
        generated_text = tokenizer.decode(generated_ids[0],
                                          skip_special_tokens=True)
@@ -52,7 +66,8 @@ def test_models(
 ) -> None:
    hf_outputs = generate_greedy(model, example_prompts, max_tokens)
-    with vllm_runner(model, dtype=dtype) as vllm_model:
+    # Set max_num_seqs to keep Codestral from going OOM at fp32
+    with vllm_runner(model, dtype=dtype, max_num_seqs=16) as vllm_model:
        vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
        # This test is for verifying whether the model's extra_repr
@@ -83,7 +98,7 @@ def test_batching(
 ) -> None:
    # To pass the small model tests, we need full precision.
    for_loop_outputs = []
-    with vllm_runner(model, dtype=dtype) as vllm_model:
+    with vllm_runner(model, dtype=dtype, max_num_seqs=16) as vllm_model:
        for prompt in example_prompts:
            for_loop_outputs.append(
                vllm_model.generate_greedy([prompt], max_tokens)[0])
@@ -167,20 +182,22 @@ def test_parallel_sampling(
    max_tokens: int,
 ) -> None:
-    with vllm_runner(model, dtype=dtype) as vllm_model:
+    # Numerical differences produce slightly different output for these
+    if 'state-spaces' in model:
+        example_prompts.pop(0)
+        example_prompts.pop(0)
+        example_prompts.pop(0)
+    with vllm_runner(model, dtype=dtype, max_num_seqs=16) as vllm_model:
        for_loop_outputs = []
        for _ in range(10):
            for_loop_outputs.append(
-                # using example_prompts index 1 instead of 0 since with 0 the
+                vllm_model.generate_greedy(example_prompts, max_tokens)[0])
-                # logprobs get really close and the test doesn't pass
-                vllm_model.generate_greedy([example_prompts[1]], max_tokens)
-                [0])
        sampling_params = SamplingParams(n=10,
                                         temperature=0.001,
                                         seed=0,
                                         max_tokens=max_tokens)
-        n_lt_1_outputs = vllm_model.generate([example_prompts[1]],
+        n_lt_1_outputs = vllm_model.generate(example_prompts, sampling_params)
-                                             sampling_params)
    token_ids, texts = n_lt_1_outputs[0]
    n_lt_1_outputs = [(token_id, text)
                      for token_id, text in zip(token_ids, texts)]
@@ -234,7 +251,7 @@ def test_models_preemption_recompute(
    # Tests that outputs are identical with and w/o preemptions (recompute)
    assert dtype == "float"
-    with vllm_runner(model, dtype=dtype) as vllm_model:
+    with vllm_runner(model, dtype=dtype, max_num_seqs=16) as vllm_model:
        vllm_model.model.llm_engine.scheduler[
            0].ENABLE_ARTIFICIAL_PREEMPT = True
        preempt_vllm_outputs = vllm_model.generate_greedy(
@@ -285,7 +302,7 @@ def test_state_cleanup(
    # This test is for verifying that the Mamba state is cleaned up between
    # steps. If it's not cleaned, an error would be expected.
    try:
-        with vllm_runner(model, dtype=dtype) as vllm_model:
+        with vllm_runner(model, dtype=dtype, max_num_seqs=16) as vllm_model:
            for _ in range(10):
                vllm_model.generate_greedy([example_prompts[0]] * 100, 1)
    except ValueError:

--- a/tests/models/decoder_only/language/test_models.py
+++ b/tests/models/decoder_only/language/test_models.py
@@ -28,6 +28,9 @@ from ....utils import models_path_prefix
            os.path.join(models_path_prefix, "google/gemma-1.1-2b-it"),  # gemma
            marks=[pytest.mark.core_model, pytest.mark.cpu_model],
        ),
+        pytest.param(
+            os.path.join(models_path_prefix, "THUDM/chatglm3-6b"),  # chatglm (text-only)
+        ),
        pytest.param(
            os.path.join(models_path_prefix, "meta-llama/Llama-3.2-1B-Instruct"),  # llama
            marks=[pytest.mark.core_model, pytest.mark.cpu_model],
@@ -45,6 +48,9 @@ from ....utils import models_path_prefix
            os.path.join(models_path_prefix, "microsoft/phi-2"),  # phi
            marks=[pytest.mark.core_model],
        ),
+        pytest.param(
+            os.path.join(models_path_prefix, "Qwen/Qwen-7B"),  # qwen (text-only)
+        ),
        pytest.param(
            os.path.join(models_path_prefix, "Qwen/Qwen2.5-0.5B-Instruct"),  # qwen2
            marks=[pytest.mark.core_model],
@@ -70,6 +76,10 @@ def test_models(
 ) -> None:
    with hf_runner(model, dtype=dtype) as hf_model:
+        if model.startswith("THUDM/chatglm3"):
+            hf_model.model.get_output_embeddings = lambda: \
+                hf_model.model.transformer.output_layer
        hf_outputs = hf_model.generate_greedy_logprobs_limit(
            example_prompts, max_tokens, num_logprobs)

--- a/tests/models/decoder_only/vision_language/test_models.py
+++ b/tests/models/decoder_only/vision_language/test_models.py
@@ -157,10 +157,7 @@ VLM_TEST_SETTINGS = {
        auto_cls=AutoModelForVision2Seq,
        vllm_output_post_proc=model_utils.qwen2_vllm_to_hf_output,
        image_size_factors=[(), (0.25,), (0.25, 0.25, 0.25), (0.25, 0.2, 0.15)],
-        marks=[pytest.mark.skipif(
+        marks=[pytest.mark.core_model, pytest.mark.cpu_model],
-                TRANSFORMERS_VERSION < "4.49.0",
-                reason="HF model requires transformers>=4.49.0",
-            ), pytest.mark.core_model, pytest.mark.cpu_model],
    ),
    #### Extended model tests
    "aria": VLMTestInfo(
@@ -217,7 +214,6 @@ VLM_TEST_SETTINGS = {
            "cherry_blossom": "<image>\nPlease infer the season with reason in details.",   # noqa: E501
        }),
        multi_image_prompt="image_1:<image>\nimage_2:<image>\nWhich image can we see the car and the tower?",    # noqa: E501
-        vllm_runner_kwargs={"hf_overrides": {"architectures": ["DeepseekVLV2ForCausalLM"]}},  # noqa: E501
        patch_hf_runner=model_utils.deepseekvl2_patch_hf_runner,
        postprocess_inputs=model_utils.cast_dtype_post_processor("images"),
        hf_output_post_proc=model_utils.deepseekvl2_trunc_hf_output,
@@ -353,7 +349,6 @@ VLM_TEST_SETTINGS = {
        postprocess_inputs=model_utils.cast_dtype_post_processor(
            "pixel_values"
        ),
-        vllm_runner_kwargs={"hf_overrides": {"architectures": ["MantisForConditionalGeneration"]}},  # noqa: E501
        get_stop_token_ids=lambda tok: [128009],
        auto_cls=AutoModelForVision2Seq,
        vllm_output_post_proc=model_utils.mantis_vllm_to_hf_output,
@@ -406,11 +401,10 @@ VLM_TEST_SETTINGS = {
    "molmo": VLMTestInfo(
        models=["allenai/Molmo-7B-D-0924"],
        test_type=(VLMTestType.IMAGE),
-        prompt_formatter=lambda img_prompt:"User: " + img_prompt + " Assistant:", # noqa: E501
+        prompt_formatter=identity,
        max_model_len=4096,
        max_num_seqs=2,
-        image_size_factors=[(),(1.0, 1.0, 1.0)],
+        patch_hf_runner=model_utils.molmo_patch_hf_runner,
-        patch_hf_runner=model_utils.mlomo_patch_hf_runner,
        postprocess_inputs=model_utils.molmo_post_processor,
    ),
    # Tests for phi3v currently live in another file because of a bug in
@@ -440,7 +434,7 @@ VLM_TEST_SETTINGS = {
        auto_cls=AutoModelForVision2Seq,
        marks=[large_gpu_mark(min_gb=48)],
    ),
-    "qwen": VLMTestInfo(
+    "qwen_vl": VLMTestInfo(
        models=[os.path.join(models_path_prefix, "Qwen/Qwen-VL")],
        test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
        prompt_formatter=identity,

--- a/tests/models/decoder_only/vision_language/vlm_utils/core.py
+++ b/tests/models/decoder_only/vision_language/vlm_utils/core.py
@@ -4,12 +4,14 @@ from typing import Any, Callable, Dict, List, Optional, Tuple, Type, Union
 import torch
 from PIL.Image import Image
-from transformers import AutoTokenizer, BatchEncoding, PreTrainedTokenizerBase
+from transformers import BatchEncoding
 from transformers.models.auto.auto_factory import _BaseAutoModelClass
 from vllm.config import TaskOption
+from vllm.transformers_utils.tokenizer import AnyTokenizer
 from .....conftest import HfRunner, VllmRunner
+from ....registry import HF_EXAMPLE_MODELS
 from .types import RunnerOutput
@@ -31,10 +33,8 @@ def run_test(
    use_tokenizer_eos: bool,
    postprocess_inputs: Callable[[BatchEncoding], BatchEncoding],
    comparator: Callable[..., None],
-    get_stop_token_ids: Optional[Callable[[PreTrainedTokenizerBase],
+    get_stop_token_ids: Optional[Callable[[AnyTokenizer], list[int]]],
-                                          List[int]]],
    stop_str: Optional[List[str]],
-    tokenizer_mode: str,
    limit_mm_per_prompt: Dict[str, int],
    vllm_runner_kwargs: Optional[Dict[str, Any]],
    hf_model_kwargs: Optional[Dict[str, Any]],
@@ -48,7 +48,10 @@ def run_test(
    """Modality agnostic test executor for comparing HF/vLLM outputs."""
    # In the case of embeddings, vLLM takes separate input tensors
    vllm_inputs = vllm_embeddings if vllm_embeddings is not None else inputs
-    tokenizer = AutoTokenizer.from_pretrained(model, trust_remote_code=True)
+    model_info = HF_EXAMPLE_MODELS.find_hf_info(model)
+    model_info.check_available_online(on_fail="skip")
+    model_info.check_transformers_version(on_fail="skip")
    vllm_outputs_per_mm = []
    hf_outputs_per_mm = []
@@ -57,17 +60,19 @@ def run_test(
    # vLLM needs a fresh new process without cuda initialization.
    # if we run HF first, the cuda initialization will be done and it
    # will hurt multiprocessing backend with fork method (the default method).
-    vllm_kwargs: Dict[str, Any] = {}
-    if get_stop_token_ids is not None:
-        vllm_kwargs["stop_token_ids"] = get_stop_token_ids(tokenizer)
-    if stop_str:
-        vllm_kwargs["stop"] = stop_str
-    if vllm_runner_kwargs is None:
+    vllm_runner_kwargs_: Dict[str, Any] = {}
-        vllm_runner_kwargs = {}
+    if model_info.tokenizer:
+        vllm_runner_kwargs_["tokenizer"] = model_info.tokenizer
+    if model_info.tokenizer_mode:
+        vllm_runner_kwargs_["tokenizer_mode"] = model_info.tokenizer_mode
+    if model_info.hf_overrides:
+        vllm_runner_kwargs_["hf_overrides"] = model_info.hf_overrides
+    if vllm_runner_kwargs:
+        vllm_runner_kwargs_.update(vllm_runner_kwargs)
    with vllm_runner(model,
-                     tokenizer_mode=tokenizer_mode,
                     max_model_len=max_model_len,
                     max_num_seqs=max_num_seqs,
                     dtype=dtype,
@@ -76,7 +81,15 @@ def run_test(
                     distributed_executor_backend=distributed_executor_backend,
                     enforce_eager=enforce_eager,
                     task=task,
-                     **vllm_runner_kwargs) as vllm_model:
+                     **vllm_runner_kwargs_) as vllm_model:
+        tokenizer = vllm_model.model.get_tokenizer()
+        vllm_kwargs: Dict[str, Any] = {}
+        if get_stop_token_ids is not None:
+            vllm_kwargs["stop_token_ids"] = get_stop_token_ids(tokenizer)
+        if stop_str:
+            vllm_kwargs["stop"] = stop_str
        for prompts, media in vllm_inputs:
            vllm_kwargs[runner_mm_key] = media
            vllm_output = vllm_model.generate_greedy_logprobs(
@@ -93,16 +106,19 @@ def run_test(
    if patch_hf_runner is not None:
        hf_model = patch_hf_runner(hf_model)
-    # Some models need to explicitly pass the eos_token_id off the tokenizer or
-    # processor for a good comparison; currently assume processor/tokenizer
-    # agree on the EOS, and pull it off the tokenizer if requested.
-    hf_kwargs = {}
-    if use_tokenizer_eos:
-        hf_kwargs["eos_token_id"] = tokenizer.eos_token_id
-    if stop_str:
-        hf_kwargs["stop_strings"] = stop_str
    with hf_model, torch.no_grad():
+        tokenizer = hf_model.tokenizer
+        # Some models need to explicitly pass the eos_token_id off the tokenizer
+        # or processor for a good comparison;
+        # currently assume processor/tokenizer agree on the EOS, and pull it off
+        # the tokenizer if requested.
+        hf_kwargs = {}
+        if use_tokenizer_eos:
+            hf_kwargs["eos_token_id"] = tokenizer.eos_token_id
+        if stop_str:
+            hf_kwargs["stop_strings"] = stop_str
        for prompts, media in inputs:
            hf_kwargs[runner_mm_key] = media
            hf_output = hf_model.generate_greedy_logprobs_limit(

--- a/tests/models/decoder_only/vision_language/vlm_utils/model_utils.py
+++ b/tests/models/decoder_only/vision_language/vlm_utils/model_utils.py
@@ -6,7 +6,7 @@ typically specific to a small subset of models.
 import re
 import types
 from pathlib import PosixPath
-from typing import Any, Callable, Dict, List, Optional, Tuple, Union
+from typing import Callable, List, Optional, Tuple, Union
 import torch
 from PIL.Image import Image
@@ -17,9 +17,7 @@ from vllm.sequence import SampleLogprobs
 from vllm.transformers_utils.tokenizer import patch_padding_side
 from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE
-from .....conftest import (HfRunner, ImageAsset, PromptAudioInput,
+from .....conftest import HfRunner, ImageAsset, _ImageAssets
-                           PromptImageInput, PromptVideoInput, _ImageAssets)
-from ....utils import TokensTextLogprobs
 from .types import RunnerOutput
@@ -522,74 +520,7 @@ def minicpmo_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
    return hf_model
-def _generate_greedy_logprobs_limit(
+def molmo_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
-    self,
-    prompts: List[str],
-    max_tokens: int,
-    num_logprobs: int,
-    images: Optional[PromptImageInput] = None,
-    audios: Optional[PromptAudioInput] = None,
-    videos: Optional[PromptVideoInput] = None,
-    **kwargs: Any,
-) -> List[TokensTextLogprobs]:
-    all_inputs = self.get_inputs(prompts,
-                                 images=images,
-                                 videos=videos,
-                                 audios=audios)
-    # Process in batches for inference.
-    if len(all_inputs):
-        input_ids_lst = []
-        images_lst = []
-        images_input_idx_lst = []
-        imges_masks_lst = []
-        for inputs in all_inputs:
-            input_ids_lst.append(inputs["input_ids"])
-            images_lst.append(inputs["images"])
-            images_input_idx_lst.append(inputs["image_input_idx"])
-            imges_masks_lst.append(inputs["image_masks"])
-        batch_inputs = {}
-        batch_inputs['input_ids'] = torch.cat(input_ids_lst, dim=0)
-        batch_inputs['images'] = torch.cat(images_lst, dim=0)
-        batch_inputs['image_input_idx'] = torch.cat(images_input_idx_lst,
-                                                    dim=0)
-        batch_inputs['image_masks'] = torch.cat(imges_masks_lst, dim=0)
-        outputs = self.model.generate_from_batch(
-            batch=self.wrap_device(batch_inputs,
-                                   device=self.model.device.type),
-            generation_config=GenerationConfig(
-                max_new_tokens=max_tokens,
-                stop_strings="<|endoftext|>",
-                do_sample=False,
-            ),
-            tokenizer=self.tokenizer,
-            output_hidden_states=True,
-            return_dict_in_generate=True,
-        )
-    all_logprobs: List[List[Dict[int, float]]] = []
-    all_output_ids: List[List[int]] = []
-    all_output_strs: List[str] = []
-    for index in range(len(all_inputs)):
-        (
-            seq_logprobs_lst,
-            output_len,
-        ) = self._hidden_states_to_logprobs(outputs.hidden_states,
-                                            num_logprobs)
-        all_logprobs.append(seq_logprobs_lst)
-        seq_ids = outputs.sequences[index]
-        output_ids = seq_ids[-output_len:]
-        all_output_ids.append(output_ids.tolist())
-        all_output_strs.append(self.tokenizer.decode(output_ids))
-    outputs = zip(all_output_ids, all_output_strs, all_logprobs)
-    return [(output_ids, output_str, output_logprobs)
-            for output_ids, output_str, output_logprobs in outputs]
-####### Molmo-specific HuggingFace runner patchers
-def mlomo_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
    """Patches and returns an instance of the HfRunner to use for Molmo."""
    hf_processor = hf_model.processor
@@ -598,10 +529,23 @@ def mlomo_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
    hf_model.processor = _processor
-    setattr(  # noqa: B010
+    def _generate(self, max_new_tokens=None, do_sample=None, **kwargs):
-        hf_model,
+        batch = {
-        "generate_greedy_logprobs_limit",
+            k: kwargs.pop(k)
-        types.MethodType(_generate_greedy_logprobs_limit, hf_model),
+            for k in ("input_ids", "images", "image_input_idx", "image_masks")
-    )
+            if k in kwargs
+        }
+        return self.generate_from_batch(
+            batch,
+            generation_config=GenerationConfig(
+                max_new_tokens=max_new_tokens,
+                stop_strings="<|endoftext|>",
+                do_sample=do_sample,
+            ),
+            **kwargs,
+        )
+    hf_model.model.generate = types.MethodType(_generate, hf_model.model)
    return hf_model
--- a/tests/models/decoder_only/vision_language/vlm_utils/types.py
+++ b/tests/models/decoder_only/vision_language/vlm_utils/types.py
@@ -8,12 +8,12 @@ from typing import (Any, Callable, Dict, Iterable, List, NamedTuple, Optional,
 import torch
 from PIL.Image import Image
 from pytest import MarkDecorator
-from transformers import (AutoModelForCausalLM, BatchEncoding,
+from transformers import AutoModelForCausalLM, BatchEncoding
-                          PreTrainedTokenizerBase)
 from transformers.models.auto.auto_factory import _BaseAutoModelClass
 from vllm.config import TaskOption
 from vllm.sequence import SampleLogprobs
+from vllm.transformers_utils.tokenizer import AnyTokenizer
 from vllm.utils import identity
 from .....conftest import IMAGE_ASSETS, HfRunner, ImageAsset, _ImageAssets
@@ -100,8 +100,7 @@ class VLMTestInfo(NamedTuple):
    vllm_runner_kwargs: Optional[Dict[str, Any]] = None
    # Optional callable which gets a list of token IDs from the model tokenizer
-    get_stop_token_ids: Optional[Callable[[PreTrainedTokenizerBase],
+    get_stop_token_ids: Optional[Callable[[AnyTokenizer], list[int]]] = None
-                                          List[int]]] = None
    # Optional list of strings to stop generation, useful when stop tokens are
    # not special tokens in the tokenizer
    stop_str: Optional[List[str]] = None
@@ -156,8 +155,6 @@ class VLMTestInfo(NamedTuple):
    marks: Optional[List[MarkDecorator]] = None
-    tokenizer_mode: str = "auto"
    def get_non_parametrized_runner_kwargs(self):
        """Returns a dictionary of expandable kwargs for items that are used
        in all test types, which are NOT used when creating the parametrized
@@ -180,7 +177,6 @@ class VLMTestInfo(NamedTuple):
            "hf_model_kwargs": self.hf_model_kwargs,
            "stop_str": self.stop_str,
            "patch_hf_runner": self.patch_hf_runner,
-            "tokenizer_mode": self.tokenizer_mode
        }

--- a/tests/models/encoder_decoder/vision_language/test_mllama.py
+++ b/tests/models/encoder_decoder/vision_language/test_mllama.py
@@ -8,11 +8,11 @@ import torch
 from transformers import (AutoConfig, AutoModelForVision2Seq, AutoTokenizer,
                          BatchEncoding)
+from vllm import LLM, SamplingParams
 from vllm.attention.backends.flash_attn import FlashAttentionMetadata
 from vllm.attention.selector import (_Backend, _cached_get_attn_backend,
                                     global_force_attn_backend_context_manager)
-from vllm.model_executor.models.mllama import (MLLAMA_IMAGE_TOKEN_ID,
+from vllm.model_executor.models.mllama import MllamaForConditionalGeneration
-                                               MllamaForConditionalGeneration)
 from vllm.multimodal.image import rescale_image_size
 from vllm.sequence import SampleLogprobs
@@ -23,6 +23,7 @@ from ...utils import check_logprobs_close
 from ....utils import models_path_prefix
 _LIMIT_IMAGE_PER_PROMPT = 3
+MLLAMA_IMAGE_TOKEN_ID = 128256
 LIST_ENC_DEC_SUPPORTED_BACKENDS = [_Backend.XFORMERS, _Backend.FLASH_ATTN]
@@ -398,6 +399,64 @@ def test_models_interleaved_images(hf_runner, vllm_runner, image_assets, model,
        )
+@large_gpu_test(min_gb=48)
+@pytest.mark.core_model
+@pytest.mark.parametrize("model", models)
+@pytest.mark.parametrize("dtype", ["bfloat16"])
+@pytest.mark.parametrize("max_tokens", [32])
+def test_explicit_implicit_prompt(
+    image_assets: _ImageAssets,
+    model: str,
+    dtype: str,
+    max_tokens: int,
+):
+    # Check that explicit encoder/decoder prompts and plain ("implicit")
+    # prompts generate the same text. The first half of `prompts` holds the
+    # explicit form and the second half the implicit form; the halves are
+    # compared pairwise by position, so they must stay aligned.
+    stop_sign = image_assets[0].pil_image
+    # yapf: disable
+    prompts = [
+        # explicit prompt: encoder and decoder inputs are given separately
+        {
+            "encoder_prompt": {
+                "prompt": "<|image|>",
+                "multi_modal_data": {"image": stop_sign},
+            },
+            "decoder_prompt": {
+                # 128256 is MLLAMA_IMAGE_TOKEN_ID; presumably these ids are
+                # the tokenization of the first implicit prompt below
+                # -- TODO confirm
+                "prompt_token_ids": [128000, 791, 2262, 315, 279, 2217, 220, 128256, 374],  # noqa: E501
+            }
+        },
+        {
+            "encoder_prompt": "Not <|image|>",
+            "decoder_prompt": "The color of the sky is blue but sometimes it can also be",  # noqa: E501
+        },
+        # implicit prompt: a single prompt, with or without image data
+        {
+            "prompt": "<|begin_of_text|>The content of the image <|image|> is", # noqa: E501
+            "multi_modal_data": {"image": stop_sign},
+        },
+        {
+            "prompt": "The color of the sky is blue but sometimes it can also be",  # noqa: E501
+        },
+    ]
+    # yapf: enable
+    llm = LLM(
+        model=model,
+        dtype=dtype,
+        max_model_len=4096,
+        max_num_seqs=2,
+        tensor_parallel_size=1,
+        enforce_eager=True,
+    )
+    # temperature=0 so that the paired outputs can be compared for equality
+    sampling_params = SamplingParams(
+        temperature=0,
+        max_tokens=max_tokens,
+    )
+    outputs = llm.generate(prompts, sampling_params)
+    # Split the outputs back into the explicit and implicit halves
+    # (assumes outputs are returned in prompt order -- TODO confirm).
+    n_prompts = len(prompts)
+    explicit_outputs = outputs[:n_prompts // 2]
+    implicit_outputs = outputs[n_prompts // 2:]
+    for exp_output, imp_output in zip(explicit_outputs, implicit_outputs):
+        assert exp_output.outputs[0].text == imp_output.outputs[0].text
 @large_gpu_test(min_gb=48)
 @pytest.mark.core_model
 @pytest.mark.parametrize("model", models)
@@ -460,6 +519,10 @@ def test_regression(vllm_runner, image_assets, model, dtype, max_tokens,
                                            images=images)
+class DummyModel:
+    # Minimal stand-in passed in place of a model instance when the tests
+    # call MllamaForConditionalGeneration methods directly; it only provides
+    # `image_token_id` (presumably the one attribute those methods read
+    # from `self` -- TODO confirm).
+    image_token_id = MLLAMA_IMAGE_TOKEN_ID
 @pytest.mark.core_model
 @pytest.mark.parametrize(
    "input_indices_and_output",
@@ -501,7 +564,7 @@ def test_get_cross_attention_mask(input_indices_and_output) -> None:
        use_cuda_graph=False,
    )
-    dummy: dict[str, str] = {}
+    dummy = DummyModel()
    cross_attention_mask, kv_range_for_decode = MllamaForConditionalGeneration\
        .get_cross_attention_mask(dummy,
@@ -558,7 +621,7 @@ def test_get_full_text_row_masked_out_mask(input_indices) -> None:
        use_cuda_graph=False,
    )
-    dummy: dict[str, str] = {}
+    dummy = DummyModel()
    full_text_row_masked_out_mask = MllamaForConditionalGeneration\
        .get_full_text_row_masked_out_mask(dummy,

--- a/tests/models/multimodal/processing/test_common.py
+++ b/tests/models/multimodal/processing/test_common.py
@@ -10,7 +10,7 @@ from vllm.config import ModelConfig
 from vllm.inputs import InputProcessingContext
 from vllm.multimodal import MULTIMODAL_REGISTRY
 from vllm.multimodal.processing import ProcessingCache
-from vllm.multimodal.utils import cached_get_tokenizer
+from vllm.transformers_utils.tokenizer import cached_tokenizer_from_config
 from ....multimodal.utils import random_audio, random_image, random_video
 from ...registry import HF_EXAMPLE_MODELS
@@ -42,10 +42,7 @@ def _test_processing_correctness(
    factories = MULTIMODAL_REGISTRY._processor_factories[model_cls]
    ctx = InputProcessingContext(
        model_config,
-        tokenizer=cached_get_tokenizer(
+        tokenizer=cached_tokenizer_from_config(model_config),
-            model_config.tokenizer,
-            trust_remote_code=model_info.trust_remote_code,
-        ),
    )
    # Ensure that it can fit all of the data
    cache = ProcessingCache(capacity=1 << 30)
@@ -85,11 +82,19 @@ def _test_processing_correctness(
        partial(random_audio, rng, min_len=512, max_len=1024, sr=16000),
    }
+    tokenizer_encode_kwargs = {}
+    if model_config.hf_config.model_type == "mllama":
+        # For Mllama, the tokenizer always adds bos_token at the beginning of
+        # the prompt by default, causing hf_processor to output incorrect
+        # token ids. So we need to use `add_special_tokens=False` here and
+        # leave bos_token to be added by the processor.
+        tokenizer_encode_kwargs = {"add_special_tokens": False}
    for batch_idx in range(num_batches):
        mm_data = {
            k:
            [(input_to_hit[k] if rng.rand() < hit_rate else input_factory[k]())
-             for _ in range(rng.randint(limit))]
+             for _ in range(rng.randint(limit + 1))]
            for k, limit in limit_mm_per_prompt.items()
        }
@@ -122,7 +127,7 @@ def _test_processing_correctness(
            f"Failed ({batch_idx=}, {prompt=}, {mm_data=})")
        baseline_tokenized_result = baseline_processor.apply(
-            tokenizer.encode(prompt),
+            tokenizer.encode(prompt, **tokenizer_encode_kwargs),
            mm_data=mm_data,
            hf_processor_mm_kwargs={},
        )
@@ -131,7 +136,7 @@ def _test_processing_correctness(
            f"Failed ({batch_idx=}, {prompt=}, {mm_data=})")
        cached_tokenized_result = cached_processor.apply(
-            tokenizer.encode(prompt),
+            tokenizer.encode(prompt, **tokenizer_encode_kwargs),
            mm_data=mm_data,
            hf_processor_mm_kwargs={},
        )
@@ -147,6 +152,7 @@ def _test_processing_correctness(
    "facebook/chameleon-7b",
    "deepseek-ai/deepseek-vl2-tiny",
    "adept/fuyu-8b",
+    "THUDM/glm-4v-9b",
    "h2oai/h2ovl-mississippi-800m",
    "OpenGVLab/InternVL2-1B",
    "HuggingFaceM4/Idefics3-8B-Llama3",
@@ -154,16 +160,19 @@ def _test_processing_correctness(
    "llava-hf/llava-v1.6-mistral-7b-hf",
    "llava-hf/LLaVA-NeXT-Video-7B-hf",
    "llava-hf/llava-onevision-qwen2-0.5b-ov-hf",
+    "meta-llama/Llama-3.2-11B-Vision-Instruct",
    "TIGER-Lab/Mantis-8B-siglip-llama3",
    "mistral-community/pixtral-12b",
    "openbmb/MiniCPM-o-2_6",
    "openbmb/MiniCPM-V-2_6",
+    "allenai/Molmo-7B-D-0924",
+    "allenai/Molmo-7B-O-0924",
    "nvidia/NVLM-D-72B",
    "Qwen/Qwen-VL-Chat",
    "Qwen/Qwen2-VL-2B-Instruct",
    "Qwen/Qwen2.5-VL-3B-Instruct",
    "Qwen/Qwen2-Audio-7B-Instruct",
-    "fixie-ai/ultravox-v0_3",
+    "fixie-ai/ultravox-v0_5-llama-3_2-1b",
 ])
 @pytest.mark.parametrize("hit_rate", [0.3, 0.5, 1.0])
 @pytest.mark.parametrize("num_batches", [32])

--- a/tests/models/multimodal/processing/test_h2ovl.py
+++ b/tests/models/multimodal/processing/test_h2ovl.py
 # SPDX-License-Identifier: Apache-2.0
 """Tests for H2OVL's multimodal preprocessing kwargs."""
-from typing import Optional
+from typing import Mapping, Optional
 import pytest
+from PIL import Image
+from transformers import PretrainedConfig
 from vllm.multimodal import MULTIMODAL_REGISTRY
 from vllm.multimodal.image import rescale_image_size
-from vllm.multimodal.utils import cached_get_tokenizer
+from vllm.multimodal.processing import BaseMultiModalProcessor
+from vllm.transformers_utils.tokenizer import cached_tokenizer_from_config
 from ....conftest import _ImageAssets
 from ...utils import build_model_context
+def _get_expected_num_patches(
+    config: PretrainedConfig,
+    image: Image.Image,
+    num_imgs: int,
+    min_num: int,
+    max_num: int,
+):
+    # Return the number of image patches the test expects for `image`,
+    # computed with the H2OVL model's own target helpers. `num_imgs` is the
+    # number of images in the whole prompt: the two-pass path below is only
+    # taken for a single image when `config.use_msac` is set.
+    from vllm.model_executor.models.h2ovl import (calculate_h2ovl_targets,
+                                                  get_h2ovl_target_ratios)
+    width, height = image.size
+    # Calculate the expected number of blocks
+    if num_imgs == 1 and config.use_msac:
+        # NOTE(review): both passes hard-code min_num (1 and 3) and ignore
+        # the `min_num` argument -- confirm this mirrors the model code.
+        # First pass
+        blocks1, _, _, aspect_ratio = calculate_h2ovl_targets(
+            orig_width=width,
+            orig_height=height,
+            target_ratios=get_h2ovl_target_ratios(
+                min_num=1,
+                max_num=max_num,
+                prior_aspect_ratio=None,
+            ),
+            image_size=config.vision_config.image_size,
+            use_thumbnail=False,  # Thumbnail is handled separately
+        )
+        # Second pass, given the aspect ratio returned by the first pass
+        blocks2, _, _, _ = calculate_h2ovl_targets(
+            orig_width=width,
+            orig_height=height,
+            target_ratios=get_h2ovl_target_ratios(
+                min_num=3,
+                max_num=max_num,
+                prior_aspect_ratio=aspect_ratio,
+            ),
+            image_size=config.vision_config.image_size,
+            use_thumbnail=False,
+        )
+        # Add thumbnail if use_thumbnail is True and total_blocks > 1
+        if config.use_thumbnail:
+            blocks1 += 1 if blocks1 > 1 else 0
+            blocks2 += 1 if blocks2 > 1 else 0
+        # Total blocks is the sum of blocks from both passes minus
+        # overlapping
+        total_blocks = blocks1 + blocks2 - 1
+        return total_blocks
+    # Single pass for every other case (several images, or MSAC disabled);
+    # here the caller's `min_num` is used.
+    blocks, _, _, _ = calculate_h2ovl_targets(
+        orig_width=width,
+        orig_height=height,
+        target_ratios=get_h2ovl_target_ratios(
+            min_num,
+            max_num,
+            prior_aspect_ratio=None,
+        ),
+        image_size=config.vision_config.image_size,
+        use_thumbnail=False,
+    )
+    expected_num_patches = blocks
+    # One extra (thumbnail) patch is counted only when the image was split
+    # into more than one block.
+    if config.use_thumbnail and expected_num_patches > 1:
+        expected_num_patches += 1
+    return expected_num_patches
+def _run_check(
+    processor: BaseMultiModalProcessor,
+    images: list[Image.Image],
+    min_num: int,
+    max_num: int,
+    mm_processor_kwargs: Mapping[str, object],
+):
+    # Run `processor` on a prompt with one "<image>" placeholder per image
+    # and assert that the output agrees with the patch count computed by
+    # _get_expected_num_patches, summed over all images.
+    tokenizer = processor.info.get_tokenizer()
+    config = processor.info.get_hf_config()
+    mm_data = {"image": images}
+    total_expected_num_patches = sum(
+        _get_expected_num_patches(config, image, len(images), min_num, max_num)
+        for image in images)
+    processed_inputs = processor.apply("<image>" * len(images), mm_data,
+                                       mm_processor_kwargs)
+    # Ensure the number of <IMG_CONTEXT> placeholder tokens (256 expected per
+    # patch) and the leading dimension of the pixel values both match the
+    # expected number of patches
+    image_token_id = tokenizer.convert_tokens_to_ids("<IMG_CONTEXT>")
+    img_tok_count = processed_inputs["prompt_token_ids"].count(image_token_id)
+    pixel_shape = processed_inputs["mm_kwargs"]["pixel_values_flat"].shape
+    assert img_tok_count == 256 * total_expected_num_patches
+    assert pixel_shape[0] == total_expected_num_patches
 @pytest.mark.parametrize("model_id", [
    "h2oai/h2ovl-mississippi-800m",
    "h2oai/h2ovl-mississippi-2b",
@@ -25,118 +126,54 @@ from ...utils import build_model_context
        [1.0, 1.0, 1.0],
        # Multi-scale
        [0.25, 0.5, 1.0],
+        [4.0, 2.0, 1.0],
    ],
 )
-@pytest.mark.parametrize("max_dynamic_patch", [1, 2, 4, 8])
+@pytest.mark.parametrize(
+    ("min_dynamic_patch", "max_dynamic_patch"),
+    [(1, 1), (1, 2), (1, 4), (1, 8), (2, 4), (4, 8)],
+)
 @pytest.mark.parametrize("dynamic_image_size", [True, False])
-@pytest.mark.parametrize("num_imgs", [1, 2])
+@pytest.mark.parametrize("kwargs_on_init", [True, False])
 def test_processor_override(
    model_id: str,
    image_assets: _ImageAssets,
    size_factors: list[int],
+    min_dynamic_patch: int,
    max_dynamic_patch: int,
    dynamic_image_size: Optional[bool],
-    num_imgs: int,
+    kwargs_on_init: bool,
 ):
-    from vllm.model_executor.models.h2ovl import (calculate_h2ovl_targets,
+    mm_processor_kwargs = {
-                                                  get_h2ovl_target_ratios)
+        "min_dynamic_patch": min_dynamic_patch,
+        "max_dynamic_patch": max_dynamic_patch,
+        "dynamic_image_size": dynamic_image_size,
+    }
    ctx = build_model_context(
        model_name=model_id,
        tokenizer_name=model_id,
        trust_remote_code=True,
-        mm_processor_kwargs=None,
+        mm_processor_kwargs=mm_processor_kwargs if kwargs_on_init else None,
-        limit_mm_per_prompt={"image": num_imgs},
+        limit_mm_per_prompt={"image": len(size_factors)},
-    )
-    tokenizer = cached_get_tokenizer(
-        ctx.model_config.tokenizer,
-        trust_remote_code=ctx.model_config.trust_remote_code,
    )
+    tokenizer = cached_tokenizer_from_config(ctx.model_config)
    processor = MULTIMODAL_REGISTRY.create_processor(
        ctx.model_config,
        tokenizer=tokenizer,
    )
+    hf_processor_mm_kwargs = {} if kwargs_on_init else mm_processor_kwargs
-    config = processor.info.get_hf_config()
+    min_num = min_dynamic_patch if dynamic_image_size else 1
-    use_msac = config.use_msac
-    mm_processor_kwargs = {
-        "max_dynamic_patch": max_dynamic_patch,
-    }
-    if dynamic_image_size is not None:
-        mm_processor_kwargs["dynamic_image_size"] = dynamic_image_size
-    min_num = config.min_dynamic_patch
    max_num = max_dynamic_patch if dynamic_image_size else 1
-    # Build the image str / prompt based on the number of images we pass
+    _run_check(
-    prompt = "<image>" * num_imgs
+        processor,
+        [
-    for asset in image_assets:
+            rescale_image_size(image_assets[0].pil_image, f)
-        for factor in size_factors:
+            for f in size_factors
-            image = rescale_image_size(asset.pil_image, factor)
+        ],
-            mm_data = {"image": [image] * num_imgs}
+        min_num,
+        max_num,
-            width, height = image.size
+        hf_processor_mm_kwargs,
+    )
-            # Calculate the expected number of blocks
-            if num_imgs == 1 and use_msac:
-                # First pass
-                blocks1, _, _, aspect_ratio = calculate_h2ovl_targets(
-                    orig_width=width,
-                    orig_height=height,
-                    target_ratios=get_h2ovl_target_ratios(
-                        min_num,
-                        max_num,
-                        prior_aspect_ratio=None,
-                    ),
-                    image_size=config.vision_config.image_size,
-                    use_thumbnail=False,  # Thumbnail is handled separately
-                )
-                # Second pass
-                blocks2, _, _, _ = calculate_h2ovl_targets(
-                    orig_width=width,
-                    orig_height=height,
-                    target_ratios=get_h2ovl_target_ratios(
-                        min_num,
-                        max_num,
-                        prior_aspect_ratio=aspect_ratio,
-                    ),
-                    image_size=config.vision_config.image_size,
-                    use_thumbnail=False,
-                )
-                # Add thumbnail if use_thumbnail is True and total_blocks > 1
-                if config.use_thumbnail:
-                    blocks1 += 1 if blocks1 > 1 else 0
-                    blocks2 += 1 if blocks2 > 1 else 0
-                # Total blocks is the sum of blocks from both passes minus
-                # overlapping
-                total_blocks = blocks1 + blocks2 - 1
-                expected_num_patches = total_blocks
-            else:
-                blocks, _, _, _ = calculate_h2ovl_targets(
-                    orig_width=width,
-                    orig_height=height,
-                    target_ratios=get_h2ovl_target_ratios(
-                        min_num,
-                        max_num,
-                        prior_aspect_ratio=None,
-                    ),
-                    image_size=config.vision_config.image_size,
-                    use_thumbnail=False,
-                )
-                expected_num_patches = blocks
-                if config.use_thumbnail and expected_num_patches != 1:
-                    expected_num_patches += 1
-            processed_inputs = processor.apply(prompt, mm_data,
-                                               mm_processor_kwargs)
-            pixel_shape = (
-                processed_inputs["mm_kwargs"]["pixel_values_flat"].shape)
-            assert pixel_shape[0] == expected_num_patches * num_imgs
--- a/tests/models/multimodal/processing/test_idefics3.py
+++ b/tests/models/multimodal/processing/test_idefics3.py
@@ -5,7 +5,7 @@ import pytest
 from transformers import Idefics3Config
 from vllm.multimodal import MULTIMODAL_REGISTRY
-from vllm.multimodal.utils import cached_get_tokenizer
+from vllm.transformers_utils.tokenizer import cached_tokenizer_from_config
 from ....conftest import _ImageAssets
 from ...utils import build_model_context
@@ -24,9 +24,15 @@ models = [os.path.join(models_path_prefix, "HuggingFaceM4/Idefics3-8B-Llama3")]
    ])
 # yapf: enable
 @pytest.mark.parametrize("num_imgs", [1, 2])
-def test_processor_override(image_assets: _ImageAssets, model: str,
+@pytest.mark.parametrize("kwargs_on_init", [True, False])
-                            mm_processor_kwargs: dict[str, object],
+def test_processor_override(
-                            expected_toks_per_img: int, num_imgs: int):
+    image_assets: _ImageAssets,
+    model: str,
+    mm_processor_kwargs: dict[str, object],
+    expected_toks_per_img: int,
+    num_imgs: int,
+    kwargs_on_init: bool,
+):
    """Ensure input_processor_for_idefics3 handles num_crops properly."""
    # Same as the previous test - don't initialize mm_processor_kwargs
    # in this test and assume that the kwargs will be correctly expanded by
@@ -35,15 +41,15 @@ def test_processor_override(image_assets: _ImageAssets, model: str,
        model_name=model,
        tokenizer_name=model,
        trust_remote_code=True,
-        mm_processor_kwargs=None,
+        mm_processor_kwargs=mm_processor_kwargs if kwargs_on_init else None,
        limit_mm_per_prompt={"image": num_imgs},
    )
-    tokenizer = cached_get_tokenizer(ctx.model_config.tokenizer)
+    tokenizer = cached_tokenizer_from_config(ctx.model_config)
    processor = MULTIMODAL_REGISTRY.create_processor(
        ctx.model_config,
        tokenizer=tokenizer,
    )
-    hf_processor = processor.info.get_hf_processor(**mm_processor_kwargs)
+    hf_processor_mm_kwargs = {} if kwargs_on_init else mm_processor_kwargs
    # Build the image str / prompt based on the number of images we pass
    placeholders = "<image>" if num_imgs == 1 else "\n".join(
@@ -56,8 +62,10 @@ def test_processor_override(image_assets: _ImageAssets, model: str,
    dummy_image = image_assets[0].pil_image.resize(dummy_image_size)
    mm_data = {"image": [dummy_image] * num_imgs}
-    processed_inputs = processor.apply(prompt, mm_data, mm_processor_kwargs)
+    processed_inputs = processor.apply(prompt, mm_data, hf_processor_mm_kwargs)
    # Ensure the placeholder format is correct
+    hf_processor = processor.info.get_hf_processor(**hf_processor_mm_kwargs)
    hf_processed_inputs = hf_processor(text=prompt, images=mm_data["image"])
    assert processed_inputs["prompt_token_ids"] == hf_processed_inputs[
        "input_ids"][0]

--- a/tests/models/multimodal/processing/test_internvl.py
+++ b/tests/models/multimodal/processing/test_internvl.py
 # SPDX-License-Identifier: Apache-2.0
 """Tests for InternVL's multimodal preprocessing kwargs."""
-from typing import Optional
+from typing import Mapping, Optional
 import os
 import pytest
+from PIL import Image
+from transformers import PretrainedConfig
 from vllm.multimodal import MULTIMODAL_REGISTRY
-from vllm.multimodal.utils import cached_get_tokenizer
+from vllm.multimodal.image import rescale_image_size
+from vllm.multimodal.processing import BaseMultiModalProcessor
+from vllm.transformers_utils.tokenizer import cached_tokenizer_from_config
 from ....conftest import _ImageAssets
 from ...utils import build_model_context
 from ....utils import models_path_prefix
+def _get_expected_num_patches(
+    config: PretrainedConfig,
+    image: Image.Image,
+    num_imgs: int,
+    min_num: int,
+    max_num: int,
+):
+    # Return the number of image patches the test expects for `image`,
+    # computed with the InternVL model's own target helpers.
+    # `num_imgs` is accepted but not used in this function.
+    from vllm.model_executor.models.internvl import (
+        calculate_internvl_targets, get_internvl_target_ratios)
+    width, height = image.size
+    blocks, _, _ = calculate_internvl_targets(
+        orig_width=width,
+        orig_height=height,
+        target_ratios=get_internvl_target_ratios(
+            min_num,
+            max_num,
+        ),
+        image_size=config.vision_config.image_size,
+        use_thumbnail=False,
+    )
+    expected_num_patches = blocks
+    # The thumbnail is excluded above (use_thumbnail=False) and counted here:
+    # one extra patch, only when the image was split into more than one block.
+    if config.use_thumbnail and expected_num_patches > 1:
+        expected_num_patches += 1
+    return expected_num_patches
+def _run_check(
+    processor: BaseMultiModalProcessor,
+    images: list[Image.Image],
+    min_num: int,
+    max_num: int,
+    mm_processor_kwargs: Mapping[str, object],
+):
+    # Run `processor` on a prompt with one "<image>" placeholder per image
+    # and assert that the output agrees with the patch count computed by
+    # _get_expected_num_patches, summed over all images.
+    tokenizer = processor.info.get_tokenizer()
+    config = processor.info.get_hf_config()
+    mm_data = {"image": images}
+    total_expected_num_patches = sum(
+        _get_expected_num_patches(config, image, len(images), min_num, max_num)
+        for image in images)
+    processed_inputs = processor.apply("<image>" * len(images), mm_data,
+                                       mm_processor_kwargs)
+    # Ensure the number of <IMG_CONTEXT> placeholder tokens (256 expected per
+    # patch) and the leading dimension of the pixel values both match the
+    # expected number of patches
+    image_token_id = tokenizer.convert_tokens_to_ids("<IMG_CONTEXT>")
+    img_tok_count = processed_inputs["prompt_token_ids"].count(image_token_id)
+    pixel_shape = processed_inputs["mm_kwargs"]["pixel_values_flat"].shape
+    assert img_tok_count == 256 * total_expected_num_patches
+    assert pixel_shape[0] == total_expected_num_patches
 @pytest.mark.parametrize("model_id", [os.path.join(models_path_prefix, "OpenGVLab/InternVL2-2B")])
-@pytest.mark.parametrize("max_dynamic_patch", [1, 4])
+@pytest.mark.parametrize(
-@pytest.mark.parametrize("dynamic_image_size", [True, False, None])
+    "size_factors",
-@pytest.mark.parametrize("num_imgs", [1, 2])
+    [
+        # Single-scale
+        [1.0],
+        # Single-scale, batched
+        [1.0, 1.0, 1.0],
+        # Multi-scale
+        [0.25, 0.5, 1.0],
+        [4.0, 2.0, 1.0],
+    ],
+)
+@pytest.mark.parametrize(
+    ("min_dynamic_patch", "max_dynamic_patch"),
+    [(1, 1), (1, 2), (1, 4), (1, 8), (2, 4), (4, 8)],
+)
+@pytest.mark.parametrize("dynamic_image_size", [True, False])
+@pytest.mark.parametrize("kwargs_on_init", [True, False])
 def test_processor_override(
    model_id: str,
    image_assets: _ImageAssets,
+    size_factors: list[int],
+    min_dynamic_patch: int,
    max_dynamic_patch: int,
    dynamic_image_size: Optional[bool],
-    num_imgs: int,
+    kwargs_on_init: bool,
 ):
+    # Check patch and placeholder counts for each combination of image
+    # scales and dynamic-patch settings. The processor kwargs are supplied
+    # either when the model context is built (kwargs_on_init=True) or on the
+    # call to the processor (kwargs_on_init=False), never both.
+    mm_processor_kwargs = {
+        "min_dynamic_patch": min_dynamic_patch,
+        "max_dynamic_patch": max_dynamic_patch,
+        "dynamic_image_size": dynamic_image_size,
+    }
    ctx = build_model_context(
        model_name=model_id,
        tokenizer_name=model_id,
        trust_remote_code=True,
-        mm_processor_kwargs=None,
+        mm_processor_kwargs=mm_processor_kwargs if kwargs_on_init else None,
-        limit_mm_per_prompt={"image": num_imgs},
+        limit_mm_per_prompt={"image": len(size_factors)},
-    )
-    tokenizer = cached_get_tokenizer(
-        ctx.model_config.tokenizer,
-        trust_remote_code=ctx.model_config.trust_remote_code,
    )
+    tokenizer = cached_tokenizer_from_config(ctx.model_config)
    processor = MULTIMODAL_REGISTRY.create_processor(
        ctx.model_config,
        tokenizer=tokenizer,
    )
+    hf_processor_mm_kwargs = {} if kwargs_on_init else mm_processor_kwargs
+    # With dynamic_image_size disabled, both patch bounds collapse to 1.
-    mm_processor_kwargs = {
+    min_num = min_dynamic_patch if dynamic_image_size else 1
-        "max_dynamic_patch": max_dynamic_patch,
+    max_num = max_dynamic_patch if dynamic_image_size else 1
-    }
-    if dynamic_image_size is not None:
-        mm_processor_kwargs["dynamic_image_size"] = dynamic_image_size
-    # Build the image str / prompt based on the number of images we pass
+    _run_check(
-    prompt = "<image>" * num_imgs
+        processor,
-    image = image_assets[0].pil_image.resize((448 * 2, 448 * 2))
+        [
-    mm_data = {"image": [image] * num_imgs}
+            rescale_image_size(image_assets[0].pil_image, f)
+            for f in size_factors
-    expected_num_patches = max_dynamic_patch + 1 if max_dynamic_patch > 1 else 1
+        ],
-    if dynamic_image_size is False:
+        min_num,
-        expected_num_patches = 1
+        max_num,
+        hf_processor_mm_kwargs,
-    processed_inputs = processor.apply(prompt, mm_data, mm_processor_kwargs)
+    )
-    # Ensure we have the right number of placeholders per num_crops size
-    image_token_id = tokenizer.convert_tokens_to_ids("<IMG_CONTEXT>")
-    img_tok_count = processed_inputs["prompt_token_ids"].count(image_token_id)
-    pixel_shape = processed_inputs["mm_kwargs"]["pixel_values_flat"].shape
-    assert img_tok_count == 256 * expected_num_patches * num_imgs
-    assert pixel_shape[0] == expected_num_patches * num_imgs
--- a/tests/models/multimodal/processing/test_llava_next.py
+++ b/tests/models/multimodal/processing/test_llava_next.py
@@ -10,7 +10,7 @@ from pqdm.threads import pqdm
 from vllm.multimodal import MULTIMODAL_REGISTRY
 from vllm.multimodal.parse import ImageSize
 from vllm.multimodal.processing import BaseMultiModalProcessor
-from vllm.multimodal.utils import cached_get_tokenizer
+from vllm.transformers_utils.tokenizer import cached_tokenizer_from_config
 from ...utils import build_model_context
@@ -43,10 +43,7 @@ def test_processor_max_tokens(model_id):
    )
    processor = MULTIMODAL_REGISTRY.create_processor(
        ctx.model_config,
-        tokenizer=cached_get_tokenizer(
+        tokenizer=cached_tokenizer_from_config(ctx.model_config),
-            ctx.model_config.tokenizer,
-            trust_remote_code=ctx.model_config.trust_remote_code,
-        ),
    )
    info = processor.info
@@ -146,10 +143,7 @@ def test_processor_prompt_replacements_regression(model_id, num_imgs):
    )
    processor = MULTIMODAL_REGISTRY.create_processor(
        ctx.model_config,
-        tokenizer=cached_get_tokenizer(
+        tokenizer=cached_tokenizer_from_config(ctx.model_config),
-            ctx.model_config.tokenizer,
-            trust_remote_code=ctx.model_config.trust_remote_code,
-        ),
    )
    image_ratios = [(171, 152), (184, 161), (198, 176), (333, 296), (369, 328),
@@ -179,10 +173,7 @@ def test_processor_prompt_replacements_all(model_id, num_imgs):
    )
    processor = MULTIMODAL_REGISTRY.create_processor(
        ctx.model_config,
-        tokenizer=cached_get_tokenizer(
+        tokenizer=cached_tokenizer_from_config(ctx.model_config),
-            ctx.model_config.tokenizer,
-            trust_remote_code=ctx.model_config.trust_remote_code,
-        ),
    )
    seen_aspect_ratios = set[float]()

--- a/tests/models/multimodal/processing/test_llava_onevision.py
+++ b/tests/models/multimodal/processing/test_llava_onevision.py
@@ -10,7 +10,7 @@ from pqdm.threads import pqdm
 from vllm.multimodal import MULTIMODAL_REGISTRY
 from vllm.multimodal.parse import ImageSize
 from vllm.multimodal.processing import BaseMultiModalProcessor
-from vllm.multimodal.utils import cached_get_tokenizer
+from vllm.transformers_utils.tokenizer import cached_tokenizer_from_config
 from ...utils import build_model_context
@@ -44,10 +44,7 @@ def test_processor_max_tokens(model_id):
    )
    processor = MULTIMODAL_REGISTRY.create_processor(
        ctx.model_config,
-        tokenizer=cached_get_tokenizer(
+        tokenizer=cached_tokenizer_from_config(ctx.model_config),
-            ctx.model_config.tokenizer,
-            trust_remote_code=ctx.model_config.trust_remote_code,
-        ),
    )
    info = processor.info
@@ -146,10 +143,7 @@ def test_processor_prompt_replacements_regression(model_id, num_imgs):
    )
    processor = MULTIMODAL_REGISTRY.create_processor(
        ctx.model_config,
-        tokenizer=cached_get_tokenizer(
+        tokenizer=cached_tokenizer_from_config(ctx.model_config),
-            ctx.model_config.tokenizer,
-            trust_remote_code=ctx.model_config.trust_remote_code,
-        ),
    )
    image_ratios = [(171, 152), (184, 161), (198, 176), (333, 296), (369, 328),
@@ -180,10 +174,7 @@ def test_processor_prompt_replacements_all(model_id, num_imgs):
    )
    processor = MULTIMODAL_REGISTRY.create_processor(
        ctx.model_config,
-        tokenizer=cached_get_tokenizer(
+        tokenizer=cached_tokenizer_from_config(ctx.model_config),
-            ctx.model_config.tokenizer,
-            trust_remote_code=ctx.model_config.trust_remote_code,
-        ),
    )
    seen_aspect_ratios = set[float]()

--- a/tests/models/multimodal/processing/test_phi3v.py
+++ b/tests/models/multimodal/processing/test_phi3v.py
@@ -3,7 +3,7 @@
 import pytest
 from vllm.multimodal import MULTIMODAL_REGISTRY
-from vllm.multimodal.utils import cached_get_tokenizer
+from vllm.transformers_utils.tokenizer import cached_tokenizer_from_config
 from ....conftest import _ImageAssets
 from ...utils import build_model_context
@@ -21,12 +21,14 @@ from ...utils import build_model_context
    ])
 # yapf: enable
 @pytest.mark.parametrize("num_imgs", [1, 2])
+@pytest.mark.parametrize("kwargs_on_init", [True, False])
 def test_processor_override(
    image_assets: _ImageAssets,
    model_id: str,
    mm_processor_kwargs: dict[str, int],
    expected_toks_per_img: int,
    num_imgs: int,
+    kwargs_on_init: bool,
 ):
    """Ensure input_processor_for_phi3v handles num_crops properly."""
    # Avoid initializing CUDA early
@@ -36,23 +38,22 @@ def test_processor_override(
        model_name=model_id,
        tokenizer_name=model_id,
        trust_remote_code=True,
+        mm_processor_kwargs=mm_processor_kwargs if kwargs_on_init else None,
        limit_mm_per_prompt={"image": num_imgs},
    )
-    tokenizer = cached_get_tokenizer(
+    tokenizer = cached_tokenizer_from_config(ctx.model_config)
-        ctx.model_config.tokenizer,
-        trust_remote_code=ctx.model_config.trust_remote_code,
-    )
    processor = MULTIMODAL_REGISTRY.create_processor(
        ctx.model_config,
        tokenizer=tokenizer,
    )
+    hf_processor_mm_kwargs = {} if kwargs_on_init else mm_processor_kwargs
    # Build the image str / prompt based on the number of images we pass
    img_str = "".join([f"<|image_{idx}|>\n" for idx in range(1, num_imgs + 1)])
    prompt = f"<|user|>\n{img_str}<|end|>\n<|assistant|>\n"
    mm_data = {"image": [image_assets[0].pil_image] * num_imgs}
-    processed_inputs = processor.apply(prompt, mm_data, mm_processor_kwargs)
+    processed_inputs = processor.apply(prompt, mm_data, hf_processor_mm_kwargs)
    # Ensure we have the right number of placeholders per num_crops size
    img_tok_count = processed_inputs["prompt_token_ids"].count(_IMAGE_TOKEN_ID)