Merge tag 'v0.6.6.post1' into v0.6.6.post1-dev

96ae75ad · zhuwenwen · f9f4a735 · 2339d59f · 96ae75ad · 96ae75ad
Commit 96ae75ad authored Jan 04, 2025 by zhuwenwen
20 changed files
--- a/tests/kv_transfer/test_send_recv.py
+++ b/tests/kv_transfer/test_send_recv.py
@@ -10,39 +10,42 @@ from vllm.distributed.kv_transfer.kv_pipe.pynccl_pipe import PyNcclPipe
 def test_run(my_rank, pipe):
+    """Exchange two fixed tensors over ``pipe`` and verify the round trip.
+
+    Rank 0 sends ``x`` and ``y`` and then receives two tensors; every other
+    rank receives two tensors first and then sends ``x`` and ``y``.  Both
+    sides finally assert that what they received matches what they hold.
+    """
+    print(f"rank {my_rank} test_run starts....")
    # test run
    x = torch.tensor([1]).to(pipe.device)
    y = torch.tensor([[2., 3., 4., 8.]]).to(pipe.device)
    if my_rank == 0:
        pipe.send_tensor(x)
-        print("sent tensor x")
+        print(f"rank {my_rank} sent tensor x")
        pipe.send_tensor(y)
-        print("sent tensor y")
+        print(f"rank {my_rank} sent tensor y")
        x2 = pipe.recv_tensor()
-        print("received x2 = ", x2)
+        print(f"rank {my_rank} received x2 = ", x2)
        y2 = pipe.recv_tensor()
-        print("received y2 = ", x2)
+        # Log the tensor that was just received (y2), not x2 again.
+        print(f"rank {my_rank} received y2 = ", y2)
    else:
        x2 = pipe.recv_tensor()
-        print("received x2 = ", x2)
+        print(f"rank {my_rank} received x2 = ", x2)
        y2 = pipe.recv_tensor()
-        print("received y2 = ", x2)
+        # Log the tensor that was just received (y2), not x2 again.
+        print(f"rank {my_rank} received y2 = ", y2)
        pipe.send_tensor(x)
-        print("sent tensor x")
+        print(f"rank {my_rank} sent tensor x")
        pipe.send_tensor(y)
-        print("sent tensor y")
+        print(f"rank {my_rank} sent tensor y")
    assert torch.allclose(x, x2)
    assert torch.allclose(y, y2)
+    print(f"rank {my_rank} test_run passed!")
-def stress_test(my_rank, pipe):
-    torch.distributed.barrier()
+def stress_test(my_rank, pipe):
+    print(f"rank {my_rank} stress_test starts....")
    tensors: List[torch.Tensor] = []
+    torch.distributed.barrier()
    torch.manual_seed(0)
    for i in tqdm(range(500)):
@@ -86,7 +89,6 @@ def stress_test(my_rank, pipe):
 def latency_test(my_rank, pipe, nelement, ntensor):
    latencies = []
    torch.distributed.barrier()
@@ -149,6 +151,7 @@ if __name__ == "__main__":
    )
    test_run(my_rank, pipe)
    stress_test(my_rank, pipe)
    # Use this function if you want to test the latency of pipe impl.

--- a/tests/kv_transfer/test_send_recv.sh
+++ b/tests/kv_transfer/test_send_recv.sh
 #!/bin/bash
 RANK=0 python3 test_send_recv.py &
-RANK=1 python3 test_send_recv.py &
+PID0=$!
\ No newline at end of file
+RANK=1 python3 test_send_recv.py &
+PID1=$!
+wait $PID0
+wait $PID1
--- a/tests/lora/conftest.py
+++ b/tests/lora/conftest.py
@@ -208,6 +208,11 @@ def minicpmv_lora_files():
    return snapshot_download(repo_id="jeeejeee/minicpmv25-lora-pokemon")
+@pytest.fixture(scope="session")
+def qwen2vl_lora_files():
+    return snapshot_download(repo_id="jeeejeee/qwen2-vl-lora-pokemon")
 @pytest.fixture(scope="session")
 def tinyllama_lora_files():
    # return snapshot_download(repo_id="jashing/tinyllama-colorist-lora")

--- a/tests/lora/test_lora_checkpoints.py
+++ b/tests/lora/test_lora_checkpoints.py
@@ -4,6 +4,7 @@ import pytest
 from vllm.lora.models import LoRAModel
 from vllm.model_executor.models.baichuan import BaiChuanBaseForCausalLM
+from vllm.model_executor.models.utils import WeightsMapper
 lora_lst = [
    "baichuan7B", "baichuan7B-zero", "baichuan7B-zero-regex", "chatglm3-6b"
@@ -71,3 +72,37 @@ def test_load_checkpoints(
                device="cpu",
                embedding_modules=embedding_modules,
                embedding_padding_modules=embed_padding_modules)
+def test_lora_weights_mapping(baichuan_lora_files):
+    supported_lora_modules = BaiChuanBaseForCausalLM.supported_lora_modules
+    packed_modules_mapping = BaiChuanBaseForCausalLM.packed_modules_mapping
+    embedding_modules = BaiChuanBaseForCausalLM.embedding_modules
+    embed_padding_modules = BaiChuanBaseForCausalLM.embedding_padding_modules
+    expected_lora_modules: List[str] = []
+    for module in supported_lora_modules:
+        if module in packed_modules_mapping:
+            expected_lora_modules.extend(packed_modules_mapping[module])
+        else:
+            expected_lora_modules.append(module)
+    hf_to_vllm_mapper = WeightsMapper(
+        orig_to_new_prefix={
+            "model.": "language_model.model.",
+        },
+        orig_to_new_substr={
+            ".layers.": ".baichuan_layers.",
+        },
+    )
+    lora_model = LoRAModel.from_local_checkpoint(
+        baichuan_lora_files,
+        expected_lora_modules,
+        lora_model_id=1,
+        device="cpu",
+        embedding_modules=embedding_modules,
+        embedding_padding_modules=embed_padding_modules,
+        weights_mapper=hf_to_vllm_mapper,
+    )
+    for name in lora_model.loras:
+        assert name.startswith(hf_to_vllm_mapper.orig_to_new_prefix["model."])
+        assert ".baichuan_layers." in name
--- a/tests/lora/test_minicpmv.py
+++ b/tests/lora/test_minicpmv.py
@@ -69,7 +69,6 @@ def test_minicpmv_lora(minicpmv_lora_files):
        max_loras=4,
        max_lora_rank=64,
        trust_remote_code=True,
-        gpu_memory_utilization=0.97,  # This model is pretty big for CI gpus
        enable_chunked_prefill=True,
    )
    output1 = do_sample(llm, minicpmv_lora_files, lora_id=1)

--- a/tests/lora/test_mixtral.py
+++ b/tests/lora/test_mixtral.py
@@ -64,8 +64,9 @@ def test_mixtral_lora(mixtral_lora_files, tp_size):
 @pytest.mark.parametrize("tp_size", [4])
+@pytest.mark.parametrize("fully_shard", [True, False])
 def test_mixtral_lora_all_target_modules(mixtral_lora_files_all_target_modules,
-                                         tp_size):
+                                         tp_size, fully_shard):
    """This LoRA model has all supported Mixtral target modules"""
    if torch.cuda.device_count() < tp_size:
@@ -84,6 +85,7 @@ def test_mixtral_lora_all_target_modules(mixtral_lora_files_all_target_modules,
        max_loras=4,
        distributed_executor_backend="ray",
        tensor_parallel_size=tp_size,
+        fully_sharded_loras=fully_shard,
        max_lora_rank=32,
    )

--- a/tests/lora/test_qwen2vl.py
+++ b/tests/lora/test_qwen2vl.py
+from typing import List
+import pytest
+import vllm
+from vllm.assets.image import ImageAsset
+from vllm.lora.request import LoRARequest
+from vllm.platforms import current_platform
+MODEL_PATH = "Qwen/Qwen2-VL-7B-Instruct"
+PROMPT_TEMPLATE = (
+    "<|im_start|>system\nYou are a helpful assistant.<|im_end|>"
+    "\n<|im_start|>user\n<|vision_start|><|image_pad|><|vision_end|>"
+    "What is in the image?<|im_end|>\n"
+    "<|im_start|>assistant\n")
+IMAGE_ASSETS = [
+    ImageAsset("stop_sign"),
+    ImageAsset("cherry_blossom"),
+]
+# After fine-tuning with LoRA, all generated content should start with `A`.
+EXPECTED_OUTPUT = [
+    "A red stop sign stands prominently in the foreground, with a traditional Chinese gate and a black SUV in the background, illustrating a blend of modern and cultural elements.",  # noqa: E501
+    "A majestic skyscraper stands tall, partially obscured by a vibrant canopy of cherry blossoms, against a clear blue sky.",  # noqa: E501
+]
+def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> List[str]:
+    sampling_params = vllm.SamplingParams(
+        temperature=0,
+        max_tokens=5,
+    )
+    inputs = [{
+        "prompt": PROMPT_TEMPLATE,
+        "multi_modal_data": {
+            "image": asset.pil_image
+        },
+    } for asset in IMAGE_ASSETS]
+    outputs = llm.generate(
+        inputs,
+        sampling_params,
+        lora_request=LoRARequest(str(lora_id), lora_id, lora_path)
+        if lora_id else None,
+    )
+    # Print the outputs.
+    generated_texts: List[str] = []
+    for output in outputs:
+        prompt = output.prompt
+        generated_text = output.outputs[0].text.strip()
+        generated_texts.append(generated_text)
+        print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
+    return generated_texts
+@pytest.mark.xfail(current_platform.is_rocm(),
+                   reason="Qwen2-VL dependency xformers incompatible with ROCm"
+                   )
+def test_qwen2vl_lora(qwen2vl_lora_files):
+    """Check that the Qwen2-VL LoRA adapter produces the expected captions.
+
+    The same adapter files are sampled under LoRA ids 1 and 2.  Each
+    generated text must be a non-empty prefix of the matching
+    ``EXPECTED_OUTPUT`` entry.
+    """
+    llm = vllm.LLM(
+        MODEL_PATH,
+        max_num_seqs=2,
+        enable_lora=True,
+        max_loras=2,
+        max_lora_rank=16,
+        trust_remote_code=True,
+        mm_processor_kwargs={
+            "min_pixels": 28 * 28,
+            "max_pixels": 1280 * 28 * 28,
+        },
+        max_model_len=4096,
+    )
+    for lora_id in (1, 2):
+        generated = do_sample(llm, qwen2vl_lora_files, lora_id=lora_id)
+        # zip() stops at the shorter input, so pin the count first.
+        assert len(generated) == len(EXPECTED_OUTPUT)
+        for expected, actual in zip(EXPECTED_OUTPUT, generated):
+            # An empty string is a prefix of every string; reject it so a
+            # model that emits nothing cannot pass the prefix check.
+            assert actual, f"LoRA {lora_id} produced no text"
+            assert expected.startswith(actual)
--- a/tests/model_executor/test_guided_processors.py
+++ b/tests/model_executor/test_guided_processors.py
+import pickle
 import pytest
 import os
 import torch
 from transformers import AutoTokenizer
+from vllm.config import ModelConfig
 from vllm.model_executor.guided_decoding import (
-    get_guided_decoding_logits_processor)
+    get_guided_decoding_logits_processor,
+    get_local_guided_decoding_logits_processor)
 from vllm.model_executor.guided_decoding.outlines_logits_processors import (
    JSONLogitsProcessor, RegexLogitsProcessor)
 from vllm.sampling_params import GuidedDecodingParams
 from ..utils import models_path_prefix
+MODEL_NAME = os.path.join(models_path_prefix, 'HuggingFaceH4/zephyr-7b-beta')
+GUIDED_DECODING_BACKENDS = ["outlines", "lm-format-enforcer", "xgrammar"]
 def test_guided_logits_processors(sample_regex, sample_json_schema):
    """Basic unit test for RegexLogitsProcessor and JSONLogitsProcessor."""
@@ -39,16 +46,30 @@ def test_guided_logits_processors(sample_regex, sample_json_schema):
 @pytest.mark.asyncio
-@pytest.mark.parametrize("backend",
+@pytest.mark.parametrize("backend", GUIDED_DECODING_BACKENDS)
-                         ["outlines", "lm-format-enforcer", "xgrammar"])
+@pytest.mark.parametrize("is_local", [True, False])
-async def test_guided_logits_processor_black_box(backend: str, sample_regex,
+async def test_guided_logits_processor_black_box(backend: str, is_local: bool,
+                                                 sample_regex,
                                                 sample_json_schema):
-    tokenizer = AutoTokenizer.from_pretrained(os.path.join(models_path_prefix, 'HuggingFaceH4/zephyr-7b-beta'))
+    config = ModelConfig(
+        MODEL_NAME,
+        task="generate",
+        tokenizer=MODEL_NAME,
+        tokenizer_mode="auto",
+        trust_remote_code=False,
+        seed=0,
+        dtype="bfloat16",
+    )
+    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
    token_ids = tokenizer.encode(
        f"Give an example IPv4 address with this regex: {sample_regex}")
    regex_request = GuidedDecodingParams(regex=sample_regex, backend=backend)
-    regex_lp = await get_guided_decoding_logits_processor(
-        regex_request, tokenizer)
+    regex_lp = get_local_guided_decoding_logits_processor(
+            regex_request, tokenizer, config) if is_local else \
+            await get_guided_decoding_logits_processor(
+                    regex_request, tokenizer, config)
    assert regex_lp is not None
    tensor = torch.rand(32000)
    original_tensor = torch.clone(tensor)
@@ -62,7 +83,7 @@ async def test_guided_logits_processor_black_box(backend: str, sample_regex,
    json_request = GuidedDecodingParams(json=sample_json_schema,
                                        backend=backend)
    json_lp = await get_guided_decoding_logits_processor(
-        json_request, tokenizer)
+        json_request, tokenizer, config)
    assert json_lp is not None
    tensor = torch.rand(32000)
    original_tensor = torch.clone(tensor)
@@ -87,3 +108,24 @@ def test_multiple_guided_options_not_allowed(sample_json_schema, sample_regex):
    with pytest.raises(ValueError,
                       match="You can only use one kind of guided"):
        GuidedDecodingParams(json=sample_json_schema, grammar="test grammar")
+def test_pickle_xgrammar_tokenizer_data():
+    # TODO: move to another test file for xgrammar
+    try:
+        import xgrammar as xgr
+    except ImportError:
+        pytest.skip("Could not import xgrammar to run test")
+    from vllm.model_executor.guided_decoding.xgrammar_decoding import (
+        TokenizerData)
+    tokenizer_data = TokenizerData(vocab_type=xgr.VocabType.RAW)
+    pickled = pickle.dumps(tokenizer_data)
+    assert pickled is not None
+    depickled: TokenizerData = pickle.loads(pickled)
+    assert depickled is not None
+    assert depickled.vocab_type == xgr.VocabType.RAW
--- a/tests/models/decoder_only/audio_language/test_ultravox.py
+++ b/tests/models/decoder_only/audio_language/test_ultravox.py
@@ -7,6 +7,7 @@ import os
 import pytest_asyncio
 from transformers import AutoModel, AutoTokenizer, BatchEncoding
+from vllm.multimodal.audio import resample_audio
 from vllm.sequence import SampleLogprobs
 from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE
@@ -133,16 +134,14 @@ def run_test(
                   dtype=dtype,
                   postprocess_inputs=process,
                   auto_cls=AutoModel) as hf_model:
-        import librosa
        hf_outputs_per_audio = [
            hf_model.generate_greedy_logprobs_limit(
                [hf_prompt],
                max_tokens,
                num_logprobs=num_logprobs,
-                audios=[(librosa.resample(audio[0],
+                audios=[(resample_audio(audio[0],
-                                          orig_sr=audio[1],
+                                        orig_sr=audio[1],
-                                          target_sr=16000), 16000)])
+                                        target_sr=16000), 16000)])
            for _, hf_prompt, audio in prompts_and_audios
        ]

--- a/tests/models/decoder_only/language/test_mistral.py
+++ b/tests/models/decoder_only/language/test_mistral.py
@@ -3,19 +3,22 @@
 Run `pytest tests/models/test_mistral.py`.
 """
 import copy
+import json
+import jsonschema
+import jsonschema.exceptions
 import pytest
 import os
-from vllm import SamplingParams
 from vllm.entrypoints.openai.tool_parsers.mistral_tool_parser import (  # noqa
    MistralToolParser)
+from vllm.sampling_params import GuidedDecodingParams, SamplingParams
 from ...utils import check_logprobs_close
 from ....utils import models_path_prefix
 MODELS = [
-    os.path.join(models_path_prefix, "mistralai/Mistral-7B-Instruct-v0.1"),
+    os.path.join(models_path_prefix, "mistralai/Mistral-7B-Instruct-v0.3"),
 ]
 MISTRAL_FORMAT_MODELS = [
@@ -128,6 +131,45 @@ MSGS = [
    }
 ]
+SAMPLE_JSON_SCHEMA = {
+    "type": "object",
+    "properties": {
+        "name": {
+            "type": "string"
+        },
+        "age": {
+            "type": "integer"
+        },
+        "skills": {
+            "type": "array",
+            "items": {
+                "type": "string",
+                "maxLength": 10
+            },
+            "minItems": 3
+        },
+        "work_history": {
+            "type": "array",
+            "items": {
+                "type": "object",
+                "properties": {
+                    "company": {
+                        "type": "string"
+                    },
+                    "duration": {
+                        "type": "number"
+                    },
+                    "position": {
+                        "type": "string"
+                    }
+                },
+                "required": ["company", "position"]
+            }
+        }
+    },
+    "required": ["name", "age", "skills", "work_history"]
+}
 @pytest.mark.parametrize("model", MODELS)
 @pytest.mark.parametrize("dtype", ["bfloat16"])
@@ -253,3 +295,43 @@ def test_mistral_function_calling(
        assert parsed_message.tool_calls[
            0].function.arguments == '{"city": "Dallas", "state": "TX", "unit": "fahrenheit"}'  # noqa
        assert parsed_message.content is None
+@pytest.mark.parametrize("model", MODELS)
+@pytest.mark.parametrize("guided_backend",
+                         ["outlines", "lm-format-enforcer", "xgrammar"])
+def test_mistral_guided_decoding(
+    vllm_runner,
+    model: str,
+    guided_backend: str,
+) -> None:
+    """Check that JSON-guided chat output validates against the schema.
+
+    Runs one chat request with ``SAMPLE_JSON_SCHEMA`` as the guided-decoding
+    constraint for each backend, parses the generated text as JSON and
+    validates it with ``jsonschema``.
+    """
+    with vllm_runner(model, dtype='bfloat16',
+                     tokenizer_mode="mistral") as vllm_model:
+        guided_decoding = GuidedDecodingParams(json=SAMPLE_JSON_SCHEMA,
+                                               backend=guided_backend)
+        params = SamplingParams(max_tokens=512,
+                                temperature=0.7,
+                                guided_decoding=guided_decoding)
+        messages = [{
+            "role": "system",
+            "content": "you are a helpful assistant"
+        }, {
+            "role":
+            "user",
+            "content":
+            f"Give an example JSON for an employee profile that "
+            f"fits this schema: {SAMPLE_JSON_SCHEMA}"
+        }]
+        outputs = vllm_model.model.chat(messages, sampling_params=params)
+        # Check the result before indexing into it; asserting after the
+        # dereference below could never fail.
+        assert outputs is not None
+        generated_text = outputs[0].outputs[0].text
+        json_response = json.loads(generated_text)
+        try:
+            jsonschema.validate(instance=json_response,
+                                schema=SAMPLE_JSON_SCHEMA)
+        except jsonschema.exceptions.ValidationError:
+            pytest.fail("Generated response is not valid with JSON schema")
--- a/tests/models/decoder_only/vision_language/mm_processor_kwargs/test_phi3v.py
+++ b/tests/models/decoder_only/vision_language/mm_processor_kwargs/test_phi3v.py
@@ -60,16 +60,14 @@ def test_max_tokens_override(get_max_phi3v_image_tokens, model: str,
 @pytest.mark.parametrize("model", models)
 @pytest.mark.parametrize(
-    "num_crops,expected_toks_per_img,num_imgs",
+    "num_crops,expected_toks_per_img",
    [
-        (4, 757, 1),
+        (4, 757),
-        (4, 757, 2),
+        (16, 1921),
-        (16, 1921, 1),
-        (16, 1921, 2),
        # the default num_crops of phi-3.5-vision is 4
-        (None, 757, 2),
+        (None, 757),
-        (None, 757, 2),
    ])
+@pytest.mark.parametrize("num_imgs", [1, 2])
 def test_processor_override(processor_for_phi3v, image_assets: _ImageAssets,
                            model: str, num_crops: Optional[int],
                            expected_toks_per_img: int, num_imgs: int):

--- a/tests/models/decoder_only/vision_language/mm_processor_kwargs/test_qwen2_vl.py
+++ b/tests/models/decoder_only/vision_language/mm_processor_kwargs/test_qwen2_vl.py
@@ -2,12 +2,9 @@ from typing import Any, Dict, Tuple
 import os
 import pytest
-import torch
-from PIL.Image import Image
 from transformers import AutoTokenizer
-from vllm.inputs import InputContext, token_inputs
+from vllm.inputs import InputContext, InputProcessingContext
-from vllm.multimodal import MultiModalRegistry
 from .....conftest import _ImageAssets
 from ....utils import build_model_context
@@ -22,22 +19,9 @@ MAX_PIXELS = "max_pixels"
 # NOTE: Qwen2VL supports multiple input modalities, so it registers multiple
 # input mappers.
 @pytest.fixture()
-def image_input_mapper_for_qwen2_vl():
+def processor_for_qwen2_vl():
-    from vllm.model_executor.models.qwen2_vl import (
+    from vllm.model_executor.models.qwen2_vl import Qwen2VLMultiModalProcessor
-        image_input_mapper_for_qwen2_vl)
+    return Qwen2VLMultiModalProcessor
-    return image_input_mapper_for_qwen2_vl
-@pytest.fixture()
-def input_processor_for_qwen2_vl():
-    from vllm.model_executor.models.qwen2_vl import (
-        input_processor_for_qwen2_vl)
-    return input_processor_for_qwen2_vl
-@pytest.fixture()
-def qwen2_vl_context() -> InputContext:
-    return build_model_context(model_name=MODEL)
 @pytest.fixture()
@@ -47,12 +31,6 @@ def get_max_qwen2_vl_image_tokens():
    return get_max_qwen2_vl_image_tokens
-@pytest.fixture()
-def dummy_data_for_qwen2_vl():
-    from vllm.model_executor.models.qwen2_vl import dummy_data_for_qwen2_vl
-    return dummy_data_for_qwen2_vl
 @pytest.mark.parametrize("mm_processor_kwargs,expected_max_tokens", [
    ({}, 1225),
    ({
@@ -60,110 +38,70 @@ def dummy_data_for_qwen2_vl():
        MAX_PIXELS: 512**2
    }, 324),
 ])
-def test_qwen2_vl_max_image_tokens(get_max_qwen2_vl_image_tokens,
+@pytest.mark.parametrize("model", [MODEL])
-                                   qwen2_vl_context: InputContext,
+def test_qwen2_vl_max_image_tokens(
-                                   mm_processor_kwargs: Dict[str, Any],
+    get_max_qwen2_vl_image_tokens,
-                                   expected_max_tokens: int):
+    model: str,
+    mm_processor_kwargs: Dict[str, Any],
+    expected_max_tokens: int,
+):
    """Ensure that the max token calc handles min/max pixels properly."""
-    actual_max_tokens = get_max_qwen2_vl_image_tokens(qwen2_vl_context,
+    ctx = build_model_context(
-                                                      **mm_processor_kwargs)
+        model_name=model,
-    assert actual_max_tokens == expected_max_tokens
+        tokenizer_name=model,
+        mm_processor_kwargs=None,
-@pytest.mark.parametrize("mm_processor_kwargs,token_count,img_size", [
-    [{}, 1225, (980, 980)],
-    [{
-        MIN_PIXELS: 64**2,
-        MAX_PIXELS: 512**2
-    }, 324, (504, 504)],
-])
-def test_qwen2_vl_dummy_data(dummy_data_for_qwen2_vl,
-                             qwen2_vl_context: InputContext,
-                             mm_processor_kwargs: Dict[str, Any],
-                             token_count: int, img_size: Tuple[int, int]):
-    """Ensure that the dummy data handles min/max pixels properly."""
-    seq_len = 3000
-    hf_config = qwen2_vl_context.get_hf_config()
-    image_token_id = hf_config.image_token_id
-    # NOTE: video value is required, but isn't actually used
-    # when making the dummy data except for error handling currently
-    dummy_data = dummy_data_for_qwen2_vl(
-        ctx=qwen2_vl_context,
-        seq_len=seq_len,
-        mm_counts={
-            "image": 1,
-            "video": 0
-        },
-        **mm_processor_kwargs,
    )
-    seq_data = dummy_data.seq_data
-    mm_data = dummy_data.multi_modal_data
-    # Ensure we have the right number of placeholders for min/max pixel values
-    assert seq_data.get_token_ids().count(image_token_id) == token_count
-    # Ensure the images were resized correctly
+    actual_max_tokens = get_max_qwen2_vl_image_tokens(
-    image = mm_data["image"]
+        InputContext(ctx.model_config), **mm_processor_kwargs)
-    assert isinstance(image, Image)
+    assert actual_max_tokens == expected_max_tokens
-    assert image.size == img_size
-@pytest.mark.parametrize("mm_processor_kwargs,num_placeholders", [
+@pytest.mark.parametrize(
-    ({}, 1426),
+    "mm_processor_kwargs, expected_toks_per_img, expected_pixels_shape", [
-    ({
+        ({}, 1426, (5704, 1176)),
-        MIN_PIXELS: 64**2,
+        ({
-        MAX_PIXELS: 512**2
+            MIN_PIXELS: 64**2,
-    }, 330),
+            MAX_PIXELS: 512**2
-])
+        }, 330, (1320, 1176)),
-def test_input_processor(input_processor_for_qwen2_vl,
+    ])
-                         qwen2_vl_context: InputContext,
+@pytest.mark.parametrize("model", [MODEL])
-                         image_assets: _ImageAssets, num_placeholders: int,
+@pytest.mark.parametrize("num_imgs", [1, 2])
-                         mm_processor_kwargs: Dict[str, Any]):
+def test_processor_override(
-    """Ensure that the image processor handles min/max pixels properly."""
+    processor_for_qwen2_vl,
-    tokenizer = AutoTokenizer.from_pretrained(MODEL)
+    image_assets: _ImageAssets,
-    prompt = "<|vision_start|><|image_pad|><|vision_end|>"
+    model: str,
+    mm_processor_kwargs: Dict[str, Any],
-    image = image_assets[0].pil_image
+    expected_toks_per_img: int,
-    hf_config = qwen2_vl_context.get_hf_config()
+    expected_pixels_shape: Tuple[int, int],
-    image_token_id = hf_config.image_token_id
+    num_imgs: int,
+):
-    inputs = token_inputs(prompt_token_ids=tokenizer.encode(prompt),
+    """Ensure Qwen2VLMultiModalProcessor handles min/max pixels properly."""
-                          prompt=prompt,
+    # Same as the previous test - don't initialize mm_processor_kwargs
-                          multi_modal_data={"image": [image]})
+    # in this test and assume that the kwargs will be correctly expanded by
+    # the partial when calling the custom input processor.
-    processed_inputs = input_processor_for_qwen2_vl(qwen2_vl_context, inputs,
+    ctx = build_model_context(
-                                                    **mm_processor_kwargs)
+        model_name=model,
-    assert processed_inputs["prompt_token_ids"].count(
+        tokenizer_name=model,
-        image_token_id) == num_placeholders
+        mm_processor_kwargs=None,
-    assert len(processed_inputs["multi_modal_data"]["image"]) == 1
-@pytest.mark.parametrize("mm_processor_kwargs,pixels_shape", [
-    ({}, [5704, 1176]),
-    ({
-        MIN_PIXELS: 64**2,
-        MAX_PIXELS: 512**2
-    }, [1320, 1176]),
-])
-def test_image_mapper_override(qwen2_vl_context: InputContext,
-                               image_assets: _ImageAssets,
-                               mm_processor_kwargs: Dict[str, Any],
-                               pixels_shape: Tuple[int, int]):
-    """Ensure that the image mapper handles min/max pixels properly."""
-    mm_registry = MultiModalRegistry()
-    mm_registry.init_mm_limits_per_prompt(qwen2_vl_context.model_config)
-    image = image_assets[0].pil_image
-    mapped_output = mm_registry.map_input(
-        qwen2_vl_context.model_config,
-        {"image": image},
-        mm_processor_kwargs=mm_processor_kwargs,
    )
+    tokenizer = AutoTokenizer.from_pretrained(model, trust_remote_code=True)
-    # Dimension 0 of pixel values should match the product of image_grid_thw
+    ctx = InputProcessingContext(ctx.model_config, tokenizer)
-    actual_pixels_shape = mapped_output["pixel_values"].shape
+    # Build the image str / prompt based on the number of images we pass
-    assert list(actual_pixels_shape) == pixels_shape
+    prompt = "<|vision_start|><|image_pad|><|vision_end|>" * num_imgs
-    assert actual_pixels_shape[0] == torch.prod(
+    images = [image_assets[0].pil_image] * num_imgs
-        mapped_output["image_grid_thw"])
+    mm_data = {"image": images}
+    processor = processor_for_qwen2_vl(ctx)
+    processed_inputs = processor.apply(prompt, mm_data, mm_processor_kwargs)
+    # Ensure we have the right number of placeholders for the min/max pixels
+    hf_processor = processor._get_hf_processor(**mm_processor_kwargs)
+    image_token_id = tokenizer.convert_tokens_to_ids(hf_processor.image_token)
+    img_tok_count = processed_inputs["prompt_token_ids"].count(image_token_id)
+    pixel_shape = processed_inputs["mm_kwargs"]["pixel_values"].shape
+    assert img_tok_count == expected_toks_per_img * num_imgs
+    assert pixel_shape[0] == expected_pixels_shape[0] * num_imgs
+    assert pixel_shape[1] == expected_pixels_shape[1]
--- a/tests/models/decoder_only/vision_language/test_awq.py
+++ b/tests/models/decoder_only/vision_language/test_awq.py
@@ -4,7 +4,7 @@ import os
 import pytest
 import torch
-from vllm.multimodal.utils import rescale_image_size
+from vllm.multimodal.image import rescale_image_size
 from ....conftest import IMAGE_ASSETS, VllmRunner, _ImageAssets
 from ...utils import check_logprobs_close

--- a/tests/models/decoder_only/vision_language/test_h2ovl.py
+++ b/tests/models/decoder_only/vision_language/test_h2ovl.py
@@ -9,7 +9,7 @@ from transformers import AutoConfig
 # Import the functions to test
 from vllm.model_executor.models.h2ovl import (calculate_num_blocks,
                                              image_to_pixel_values_wrapper)
-from vllm.multimodal.utils import rescale_image_size
+from vllm.multimodal.image import rescale_image_size
 from ....utils import models_path_prefix
 models = [

--- a/tests/models/decoder_only/vision_language/test_phi3v.py
+++ b/tests/models/decoder_only/vision_language/test_phi3v.py
@@ -5,7 +5,7 @@ from typing import List, Optional, Tuple, Type
 import pytest
 from transformers import AutoTokenizer
-from vllm.multimodal.utils import rescale_image_size
+from vllm.multimodal.image import rescale_image_size
 from vllm.platforms import current_platform
 from vllm.sequence import SampleLogprobs

--- a/tests/models/decoder_only/vision_language/test_qwen2_vl.py
+++ b/tests/models/decoder_only/vision_language/test_qwen2_vl.py
@@ -7,8 +7,8 @@ import torch
 from PIL import Image
 from vllm.entrypoints.llm import LLM
-from vllm.multimodal.utils import (rescale_image_size, rescale_video_size,
+from vllm.multimodal.image import rescale_image_size
-                                   sample_frames_from_video)
+from vllm.multimodal.video import rescale_video_size, sample_frames_from_video
 from ....conftest import (IMAGE_ASSETS, VIDEO_ASSETS, PromptImageInput,
                          PromptVideoInput, VllmRunner)

--- a/tests/models/decoder_only/vision_language/vlm_utils/builders.py
+++ b/tests/models/decoder_only/vision_language/vlm_utils/builders.py
@@ -5,8 +5,9 @@ from typing import Callable, Iterable, List, Optional, Tuple, Union
 import torch
-from vllm.multimodal.utils import (rescale_image_size, rescale_video_size,
+from vllm.multimodal.image import rescale_image_size
-                                   resize_video, sample_frames_from_video)
+from vllm.multimodal.video import (rescale_video_size, resize_video,
+                                   sample_frames_from_video)
 from .....conftest import _ImageAssets, _VideoAssets
 from .types import (SINGLE_IMAGE_BASE_PROMPTS, TEST_IMG_PLACEHOLDER,

--- a/tests/models/decoder_only/vision_language/vlm_utils/custom_inputs.py
+++ b/tests/models/decoder_only/vision_language/vlm_utils/custom_inputs.py
 """Custom input builders for edge-cases in different models."""
 from typing import Callable
-from vllm.multimodal.utils import (rescale_image_size, rescale_video_size,
+from vllm.multimodal.image import rescale_image_size
-                                   resize_video, sample_frames_from_video)
+from vllm.multimodal.video import (rescale_video_size, resize_video,
+                                   sample_frames_from_video)
 from .....conftest import IMAGE_ASSETS, VIDEO_ASSETS
 from .builders import build_multi_image_inputs, build_single_image_inputs

--- a/tests/models/embedding/language/test_cls_models.py
+++ b/tests/models/embedding/language/test_cls_models.py
-"""Compare the outputs of HF and vLLM when using greedy sampling.
+"""Compare the classification outputs of HF and vLLM models.
-This test only tests small models. Big models such as 7B should be tested from
-test_big_models.py because it could use a larger instance to run tests.
 Run `pytest tests/models/test_cls_models.py`.
 """

--- a/tests/models/embedding/language/test_scoring.py
+++ b/tests/models/embedding/language/test_scoring.py
-"""Compare the embedding outputs of HF and vLLM models.
+"""Compare the scoring outputs of HF and vLLM models.
-Run `pytest tests/models/embedding/language/test_embedding.py`.
+Run `pytest tests/models/embedding/language/test_scoring.py`.
 """
 import math
 import os