# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import pytest

from vllm.assets.image import ImageAsset
6
from vllm.config import ModelConfig
7
8
9
from vllm.multimodal import MULTIMODAL_REGISTRY


10
@pytest.mark.parametrize("model_id", ["llava-hf/llava-onevision-qwen2-0.5b-ov-hf"])
def test_multimodal_processor(model_id: str) -> None:
    """Check that a text prompt and a token-ID prompt are processed alike.

    A multimodal processor is created for ``model_id`` with
    ``model_impl="transformers"``. The same image is then submitted twice:
    once alongside a string prompt and once alongside a list of token IDs.
    The test passes when both calls yield identical ``prompt_token_ids``.

    Args:
        model_id: Model identifier passed to ``ModelConfig`` (supplied by
            the ``parametrize`` decorator).
    """
    model_config = ModelConfig(
        model=model_id,
        model_impl="transformers",
    )

    mm_processor = MULTIMODAL_REGISTRY.create_processor(model_config)

    image_pil = ImageAsset("cherry_blossom").pil_image
    mm_data = {"image": image_pil}

    # Path 1: prompt given as text containing one image placeholder.
    str_prompt = "<|im_start|>user <image>\nWhat is the content of this image?<|im_end|><|im_start|>assistant\n"  # noqa: E501
    str_processed_inputs = mm_processor.apply(
        prompt=str_prompt,
        mm_items=mm_processor.info.parse_mm_data(mm_data),
        hf_processor_mm_kwargs={},
    )

    # Path 2: prompt given as token IDs.
    # NOTE(review): presumably the tokenization of `str_prompt` for this
    # model's tokenizer -- confirm if the model or tokenizer changes.
    ids_prompt = [
        151644,
        872,
        220,
        151646,
        198,
        3838,
        374,
        279,
        2213,
        315,
        419,
        2168,
        30,
        151645,
        151644,
        77091,
        198,
    ]
    # The multimodal items are parsed afresh for the second call rather
    # than reusing the object handed to the first call.
    ids_processed_inputs = mm_processor.apply(
        prompt=ids_prompt,
        mm_items=mm_processor.info.parse_mm_data(mm_data),
        hf_processor_mm_kwargs={},
    )

    assert (
        str_processed_inputs["prompt_token_ids"]
        == ids_processed_inputs["prompt_token_ids"]
    )