test_interleaved.py 2.68 KB
Newer Older
1
# SPDX-License-Identifier: Apache-2.0
2
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
3
4
5
6
7

import pytest

from vllm.assets.image import ImageAsset
from vllm.assets.video import VideoAsset
8
from vllm.multimodal.image import convert_image_mode
9
10
11
12
13
14
15
16
17
18
19
20

# Model identifier(s) that test_models is parametrized over.
models = ["llava-hf/llava-onevision-qwen2-0.5b-ov-hf"]


def base_prompt(modalities_str: str) -> str:
    """Build a single-turn chat prompt around the given modality placeholders.

    The placeholders are placed at the start of the user turn, followed by a
    fixed instruction, and the prompt ends with an opened assistant turn.
    """
    pieces = (
        "<|im_start|>user ",
        format(modalities_str),
        "\nDescribe what you see from these items.<|im_end|>",
        "<|im_start|>assistant\n",
    )
    return "".join(pieces)


# Both prompts carry two image placeholders and one video placeholder; they
# differ only in order (video between the images vs. both images first).
INTERLEAVED_PROMPT = base_prompt("<image><video><image>\n")
NONINTERLEAVED_PROMPT = base_prompt("<image><image><video>\n")


21
@pytest.mark.core_model
@pytest.mark.parametrize("model", models)
@pytest.mark.parametrize("dtype", ["float16"])
@pytest.mark.parametrize("max_tokens", [128])
def test_models(vllm_runner, model, dtype: str, max_tokens: int) -> None:
    """
    Check that the order of modality placeholders changes the generation.

    The same two images and one video are submitted twice with greedy
    decoding: once with the placeholders interleaved
    (``<image><video><image>``) and once grouped by modality
    (``<image><image><video>``). The prompt portions must have equal length,
    while the generated continuations are expected to differ.
    """
    image_cherry = convert_image_mode(ImageAsset("cherry_blossom").pil_image, "RGB")
    image_stop = convert_image_mode(ImageAsset("stop_sign").pil_image, "RGB")
    images = [image_cherry, image_stop]
    video = VideoAsset(name="baby_reading", num_frames=16).np_ndarrays

    # One (prompts, images, videos) triple per case; each case is a batch of
    # a single prompt that uses both images and the video.
    inputs = [
        (
            [INTERLEAVED_PROMPT],
            [images],
            [video],
        ),
        (
            [NONINTERLEAVED_PROMPT],
            [images],
            [video],
        ),
    ]

    with vllm_runner(
        model,
        runner="generate",
        dtype=dtype,
        limit_mm_per_prompt={"image": 2},
        max_model_len=32768,
        max_num_seqs=2,
        tensor_parallel_size=1,
        enforce_eager=True,
    ) as vllm_model:
        # Distinct loop names avoid shadowing the outer ``images`` local.
        vllm_outputs_per_case = [
            vllm_model.generate_greedy(
                case_prompts,
                max_tokens,
                images=case_images,
                videos=case_videos,
            )
            for case_prompts, case_images, case_videos in inputs
        ]

    # Second element of the first result in each case, used below as the
    # full text (presumably prompt + completion -- confirm against the
    # runner's generate_greedy).
    all_results = [output[0][1] for output in vllm_outputs_per_case]

    # Everything up to and including this marker is treated as the prompt.
    marker = "assistant\n"
    prompt_lengths = []
    generated_strs = []
    for total_str in all_results:
        marker_start = total_str.find(marker)
        # Fail loudly instead of slicing at a meaningless offset when the
        # marker is missing (find() would return -1).
        assert marker_start != -1, (
            f"marker {marker!r} not found in model output: {total_str!r}"
        )
        prompt_len = marker_start + len(marker)
        prompt_lengths.append(prompt_len)
        generated_strs.append(total_str[prompt_len:])

    interleaved_prompt_len, noninterleaved_prompt_len = prompt_lengths
    interleaved_output_str, noninterleaved_output_str = generated_strs

    # The two prompts are identical except for the order of modality tokens.
    assert interleaved_prompt_len == noninterleaved_prompt_len

    # The two generated strings should be different because of the
    # interleaved modality tokens.
    assert interleaved_output_str != noninterleaved_output_str