# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

import itertools
from functools import partial

7
8
import pytest
from PIL import Image
9
from pqdm.threads import pqdm
10

11
from vllm.multimodal import MULTIMODAL_REGISTRY
12
from vllm.multimodal.parse import ImageSize
13
from vllm.multimodal.processing import BaseMultiModalProcessor
14

15
from ...utils import build_model_context
16
17


18
19
20
21
22
23
24
def _validate_image_max_tokens_one(
    processor: BaseMultiModalProcessor,
    max_tokens: int,
    failed_size_excs: list[tuple[ImageSize, Exception]],
    image_size: ImageSize,
) -> None:
    info = processor.info
25
26
27
    feature_size = info.get_num_image_tokens(
        image_width=image_size.width, image_height=image_size.height
    )
28
29
30
31
32
33
34

    try:
        assert feature_size <= max_tokens, f"{feature_size} <= {max_tokens}"
    except Exception as exc:
        failed_size_excs.append((image_size, exc))


35
36
37
38
@pytest.mark.skip(
    "This test takes around 5 minutes to run. Comment this out to run it manually."
)
@pytest.mark.parametrize("model_id", ["llava-hf/llava-onevision-qwen2-0.5b-ov-hf"])
def test_processor_max_tokens(model_id):
    """
    Ensure that no image size yields more image tokens than the maximum
    reported by the processor's ``info.get_max_image_tokens()``.
    """
    ctx = build_model_context(
        model_id,
        mm_processor_kwargs=None,
        limit_mm_per_prompt={"image": 1},
    )
    processor = MULTIMODAL_REGISTRY.create_processor(ctx.model_config)
    info = processor.info

    seen_aspect_ratios: set[float] = set()
    image_sizes: list[ImageSize] = []

    # The aspect ratio of the grid layout is between 1 and 6
    # NOTE: Assumes that feature size calculation is the same if we
    # swap the width and height of the image
    for w, h in itertools.product(range(32, 4096), repeat=2):
        aspect_ratio = w / h
        # Keep only the first size encountered for each distinct ratio
        if 1 <= aspect_ratio <= 6 and aspect_ratio not in seen_aspect_ratios:
            image_sizes.append(ImageSize(w, h))
            seen_aspect_ratios.add(aspect_ratio)

    # Filled in by the workers; one entry per failing image size
    failed_size_excs: list[tuple[ImageSize, Exception]] = []

    validate_one = partial(
        _validate_image_max_tokens_one,
        processor,
        info.get_max_image_tokens(),  # type: ignore
        failed_size_excs,
    )
    pqdm(image_sizes, validate_one, n_jobs=8, desc="Validating image sizes")

    if failed_size_excs:
        # Header on its own line so the first failure is not glued to it
        msg = "Found failing image sizes:\n" + "\n========\n".join(
            f"[{size}]\n{exc}" for size, exc in failed_size_excs
        )
        raise AssertionError(msg)


77
def _validate_image_prompt_replacements_one(
    processor: BaseMultiModalProcessor,
    num_imgs: int,
    failed_size_excs: list[tuple[ImageSize, Exception]],
    image_size: ImageSize,
) -> None:
    """Run the processor on ``num_imgs`` blank images of one size.

    Failures are recorded rather than raised so that the caller can report
    every failing size at once.

    Args:
        processor: The multi-modal processor under test.
        num_imgs: Number of ``<image>`` tokens in the prompt and of
            identical images passed alongside it.
        failed_size_excs: Collector for ``(image_size, exception)`` pairs;
            appended to in place when processing or a check fails.
        image_size: Size of the blank RGB image to generate.
    """
    # The prompt consists of image tokens only, one per image
    prompt = "<image>" * num_imgs
    image = Image.new("RGB", size=image_size)
    mm_data = {"image": [image] * num_imgs}

    try:
        # The processor will throw an error if there is a mismatch
        # in the prompt replacements
        processed_inputs = processor(
            prompt,
            mm_items=processor.info.parse_mm_data(mm_data),
            hf_processor_mm_kwargs={},
        )

        image_placeholders = processed_inputs["mm_placeholders"]["image"]
        assert len(image_placeholders) == num_imgs

        first_placeholder = image_placeholders[0]

        # The first placeholder must start at the very beginning of the
        # prompt and span an equal share of the processed token ids
        assert first_placeholder.offset == 0
        assert (
            first_placeholder.length
            == len(processed_inputs["prompt_token_ids"]) // num_imgs
        )
    except Exception as exc:
        failed_size_excs.append((image_size, exc))


def _test_image_prompt_replacements(
    processor,
    *,
    num_imgs: int,
    image_sizes: list[ImageSize],
) -> None:
    """
    Ensure LlavaOnevisionMultiModalProcessor
    handles prompt replacement properly for input images.

    Every size in ``image_sizes`` is validated with ``num_imgs`` images per
    prompt; all failing sizes are reported together in a single
    ``AssertionError``.
    """
    # Filled in by the workers; one entry per failing image size
    failed_size_excs: list[tuple[ImageSize, Exception]] = []

    validate_one = partial(
        _validate_image_prompt_replacements_one,
        processor,
        num_imgs,
        failed_size_excs,
    )
    pqdm(image_sizes, validate_one, n_jobs=8, desc="Validating image sizes")

    if failed_size_excs:
        # Header on its own line so the first failure is not glued to it
        msg = "Found failing image sizes:\n" + "\n========\n".join(
            f"[{size}]\n{exc}" for size, exc in failed_size_excs
        )
        raise AssertionError(msg)


137
@pytest.mark.parametrize("model_id", ["llava-hf/llava-onevision-qwen2-0.5b-ov-hf"])
@pytest.mark.parametrize("num_imgs", [1, 2])
def test_processor_prompt_replacements_regression(model_id, num_imgs):
    """
    Check prompt replacement on a fixed list of image sizes,
    each tried in both orientations.
    """
    ctx = build_model_context(
        model_id,
        mm_processor_kwargs=None,
        limit_mm_per_prompt={"image": num_imgs},
    )
    processor = MULTIMODAL_REGISTRY.create_processor(ctx.model_config)

    image_ratios = [
        (171, 152),
        (184, 161),
        (198, 176),
        (333, 296),
        (369, 328),
        (488, 183),
        (2560, 1669),
    ]
    # Cover both (w, h) and the swapped (h, w) for every entry
    image_sizes = [
        size for w, h in image_ratios for size in [ImageSize(w, h), ImageSize(h, w)]
    ]

    _test_image_prompt_replacements(
        processor,
        num_imgs=num_imgs,
        image_sizes=image_sizes,
    )
165

166

167
168
169
170
@pytest.mark.skip(
    "This test takes around 2 hours to run. Comment this out to run it manually."
)
@pytest.mark.parametrize("model_id", ["llava-hf/llava-onevision-qwen2-0.5b-ov-hf"])
@pytest.mark.parametrize("num_imgs", [1])
def test_processor_prompt_replacements_all(model_id, num_imgs):
    """
    Check prompt replacement exhaustively, using one image size for each
    distinct aspect ratio between 1 and 6 with sides in ``range(64, 1024)``.
    """
    ctx = build_model_context(
        model_id,
        mm_processor_kwargs=None,
        limit_mm_per_prompt={"image": num_imgs},
    )
    processor = MULTIMODAL_REGISTRY.create_processor(ctx.model_config)

    seen_aspect_ratios: set[float] = set()
    image_sizes: list[ImageSize] = []

    # The aspect ratio of the grid layout is between 1 and 6
    # NOTE: Assumes that feature size calculation is the same if we
    # swap the width and height of the image
    for w, h in itertools.product(range(64, 1024), repeat=2):
        aspect_ratio = w / h
        # Keep only the first size encountered for each distinct ratio
        if 1 <= aspect_ratio <= 6 and aspect_ratio not in seen_aspect_ratios:
            image_sizes.append(ImageSize(w, h))
            seen_aspect_ratios.add(aspect_ratio)

    _test_image_prompt_replacements(
        processor,
        num_imgs=num_imgs,
        image_sizes=image_sizes,
    )