"vscode:/vscode.git/clone" did not exist on "59edd0f1340a85d585a477fc687a16cfc5cf5276"
test_llava_next.py 4.16 KB
Newer Older
1
2
3
import itertools
from functools import partial

4
5
import pytest
from PIL import Image
6
from pqdm.threads import pqdm
7

8
from vllm.multimodal import MULTIMODAL_REGISTRY
9
from vllm.multimodal.parse import ImageSize
10
11
from vllm.multimodal.processing import BaseMultiModalProcessor
from vllm.multimodal.utils import cached_get_tokenizer
12

13
from ...utils import build_model_context
14
15


16
def _validate_image_prompt_replacements_one(
    processor: BaseMultiModalProcessor,
    num_imgs: int,
    failed_size_excs: list[tuple[ImageSize, Exception]],
    image_size: ImageSize,
) -> None:
    """
    Check the image placeholders produced for a single image size.

    Builds a prompt made of ``num_imgs`` ``<image>`` tags, pairs it with
    ``num_imgs`` references to one blank RGB image of ``image_size``, runs
    both through ``processor.apply`` and asserts on the returned image
    placeholders.

    Exceptions raised inside the check (processor errors as well as failed
    assertions) are not propagated. Each one is appended to
    ``failed_size_excs`` together with ``image_size`` so that the caller can
    report every failing size at once.

    Args:
        processor: The multi-modal processor under test.
        num_imgs: Number of images (and ``<image>`` tags) in the prompt.
        failed_size_excs: Output list that collects
            ``(image_size, exception)`` pairs for failing sizes.
        image_size: Size of the blank test image.
    """
    prompt = "<image>" * num_imgs
    image = Image.new("RGB", size=image_size)
    mm_data = {"image": [image] * num_imgs}

    try:
        # The processor will throw an error if there is a mismatch
        # in the prompt replacements
        processed_inputs = processor.apply(prompt, mm_data, {})

        image_placeholders = processed_inputs["mm_placeholders"]["image"]
        assert len(image_placeholders) == num_imgs

        first_placeholder = image_placeholders[0]

        # NOTE: There is a BOS token
        assert first_placeholder["offset"] == 1
        # Every token after the BOS token is expected to belong to an image
        # placeholder, split evenly across the images.
        assert first_placeholder["length"] == (
            len(processed_inputs["prompt_token_ids"]) - 1) // num_imgs

    except Exception as exc:
        # Deliberately broad: record the failure instead of aborting so the
        # remaining image sizes are still validated.
        failed_size_excs.append((image_size, exc))


def _test_image_prompt_replacements(
    processor,
    *,
    num_imgs: int,
    image_sizes: list[ImageSize],
) -> None:
    """
    Ensure LlavaNextMultiModalProcessor
    handles prompt replacement properly for input images.

    Every entry of ``image_sizes`` is validated with
    ``_validate_image_prompt_replacements_one`` through the thread-based
    ``pqdm`` runner (8 jobs). Failures are collected rather than raised
    immediately, so a single run reports all failing sizes.

    Args:
        processor: The multi-modal processor under test.
        num_imgs: Number of images to place in each prompt.
        image_sizes: Image sizes to validate.

    Raises:
        AssertionError: If at least one image size failed validation. The
            message lists each failing size followed by its exception.
    """
    # Shared output list; each worker appends its own failures to it.
    failed_size_excs = list[tuple[ImageSize, Exception]]()

    # Bind everything except the image size, which pqdm supplies per item.
    validate_one = partial(
        _validate_image_prompt_replacements_one,
        processor,
        num_imgs,
        failed_size_excs,
    )
    pqdm(image_sizes, validate_one, n_jobs=8, desc="Validating image sizes")

    if failed_size_excs:
        # The header ends with a newline so that the first failing size
        # starts on its own line, like all subsequent entries.
        msg = "Found failing image sizes:\n" \
            + "\n========\n".join(f"[{size}]\n{exc}"
                                  for size, exc in failed_size_excs)
        raise AssertionError(msg)


72
73
@pytest.mark.parametrize("model_id", ["llava-hf/llava-v1.6-mistral-7b-hf"])
@pytest.mark.parametrize("num_imgs", [1, 2])
def test_processor_prompt_replacements_regression(model_id, num_imgs):
    """
    Validate image prompt replacements on a fixed list of image sizes.

    The sizes are hard-coded (presumably ones that exposed problems in the
    past, going by the test name) and each is checked in both orientations.
    """
    ctx = build_model_context(
        model_name=model_id,
        tokenizer_name=model_id,
        mm_processor_kwargs=None,
        limit_mm_per_prompt={"image": num_imgs},
    )
    processor = MULTIMODAL_REGISTRY.create_processor(
        ctx.model_config,
        tokenizer=cached_get_tokenizer(ctx.model_config.tokenizer),
    )

    image_ratios = [(171, 152), (184, 161), (198, 176), (333, 296), (369, 328),
                    (488, 183), (2560, 1669)]
    # Test each (w, h) pair and its transposed (h, w) counterpart.
    image_sizes = [
        size for w, h in image_ratios
        for size in [ImageSize(w, h), ImageSize(h, w)]
    ]

    _test_image_prompt_replacements(
        processor,
        num_imgs=num_imgs,
        image_sizes=image_sizes,
    )
98
99


100
101
102
103
@pytest.mark.skip("This test takes around 2 hours to run. "
                  "Comment this out to run it manually.")
@pytest.mark.parametrize("model_id", ["llava-hf/llava-v1.6-mistral-7b-hf"])
@pytest.mark.parametrize("num_imgs", [1])
def test_processor_prompt_replacements_all(model_id, num_imgs):
    """
    Validate image prompt replacements over a sweep of aspect ratios.

    Widths and heights from 64 to 1023 are enumerated, and one image size is
    kept for each distinct aspect ratio between 1 and 2 (inclusive). The test
    is skipped by default because of its runtime.
    """
    ctx = build_model_context(
        model_name=model_id,
        tokenizer_name=model_id,
        mm_processor_kwargs=None,
        limit_mm_per_prompt={"image": num_imgs},
    )
    processor = MULTIMODAL_REGISTRY.create_processor(
        ctx.model_config,
        tokenizer=cached_get_tokenizer(ctx.model_config.tokenizer),
    )

    seen_aspect_ratios = set[float]()
    image_sizes = list[ImageSize]()

    # The aspect ratio of the grid layout is between 1 and 2
    # NOTE: Assumes that feature size calculation is the same if we
    # swap the width and height of the image
    for w, h in itertools.product(range(64, 1024), repeat=2):
        aspect_ratio = w / h
        # Keep only the first size encountered for each aspect ratio.
        if 1 <= aspect_ratio <= 2 and aspect_ratio not in seen_aspect_ratios:
            image_sizes.append(ImageSize(w, h))
            seen_aspect_ratios.add(aspect_ratio)

    _test_image_prompt_replacements(
        processor,
        num_imgs=num_imgs,
        image_sizes=image_sizes,
    )