test_internvl.py 3.92 KB
Newer Older
1
# SPDX-License-Identifier: Apache-2.0
2
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
3
"""Tests for InternVL's multimodal preprocessing kwargs."""
4
5
from collections.abc import Mapping
from typing import Optional
6

zhuwenwen's avatar
zhuwenwen committed
7
import os
8
import pytest
9
10
from PIL import Image
from transformers import PretrainedConfig
11

12
from vllm.multimodal import MULTIMODAL_REGISTRY
13
14
from vllm.multimodal.image import rescale_image_size
from vllm.multimodal.processing import BaseMultiModalProcessor
15

16
from ....conftest import ImageTestAssets
17
from ...utils import build_model_context
zhuwenwen's avatar
zhuwenwen committed
18
from ....utils import models_path_prefix
19
20


21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
def _get_expected_num_patches(
    config: PretrainedConfig,
    image: Image.Image,
    num_imgs: int,
    min_num: int,
    max_num: int,
):
    """Return the number of patches expected for a single image.

    The block count comes from ``calculate_internvl_targets`` using the
    target ratios for ``[min_num, max_num]`` and the vision tower's
    ``image_size``. One extra patch is counted when the config enables
    thumbnails and the image was split into more than one block.

    Args:
        config: HF config; ``vision_config.image_size`` and
            ``use_thumbnail`` are read from it.
        image: Image whose ``size`` (width, height) drives the tiling.
        num_imgs: Not used by the computation; accepted so existing
            callers keep working.
        min_num: Lower bound passed to ``get_internvl_target_ratios``.
        max_num: Upper bound passed to ``get_internvl_target_ratios``.
    """
    from vllm.model_executor.models.internvl import (
        calculate_internvl_targets, get_internvl_target_ratios)

    img_width, img_height = image.size
    ratios = get_internvl_target_ratios(min_num, max_num)

    # The thumbnail is accounted for manually below, so it is disabled here.
    num_blocks, _, _ = calculate_internvl_targets(
        orig_width=img_width,
        orig_height=img_height,
        target_ratios=ratios,
        image_size=config.vision_config.image_size,
        use_thumbnail=False,
    )

    needs_thumbnail = config.use_thumbnail and num_blocks > 1
    return num_blocks + 1 if needs_thumbnail else num_blocks


def _run_check(
    processor: BaseMultiModalProcessor,
    images: list[Image.Image],
    min_num: int,
    max_num: int,
    mm_processor_kwargs: Mapping[str, object],
):
    """Run ``processor`` on ``images`` and assert the resulting patch counts.

    Builds a prompt containing one ``<image>`` placeholder per image, applies
    the processor with ``mm_processor_kwargs``, and checks that both the
    number of ``<IMG_CONTEXT>`` tokens in the prompt and the leading
    dimension of ``pixel_values_flat`` match the expected patch total.

    Args:
        processor: Processor under test.
        images: Images to feed as the ``"image"`` modality.
        min_num: Minimum patch count used to compute the expectation.
        max_num: Maximum patch count used to compute the expectation.
        mm_processor_kwargs: Per-call kwargs forwarded to ``processor.apply``.

    Raises:
        AssertionError: If the token count or pixel batch size is unexpected.
    """
    tokenizer = processor.info.get_tokenizer()
    config = processor.info.get_hf_config()

    prompt = "<image>" * len(images)
    mm_data = {"image": images}

    total_expected_num_patches = sum(
        _get_expected_num_patches(config, image, len(images), min_num, max_num)
        for image in images)

    processed_inputs = processor.apply(prompt, mm_data, mm_processor_kwargs)

    # Ensure we have the right number of placeholders per num_crops size
    image_token_id = tokenizer.convert_tokens_to_ids("<IMG_CONTEXT>")
    img_tok_count = processed_inputs["prompt_token_ids"].count(image_token_id)
    pixel_shape = processed_inputs["mm_kwargs"]["pixel_values_flat"].shape

    # NOTE(review): 256 is presumably the number of context tokens emitted per
    # patch for the tested checkpoint -- confirm if other models are added.
    assert img_tok_count == 256 * total_expected_num_patches
    assert pixel_shape[0] == total_expected_num_patches


zhuwenwen's avatar
zhuwenwen committed
79
# NOTE: argvalues must be a list; a bare string would be iterated character
# by character, producing one bogus ``model_id`` per character of the path.
@pytest.mark.parametrize(
    "model_id",
    [os.path.join(models_path_prefix, "OpenGVLab/InternVL2-2B")],
)
@pytest.mark.parametrize(
    "size_factors",
    [
        # Single-scale
        [1.0],
        # Single-scale, batched
        [1.0, 1.0, 1.0],
        # Multi-scale
        [0.25, 0.5, 1.0],
        [4.0, 2.0, 1.0],
    ],
)
@pytest.mark.parametrize(
    ("min_dynamic_patch", "max_dynamic_patch"),
    [(1, 1), (1, 2), (1, 4), (1, 8), (2, 4), (4, 8)],
)
@pytest.mark.parametrize("dynamic_image_size", [True, False])
@pytest.mark.parametrize("kwargs_on_init", [True, False])
def test_processor_override(
    model_id: str,
    image_assets: ImageTestAssets,
    size_factors: list[float],
    min_dynamic_patch: int,
    max_dynamic_patch: int,
    dynamic_image_size: Optional[bool],
    kwargs_on_init: bool,
):
    """Check that the dynamic-patch kwargs are honored by the processor.

    The kwargs are supplied either when the model context is built
    (``kwargs_on_init=True``) or on the ``processor.apply`` call
    (``kwargs_on_init=False``); the expected patch counts are the same
    in both cases.
    """
    mm_processor_kwargs = {
        "min_dynamic_patch": min_dynamic_patch,
        "max_dynamic_patch": max_dynamic_patch,
        "dynamic_image_size": dynamic_image_size,
    }

    ctx = build_model_context(
        model_id,
        mm_processor_kwargs=mm_processor_kwargs if kwargs_on_init else None,
        limit_mm_per_prompt={"image": len(size_factors)},
    )
    processor = MULTIMODAL_REGISTRY.create_processor(ctx.model_config)
    # When the kwargs were given at init time, nothing is passed per call.
    hf_processor_mm_kwargs = {} if kwargs_on_init else mm_processor_kwargs

    # With dynamic sizing disabled, the expectation is computed with
    # both patch bounds pinned to 1.
    min_num = min_dynamic_patch if dynamic_image_size else 1
    max_num = max_dynamic_patch if dynamic_image_size else 1

    _run_check(
        processor,
        [
            rescale_image_size(image_assets[0].pil_image, f)
            for f in size_factors
        ],
        min_num,
        max_num,
        hf_processor_mm_kwargs,
    )