test_phi3v.py 4.58 KB
Newer Older
1
# SPDX-License-Identifier: Apache-2.0
2
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
3

4
5
import pytest
import torch.nn.functional as F
6
import transformers.utils
pansicheng's avatar
pansicheng committed
7
8
9
10
from PIL import Image

from vllm.assets.base import get_vllm_public_assets
from vllm.assets.image import VLM_IMAGES_DIR
11

Cyrus Leung's avatar
Cyrus Leung committed
12
13
from ....conftest import IMAGE_ASSETS, HfRunner, PromptImageInput, VllmRunner
from ....utils import large_gpu_test
14
from ...utils import check_embeddings_close
15

16
17
18
19
20
21
# Backward-compatibility shim for a helper that was deleted in Transformers v5.
# It is only needed for generating the HF reference outputs.
def _is_flash_attn_greater_or_equal_2_10():
    """Delegate to the generic flash-attn version check with "2.1.0"."""
    return transformers.utils.is_flash_attn_greater_or_equal("2.1.0")


# Re-expose the helper under its old name on `transformers.utils`.
transformers.utils.is_flash_attn_greater_or_equal_2_10 = (
    _is_flash_attn_greater_or_equal_2_10
)

Cyrus Leung's avatar
Cyrus Leung committed
22
23
24
25
26
27
28
# Text-only prompts; `test_models_text` embeds each of them with no image
# attached and compares the vLLM embeddings against the HF reference.
HF_TEXT_PROMPTS = [
    # T -> X
    "Find me an everyday image that matches the given caption: The label of the object is stop sign",  # noqa: E501
    # T -> X
    "Retrieve an image of this caption: cherry blossom",
]

29
30
31
32
33
34
35
36
# Image prompts built through `IMAGE_ASSETS.prompts`; each one carries the
# `<|image_1|>` placeholder. `test_models_image` zips the result with the
# `image_assets` fixture to pair every prompt with a PIL image.
# NOTE(review): the dict keys look like image asset names — confirm against
# the `IMAGE_ASSETS` definition in conftest.
HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts(
    {
        # T + I -> X
        "stop_sign": "<|image_1|> Select the portion of the image that isolates the object of the given label: The label of the object is stop sign",  # noqa: E501
        # I -> X
        "cherry_blossom": "<|image_1|> Represent the given image for classification",  # noqa: E501
    }
)
37
38
39
40

# Model identifiers that both tests in this module are parametrized over.
MODELS = ["TIGER-Lab/VLM2Vec-Full"]


Cyrus Leung's avatar
Cyrus Leung committed
41
def _run_test(
    hf_runner: type[HfRunner],
    vllm_runner: type[VllmRunner],
    input_texts: list[str],
    input_images: PromptImageInput,
    model: str,
    *,
    dtype: str,
) -> None:
    """Embed the same prompts with vLLM and HF and check that they agree.

    Args:
        hf_runner: Runner class used to produce the HF reference embeddings.
        vllm_runner: Runner class used to produce the vLLM embeddings.
        input_texts: Prompts to embed, one per request.
        input_images: Images passed alongside ``input_texts`` to both runners.
        model: Model name forwarded to both runners.
        dtype: Dtype string forwarded to both runners.

    The HF reference embedding for each prompt is the final hidden state of
    the first sequence taken at index ``attention_mask[0].sum() - 1``
    (presumably the last attended token), L2-normalised along the last
    dimension. The comparison itself is delegated to
    ``check_embeddings_close``.
    """
    # NOTE: the order matters: vLLM must run before HF.
    # vLLM needs a fresh new process without cuda initialization.
    # Running HF first would initialize cuda, which hurts the
    # multiprocessing backend with the fork method (the default method).
    with vllm_runner(
        model,
        runner="pooling",
        dtype=dtype,
        enforce_eager=True,
    ) as vllm_model:
        vllm_outputs = vllm_model.embed(input_texts, images=input_images)

    # The HF runner uses eager attention, since phi3_v didn't work with
    # flash_attn.
    with hf_runner(
        model,
        dtype=dtype,
        model_kwargs={"_attn_implementation": "eager"},
    ) as hf_model:
        hf_outputs = []
        for hf_inputs in hf_model.get_inputs(input_texts, images=input_images):
            # Based on: https://github.com/TIGER-AI-Lab/VLM2Vec/blob/db3b951bccabba220c1f53ab46a734e50dd2fc08/src/model.py
            forward_out = hf_model.model(
                **hf_model.wrap_device(hf_inputs),
                return_dict=True,
                output_hidden_states=True,
            )
            final_layer = forward_out.hidden_states[-1][0]
            last_token_idx = hf_inputs.attention_mask[0].sum() - 1
            embedding = F.normalize(final_layer[last_token_idx], p=2, dim=-1)

            hf_outputs.append(embedding.tolist())

    check_embeddings_close(
        embeddings_0_lst=hf_outputs,
        embeddings_1_lst=vllm_outputs,
        name_0="hf",
        name_1="vllm",
    )
Cyrus Leung's avatar
Cyrus Leung committed
86
87


88
@pytest.mark.core_model
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", ["half"])
def test_models_text(
    hf_runner,
    vllm_runner,
    image_assets,
    model: str,
    dtype: str,
) -> None:
    """Compare vLLM and HF embeddings for the text-only prompts.

    Every entry of ``HF_TEXT_PROMPTS`` is sent with ``None`` as its image.
    The ``image_assets`` fixture is requested but not used in the body.
    """
    text_prompts = list(HF_TEXT_PROMPTS)
    # One `None` placeholder per prompt: no image accompanies the text.
    no_images = [None] * len(text_prompts)

    _run_test(
        hf_runner,
        vllm_runner,
        text_prompts,
        no_images,  # type: ignore
        model,
        dtype=dtype,
    )


@large_gpu_test(min_gb=48)
@pytest.mark.core_model
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", ["half"])
def test_models_image(
    hf_runner,
    vllm_runner,
    image_assets,
    model: str,
    dtype: str,
) -> None:
    """Compare vLLM and HF embeddings for prompts that carry an image.

    The cases are each ``HF_IMAGE_PROMPTS`` entry paired with the PIL image
    of the corresponding ``image_assets`` item, plus one extra prompt that
    embeds special tokens and is paired with ``cherry_blossom.jpg`` fetched
    through ``get_vllm_public_assets``.
    """
    prompt_image_pairs = [
        (prompt, asset.pil_image)
        for prompt, asset in zip(HF_IMAGE_PROMPTS, image_assets)
    ]

    # add cases for special_tokens
    special_token_prompt = (
        "\n<s><|user|>\n <|image_1|>\n\t <s>"
        "Represent the given image for classification<|end|>"
        "\n<|assistant|>\n"
    )
    cherry_blossom_path = get_vllm_public_assets(
        filename="cherry_blossom.jpg", s3_prefix=VLM_IMAGES_DIR
    )
    prompt_image_pairs.append(
        (special_token_prompt, Image.open(cherry_blossom_path))
    )

    # Split the pairs back into the parallel lists `_run_test` expects.
    input_texts = [prompt for prompt, _ in prompt_image_pairs]
    input_images = [picture for _, picture in prompt_image_pairs]

    _run_test(
        hf_runner,
        vllm_runner,
        input_texts,
        input_images,
        model,
        dtype=dtype,
    )