test_phi3v.py 3.77 KB
Newer Older
Cyrus Leung's avatar
Cyrus Leung committed
1
2
from typing import List, Type

3
4
5
import pytest
import torch.nn.functional as F

Cyrus Leung's avatar
Cyrus Leung committed
6
7
from ....conftest import IMAGE_ASSETS, HfRunner, PromptImageInput, VllmRunner
from ....utils import large_gpu_test
8
9
from ..utils import check_embeddings_close

Cyrus Leung's avatar
Cyrus Leung committed
10
11
12
13
14
15
16
# Text-only prompts; test_models_text pairs each one with ``None`` as its
# image before handing them to _run_test.
HF_TEXT_PROMPTS = [
    # T -> X
    "Find me an everyday image that matches the given caption: The label of the object is stop sign",  # noqa: E501
    # T -> X
    "Retrieve an image of this caption: cherry blossom",
]

17
# Prompts for the image tests, keyed by image asset name. Every prompt starts
# with the ``<|image_1|>`` placeholder. test_models_image zips the result with
# the ``image_assets`` fixture, so the key order here is presumably expected
# to match the asset order -- verify against IMAGE_ASSETS.prompts.
HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts({
    # T + I -> X
    "stop_sign":
    "<|image_1|> Select the portion of the image that isolates the object of the given label: The label of the object is stop sign",  # noqa: E501
    # I -> X
    "cherry_blossom":
    "<|image_1|> Represent the given image for classification",  # noqa: E501
})

# Model identifiers that both tests below are parametrized over.
MODELS = ["TIGER-Lab/VLM2Vec-Full"]


Cyrus Leung's avatar
Cyrus Leung committed
29
30
31
32
33
def _run_test(
    hf_runner: Type[HfRunner],
    vllm_runner: Type[VllmRunner],
    input_texts: List[str],
    input_images: PromptImageInput,
    model: str,
    *,
    dtype: str,
) -> None:
    """Check that vLLM and HuggingFace produce close embeddings for ``model``.

    The prompts are first encoded with vLLM (embedding task, eager mode).
    The same prompts are then run through the HuggingFace model, where the
    embedding is rebuilt by hand: the final hidden state at the last attended
    position is taken and L2-normalized. The two result lists are compared
    with ``check_embeddings_close``.

    Args:
        hf_runner: Context-manager factory for the HuggingFace model.
        vllm_runner: Context-manager factory for the vLLM model.
        input_texts: Prompt strings, one per sample.
        input_images: Images passed alongside ``input_texts`` to both
            runners (the text-only test passes ``None`` entries).
        model: Model identifier handed to both runners.
        dtype: Dtype string handed to both runners.
    """
    # NOTE: take care of the order. run vLLM first, and then run HF.
    # vLLM needs a fresh new process without cuda initialization.
    # if we run HF first, the cuda initialization will be done and it
    # will hurt multiprocessing backend with fork method (the default method).
    with vllm_runner(model, task="embedding", dtype=dtype,
                     enforce_eager=True) as vllm_model:
        vllm_outputs = vllm_model.encode(input_texts, images=input_images)

    # use eager mode for hf runner, since phi3_v didn't work with flash_attn
    hf_model_kwargs = {"_attn_implementation": "eager"}
    with hf_runner(model, dtype=dtype,
                   model_kwargs=hf_model_kwargs) as hf_model:
        all_inputs = hf_model.get_inputs(input_texts, images=input_images)

        hf_outputs = []
        for inputs in all_inputs:
            # Based on: https://github.com/TIGER-AI-Lab/VLM2Vec/blob/db3b951bccabba220c1f53ab46a734e50dd2fc08/src/model.py
            outputs = hf_model.model(
                **hf_model.wrap_device(inputs,
                                       device=hf_model.model.device.type),
                return_dict=True,
                output_hidden_states=True,
            )
            # Final entry of hidden_states, first (only) item in the batch.
            last_hidden_state = outputs.hidden_states[-1][0]
            # Index of the last position counted by the attention mask
            # (assumes the mask is a contiguous run of ones from position 0
            # -- TODO confirm padding side).
            reps = last_hidden_state[inputs.attention_mask[0].sum() - 1]
            # L2-normalize along the feature dimension.
            pooled_output = F.normalize(reps, p=2, dim=-1)

            hf_outputs.append(pooled_output.tolist())

    check_embeddings_close(
        embeddings_0_lst=hf_outputs,
        embeddings_1_lst=vllm_outputs,
        name_0="hf",
        name_1="vllm",
    )
Cyrus Leung's avatar
Cyrus Leung committed
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124


@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", ["half"])
def test_models_text(
    hf_runner,
    vllm_runner,
    image_assets,
    model: str,
    dtype: str,
) -> None:
    """Compare HF and vLLM embeddings for the text-only prompts.

    Each entry of ``HF_TEXT_PROMPTS`` is sent with ``None`` as its image.
    """
    prompts = list(HF_TEXT_PROMPTS)
    # One ``None`` placeholder per prompt: these samples carry no image.
    no_images = [None] * len(prompts)

    _run_test(
        hf_runner,
        vllm_runner,
        prompts,
        no_images,  # type: ignore
        model,
        dtype=dtype,
    )


@large_gpu_test(min_gb=48)
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", ["half"])
def test_models_image(
    hf_runner,
    vllm_runner,
    image_assets,
    model: str,
    dtype: str,
) -> None:
    """Compare HF and vLLM embeddings for the image prompts.

    Prompts from ``HF_IMAGE_PROMPTS`` are paired positionally with the
    ``image_assets`` fixture; each asset contributes its ``pil_image``.
    The test is gated by ``large_gpu_test(min_gb=48)``.
    """
    prompts = []
    images = []
    # zip stops at the shorter of the two sequences, as before.
    for prompt, asset in zip(HF_IMAGE_PROMPTS, image_assets):
        prompts.append(prompt)
        images.append(asset.pil_image)

    _run_test(
        hf_runner,
        vllm_runner,
        prompts,
        images,
        model,
        dtype=dtype,
    )