test_llava_next.py 4.83 KB
Newer Older
1
2
# SPDX-License-Identifier: Apache-2.0

Cyrus Leung's avatar
Cyrus Leung committed
3
4
import pytest
import torch.nn.functional as F
5
from transformers import AutoModelForImageTextToText
Cyrus Leung's avatar
Cyrus Leung committed
6

7
8
from vllm.platforms import current_platform

Cyrus Leung's avatar
Cyrus Leung committed
9
10
from ....conftest import IMAGE_ASSETS, HfRunner, PromptImageInput, VllmRunner
from ....utils import large_gpu_test
11
from ...utils import check_embeddings_close
Cyrus Leung's avatar
Cyrus Leung committed
12

13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
# Llava Next embedding implementation is only supported by CUDA.
# If run on ROCm, hf_model.model.resize_token_embeddings will
# cause the following error:
#    RuntimeError: Calling torch.linalg.cholesky on a CUDA tensor
#    requires compiling PyTorch with MAGMA. Please use PyTorch
#    built with MAGMA support.
# If run on CPU, hf_model.model.resize_token_embeddings will
# cause the following error:
#    RuntimeError: Calling torch.linalg.cholesky on a CPU tensor
#    requires compiling PyTorch with LAPACK. Please use PyTorch
#    built with LAPACK support.
pytestmark = pytest.mark.skipif(
    not current_platform.is_cuda(),
    reason="Llava Next model uses op that is only supported in CUDA")

Cyrus Leung's avatar
Cyrus Leung committed
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
llama3_template = '<|start_header_id|>user<|end_header_id|>\n\n{}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n \n'  # noqa: E501

HF_TEXT_PROMPTS = [
    # T -> X
    llama3_template.format(
        "The label of the object is stop sign\nSummary above sentence in one word: "  # noqa: E501
    ),
    # T -> X
    llama3_template.format(
        "cherry blossom\nSummary above sentence in one word: "),
]

HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts({
    # I -> X
    "stop_sign":
    llama3_template.format("<image>\nSummary above image in one word: "),
    # I -> X
    "cherry_blossom":
    llama3_template.format("<image>\nSummary above image in one word: "),
})

MODELS = ["royokong/e5-v"]


def _run_test(
53
54
55
    hf_runner: type[HfRunner],
    vllm_runner: type[VllmRunner],
    input_texts: list[str],
Cyrus Leung's avatar
Cyrus Leung committed
56
57
58
59
60
61
62
63
64
65
    input_images: PromptImageInput,
    model: str,
    *,
    dtype: str,
) -> None:
    # NOTE: take care of the order. run vLLM first, and then run HF.
    # vLLM needs a fresh new process without cuda initialization.
    # if we run HF first, the cuda initialization will be done and it
    # will hurt multiprocessing backend with fork method (the default method).
    with vllm_runner(model,
66
                     task="embed",
Cyrus Leung's avatar
Cyrus Leung committed
67
68
69
70
71
72
                     dtype=dtype,
                     max_model_len=4096,
                     enforce_eager=True) as vllm_model:
        vllm_outputs = vllm_model.encode(input_texts, images=input_images)

    with hf_runner(model, dtype=dtype,
73
                   auto_cls=AutoModelForImageTextToText) as hf_model:
74
75
76
77
        # Patch the issue where generation_config.json is missing
        hf_model.processor.patch_size = \
            hf_model.model.config.vision_config.patch_size

Cyrus Leung's avatar
Cyrus Leung committed
78
79
80
81
82
83
84
85
86
87
88
        # Patch the issue where image_token_id
        # exceeds the maximum allowed vocab size
        hf_model.model.resize_token_embeddings(
            hf_model.model.language_model.vocab_size + 1)

        all_inputs = hf_model.get_inputs(input_texts, images=input_images)

        all_outputs = []
        for inputs in all_inputs:
            # Based on: https://huggingface.co/royokong/e5-v
            outputs = hf_model.model(
89
                **hf_model.wrap_device(inputs),
Cyrus Leung's avatar
Cyrus Leung committed
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
                return_dict=True,
                output_hidden_states=True,
            )
            pooled_output = F.normalize(outputs.hidden_states[-1][0, -1, :],
                                        dim=-1)

            all_outputs.append(pooled_output.tolist())

        hf_outputs = all_outputs

    check_embeddings_close(
        embeddings_0_lst=hf_outputs,
        embeddings_1_lst=vllm_outputs,
        name_0="hf",
        name_1="vllm",
    )


108
@pytest.mark.core_model
Cyrus Leung's avatar
Cyrus Leung committed
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", ["half"])
def test_models_text(
    hf_runner,
    vllm_runner,
    image_assets,
    model: str,
    dtype: str,
) -> None:
    input_texts_images = [(text, None) for text in HF_TEXT_PROMPTS]
    input_texts = [text for text, _ in input_texts_images]
    input_images = [image for _, image in input_texts_images]

    _run_test(
        hf_runner,
        vllm_runner,
        input_texts,
        input_images,  # type: ignore
        model,
        dtype=dtype,
    )


@large_gpu_test(min_gb=48)
133
@pytest.mark.core_model
Cyrus Leung's avatar
Cyrus Leung committed
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", ["half"])
def test_models_image(
    hf_runner,
    vllm_runner,
    image_assets,
    model: str,
    dtype: str,
) -> None:
    input_texts_images = [
        (text, asset.pil_image)
        for text, asset in zip(HF_IMAGE_PROMPTS, image_assets)
    ]
    input_texts = [text for text, _ in input_texts_images]
    input_images = [image for _, image in input_texts_images]

    _run_test(
        hf_runner,
        vllm_runner,
        input_texts,
        input_images,
        model,
        dtype=dtype,
    )