"vllm/vscode:/vscode.git/clone" did not exist on "abfe705a02160db53f4b0cf90c7b016f04291b9c"
test_llava_next.py 4.84 KB
Newer Older
1
# SPDX-License-Identifier: Apache-2.0
2
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
3

Cyrus Leung's avatar
Cyrus Leung committed
4
5
import pytest
import torch.nn.functional as F
6
from transformers import AutoModelForImageTextToText
Cyrus Leung's avatar
Cyrus Leung committed
7

8
9
from vllm.platforms import current_platform

Cyrus Leung's avatar
Cyrus Leung committed
10
11
from ....conftest import IMAGE_ASSETS, HfRunner, PromptImageInput, VllmRunner
from ....utils import large_gpu_test
12
from ...utils import check_embeddings_close
Cyrus Leung's avatar
Cyrus Leung committed
13

14
15
16
17
18
19
20
21
22
23
24
25
26
# The Llava Next embedding implementation is only supported on CUDA.
# On ROCm, hf_model.model.resize_token_embeddings fails with:
#    RuntimeError: Calling torch.linalg.cholesky on a CUDA tensor
#    requires compiling PyTorch with MAGMA. Please use PyTorch
#    built with MAGMA support.
# On CPU, hf_model.model.resize_token_embeddings fails with:
#    RuntimeError: Calling torch.linalg.cholesky on a CPU tensor
#    requires compiling PyTorch with LAPACK. Please use PyTorch
#    built with LAPACK support.
# Module-level marker: every test in this file is skipped off CUDA.
pytestmark = pytest.mark.skipif(
    not current_platform.is_cuda(),
    reason="Llava Next model uses op that is only supported in CUDA",
)
29

30
# Chat template with a single `{}` slot for the user message, followed by the
# assistant header so the model continues from the assistant turn.
llama3_template = (
    "<|start_header_id|>user<|end_header_id|>\n\n"
    "{}<|eot_id|>"
    "<|start_header_id|>assistant<|end_header_id|>\n\n \n"
)

# Text-only prompts (no image placeholder), each asking for a one-word summary.
HF_TEXT_PROMPTS = [
    # T -> X
    llama3_template.format(
        "The label of the object is stop sign\nSummary above sentence in one word: "  # noqa: E501
    ),
    # T -> X
    llama3_template.format("cherry blossom\nSummary above sentence in one word: "),
]

41
42
43
44
45
46
47
48
49
50
51
52
# Image prompts (I -> X), keyed by asset name. Both assets use the same
# one-word-summary request wrapped in the llama3 chat template.
HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts(
    {
        asset_name: llama3_template.format(
            "<image>\nSummary above image in one word: "
        )
        for asset_name in ("stop_sign", "cherry_blossom")
    }
)
Cyrus Leung's avatar
Cyrus Leung committed
53
54
55
56
57

# Model identifiers the tests below are parametrized over
# (see https://huggingface.co/royokong/e5-v).
MODELS = ["royokong/e5-v"]


def _run_test(
    hf_runner: type[HfRunner],
    vllm_runner: type[VllmRunner],
    input_texts: list[str],
    input_images: PromptImageInput,
    model: str,
    *,
    dtype: str,
) -> None:
    """Embed the same prompts with vLLM and with HF, then compare the results.

    Args:
        hf_runner: Context-manager factory for the Hugging Face reference model.
        vllm_runner: Context-manager factory for the vLLM model.
        input_texts: Prompts to embed.
        input_images: Image inputs passed alongside ``input_texts`` to both
            runners.
        model: Model identifier handed to both runners.
        dtype: Dtype string handed to both runners.

    The comparison itself is delegated to ``check_embeddings_close``.
    """
    # NOTE: take care of the order. run vLLM first, and then run HF.
    # vLLM needs a fresh new process without cuda initialization.
    # if we run HF first, the cuda initialization will be done and it
    # will hurt multiprocessing backend with fork method (the default method).
    with vllm_runner(
        model, runner="pooling", dtype=dtype, max_model_len=4096, enforce_eager=True
    ) as vllm_model:
        vllm_outputs = vllm_model.embed(input_texts, images=input_images)

    with hf_runner(
        model, dtype=dtype, auto_cls=AutoModelForImageTextToText
    ) as hf_model:
        # Copy patch_size from the vision config onto the processor.
        # The original note attributes this to a missing generation_config.json.
        # NOTE(review): patch_size looks like a processor setting -- confirm
        # which config file is actually missing upstream.
        hf_model.processor.patch_size = hf_model.model.config.vision_config.patch_size

        # Patch the issue where image_token_id
        # exceeds the maximum allowed vocab size
        hf_model.model.resize_token_embeddings(
            hf_model.model.model.language_model.vocab_size + 1
        )

        hf_outputs = []
        for inputs in hf_model.get_inputs(input_texts, images=input_images):
            # Based on: https://huggingface.co/royokong/e5-v
            outputs = hf_model.model(
                **hf_model.wrap_device(inputs),
                return_dict=True,
                output_hidden_states=True,
            )
            # Take the last entry of hidden_states at index [0, -1, :]
            # (presumably first batch item, last token -- TODO confirm layout)
            # and normalize it along the last dimension.
            last_hidden = outputs.hidden_states[-1][0, -1, :]
            pooled_output = F.normalize(last_hidden, dim=-1)

            hf_outputs.append(pooled_output.tolist())

    check_embeddings_close(
        embeddings_0_lst=hf_outputs,
        embeddings_1_lst=vllm_outputs,
        name_0="hf",
        name_1="vllm",
    )


111
@pytest.mark.core_model
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", ["half"])
def test_models_text(
    hf_runner,
    vllm_runner,
    image_assets,
    model: str,
    dtype: str,
) -> None:
    """Compare vLLM and HF embeddings for the text-only prompts.

    Each prompt in ``HF_TEXT_PROMPTS`` is paired with ``None`` as its image.
    ``image_assets`` is requested as a fixture but not used here.
    """
    input_texts = list(HF_TEXT_PROMPTS)
    # One `None` placeholder per prompt: these inputs carry no image.
    input_images = [None] * len(input_texts)

    _run_test(
        hf_runner,
        vllm_runner,
        input_texts,
        input_images,  # type: ignore
        model,
        dtype=dtype,
    )


@large_gpu_test(min_gb=48)
@pytest.mark.core_model
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", ["half"])
def test_models_image(
    hf_runner,
    vllm_runner,
    image_assets,
    model: str,
    dtype: str,
) -> None:
    """Compare vLLM and HF embeddings for the image prompts.

    Prompts from ``HF_IMAGE_PROMPTS`` are paired positionally with the
    ``image_assets`` fixture; each asset contributes its ``pil_image``.
    """
    # zip() stops at the shorter of the two sequences, so any surplus prompt
    # or asset is dropped.
    prompt_asset_pairs = list(zip(HF_IMAGE_PROMPTS, image_assets))
    input_texts = [prompt for prompt, _ in prompt_asset_pairs]
    input_images = [asset.pil_image for _, asset in prompt_asset_pairs]

    _run_test(
        hf_runner,
        vllm_runner,
        input_texts,
        input_images,
        model,
        dtype=dtype,
    )