"tests/vscode:/vscode.git/clone" did not exist on "652ba93da36d793e7f3ff8a0ecdb5d6b00107e68"
test_transformers.py 7.59 KB
Newer Older
1
# SPDX-License-Identifier: Apache-2.0
2
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
3
"""Test the functionality of the Transformers backend."""
4
5
from typing import Any, Optional, Union

6
7
import pytest

8
9
from vllm.platforms import current_platform

10
from ..conftest import HfRunner, VllmRunner
11
from ..utils import multi_gpu_test, prep_prompts
12
from .utils import check_embeddings_close, check_logprobs_close
13
14
15


def check_implementation(
    runner_ref: type[Union[HfRunner, VllmRunner]],
    runner_test: type[VllmRunner],
    example_prompts: list[str],
    model: str,
    kwargs_ref: Optional[dict[str, Any]] = None,
    kwargs_test: Optional[dict[str, Any]] = None,
    **kwargs,
) -> None:
    """Compare greedy logprobs of the Transformers backend to a reference.

    `model` is first run through `runner_test`, which must report that it
    is using the Transformers backend, and then through `runner_ref`. The
    two sets of outputs are handed to `check_logprobs_close`.

    Args:
        runner_ref: Runner class that produces the reference outputs.
        runner_test: Runner class that produces the outputs under test.
        example_prompts: Prompts passed to both runners.
        model: Model identifier passed to both runners.
        kwargs_ref: Extra keyword arguments for `runner_ref` only.
        kwargs_test: Extra keyword arguments for `runner_test` only.
        **kwargs: Additional keyword arguments forwarded to `runner_test`
            only; they are not passed to `runner_ref`. A key that also
            appears in `kwargs_test` raises `TypeError` at the call.
    """
    if kwargs_ref is None:
        kwargs_ref = {}
    if kwargs_test is None:
        kwargs_test = {}

    max_tokens = 32
    num_logprobs = 5

    # Positional arguments shared by every generate call below.
    args = (example_prompts, max_tokens, num_logprobs)

    with runner_test(model, **kwargs_test, **kwargs) as model_test:
        # The comparison is meaningless unless the Transformers backend
        # was actually selected for the run under test.
        model_config = model_test.llm.llm_engine.model_config
        assert model_config.using_transformers_backend()

        outputs_test = model_test.generate_greedy_logprobs(*args)

    with runner_ref(model, **kwargs_ref) as model_ref:
        if isinstance(model_ref, VllmRunner):
            outputs_ref = model_ref.generate_greedy_logprobs(*args)
        else:
            # Any other reference runner (HfRunner, per the annotation)
            # is queried through its `_limit` variant.
            outputs_ref = model_ref.generate_greedy_logprobs_limit(*args)

    check_logprobs_close(
        outputs_0_lst=outputs_ref,
        outputs_1_lst=outputs_test,
        name_0="ref",
        name_1="test",
    )


54
55
56
@pytest.mark.skipif(
    current_platform.is_rocm(),
    reason="Llama-3.2-1B-Instruct, Ilama-3.2-1B produce memory access fault.")
@pytest.mark.parametrize(
    "model,model_impl",
    [
        ("meta-llama/Llama-3.2-1B-Instruct", "transformers"),
        ("hmellor/Ilama-3.2-1B", "auto"),  # CUSTOM CODE
    ])  # trust_remote_code=True by default
def test_models(
    hf_runner: type[HfRunner],
    vllm_runner: type[VllmRunner],
    example_prompts: list[str],
    model: str,
    model_impl: str,
) -> None:
    """Compare each parametrized model against the HF runner.

    `hf_runner` supplies the reference outputs and `vllm_runner`, started
    with the parametrized `model_impl`, supplies the outputs under test.
    The whole test is skipped when `current_platform.is_rocm()` is true.
    """
    check_implementation(hf_runner,
                         vllm_runner,
                         example_prompts,
                         model,
                         model_impl=model_impl)
75
76


77
78
79
80
81
82
83
84
85
86
87
88
def test_hybrid_attention(vllm_runner: type[VllmRunner]) -> None:
    """Compare the Transformers backend against a plain VllmRunner run.

    Both sides use `VllmRunner` on a tiny random Gemma2 checkpoint; only
    the run under test additionally sets `model_impl="transformers"`.
    """
    # NOTE(review): presumably 4 prompts of roughly 800 tokens each --
    # confirm against `prep_prompts`. Only the first return value is used.
    long_prompts, _, _ = prep_prompts(4, (800, 801))

    shared_kwargs = {"max_model_len": 8192, "enforce_eager": True}
    transformers_kwargs = {"model_impl": "transformers"} | shared_kwargs

    check_implementation(
        vllm_runner,
        vllm_runner,
        long_prompts,
        model="hmellor/tiny-random-Gemma2ForCausalLM",
        kwargs_ref=shared_kwargs,
        kwargs_test=transformers_kwargs,
    )


89
90
@multi_gpu_test(num_gpus=2)
def test_distributed(
    hf_runner: type[HfRunner],
    vllm_runner: type[VllmRunner],
    example_prompts: list[str],
) -> None:
    """Compare a tensor-parallel Transformers backend run to the HF runner.

    The run under test uses `model_impl="transformers"` with
    `tensor_parallel_size=2`; the HF reference runner receives no extra
    keyword arguments.
    """
    kwargs = {"model_impl": "transformers", "tensor_parallel_size": 2}
    check_implementation(hf_runner,
                         vllm_runner,
                         example_prompts,
                         "meta-llama/Llama-3.2-1B-Instruct",
                         kwargs_test=kwargs)
101
102
103


@pytest.mark.parametrize("model, quantization_kwargs", [
    ("TheBloke/TinyLlama-1.1B-Chat-v0.3-AWQ", {}),
    ("TheBloke/TinyLlama-1.1B-Chat-v0.3-GPTQ", {}),
    (
        "meta-llama/Llama-3.2-1B-Instruct",
        {
            "quantization": "bitsandbytes",
        },
    ),
])
@pytest.mark.parametrize("max_tokens", [32])
@pytest.mark.parametrize("num_logprobs", [5])
def test_quantization(
    vllm_runner: type[VllmRunner],
    example_prompts: list[str],
    model: str,
    quantization_kwargs: dict[str, str],
    max_tokens: int,
    num_logprobs: int,
) -> None:
    """Compare quantized models between `auto` and `transformers` impls.

    Each model is run twice with `enforce_eager=True` and the parametrized
    `quantization_kwargs`: once with `model_impl="auto"` and once with
    `model_impl="transformers"`. The greedy logprobs of the two runs are
    passed to `check_logprobs_close`.
    """
    if (current_platform.is_rocm()
            and quantization_kwargs.get("quantization", "") == "bitsandbytes"):
        pytest.skip(
            "bitsandbytes quantization is currently not supported in rocm.")

    # Baseline run: let vLLM choose the implementation.
    with vllm_runner(
            model, model_impl="auto", enforce_eager=True,
            **quantization_kwargs) as vllm_model:  # type: ignore[arg-type]
        vllm_outputs = vllm_model.generate_greedy_logprobs(
            example_prompts, max_tokens=max_tokens, num_logprobs=num_logprobs)

    # Run under test: force the Transformers backend and verify it was used.
    with vllm_runner(
            model,
            model_impl="transformers",
            enforce_eager=True,
            **quantization_kwargs) as vllm_model:  # type: ignore[arg-type]
        model_config = vllm_model.llm.llm_engine.model_config
        assert model_config.using_transformers_backend()

        transformers_outputs = vllm_model.generate_greedy_logprobs(
            example_prompts, max_tokens=max_tokens, num_logprobs=num_logprobs)

    check_logprobs_close(
        outputs_0_lst=transformers_outputs,
        outputs_1_lst=vllm_outputs,
        name_0="transformers",
        name_1="vllm",
    )
151
152


153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
@pytest.mark.parametrize(
    "model",
    [
        # Layers live in `layers`
        "Qwen/Qwen3-Embedding-0.6B",
        # Layers live in `model.layers`
        "meta-llama/Llama-3.2-1B-Instruct"
    ],
)
def test_embed_loading(vllm_runner, model):
    """Check that each model loads as a pooling model on Transformers.

    The test only constructs the runner and asserts that the resulting
    model config reports the Transformers backend; no inference is run.
    """
    runner_kwargs = {
        "max_model_len": 1024,
        "enforce_eager": True,
        "runner": "pooling",
        "model_impl": "transformers",
    }
    with vllm_runner(model, **runner_kwargs) as pooling_model:
        loaded_config = pooling_model.llm.llm_engine.model_config
        assert loaded_config.using_transformers_backend()


172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
@pytest.mark.parametrize(
    "model",
    [
        # Encoder model
        "BAAI/bge-base-en-v1.5",
    ])
def test_embed_correctness(hf_runner, vllm_runner, example_prompts, model):
    """Compare Transformers-backend embeddings with the HF runner's.

    Skipped when the installed `transformers` is older than 4.57.0.dev0.
    Otherwise the embeddings from vLLM (Transformers backend) and from the
    HF runner in sentence-transformer mode are compared with `tol=1e-2`.
    """
    import transformers
    from packaging.version import Version

    required = Version("4.57.0.dev0")
    installed = Version(transformers.__version__)
    if required > installed:
        skip_reason = ("Encoder models with the Transformers backend "
                       f"require transformers>={required}, "
                       f"but got {installed}")
        pytest.skip(skip_reason)

    with vllm_runner(model, max_model_len=512,
                     model_impl="transformers") as vllm_model:
        # Make sure the backend under test is the one actually in use.
        engine_config = vllm_model.llm.llm_engine.model_config
        assert engine_config.using_transformers_backend()

        vllm_embeddings = vllm_model.embed(example_prompts)

    with hf_runner(model, is_sentence_transformer=True) as hf_model:
        hf_embeddings = hf_model.encode(example_prompts)

    check_embeddings_close(
        embeddings_0_lst=hf_embeddings,
        embeddings_1_lst=vllm_embeddings,
        name_0="hf",
        name_1="vllm",
        tol=1e-2,
    )


206
207
208
209
@pytest.mark.parametrize(
    "model",
    ["jason9693/Qwen2.5-1.5B-apeach"],
)
@pytest.mark.parametrize("dtype", ["float"])
def test_classify(
    hf_runner,
    vllm_runner,
    example_prompts,
    model: str,
    dtype: str,
) -> None:
    """Compare Transformers-backend classification with the HF runner's.

    vLLM is run with `model_impl="transformers"` and the HF runner with
    `AutoModelForSequenceClassification`. Per-prompt outputs are compared
    with `torch.allclose`, using a relative tolerance of 1e-3 for
    `dtype == "float"` and 1e-2 otherwise.
    """
    import torch
    from transformers import AutoModelForSequenceClassification

    with vllm_runner(model,
                     max_model_len=512,
                     dtype=dtype,
                     model_impl="transformers") as vllm_model:
        model_config = vllm_model.llm.llm_engine.model_config
        assert model_config.using_transformers_backend()

        vllm_outputs = vllm_model.classify(example_prompts)

    with hf_runner(model,
                   dtype=dtype,
                   auto_cls=AutoModelForSequenceClassification) as hf_model:
        hf_outputs = hf_model.classify(example_prompts)

    # `zip` stops at the shorter input, so a missing output on either side
    # would otherwise go unnoticed and the test would pass vacuously.
    assert len(hf_outputs) == len(vllm_outputs)

    rtol = 1e-3 if dtype == "float" else 1e-2
    for hf_output, vllm_output in zip(hf_outputs, vllm_outputs):
        hf_tensor = torch.tensor(hf_output)
        vllm_tensor = torch.tensor(vllm_output)

        assert torch.allclose(hf_tensor, vllm_tensor, rtol=rtol)