"docs/vscode:/vscode.git/clone" did not exist on "2e2000f352d861dd2b527a48c5f12d295a93c3dd"
test_transformers.py 7.41 KB
Newer Older
1
# SPDX-License-Identifier: Apache-2.0
2
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
3
"""Test the functionality of the Transformers backend."""
4
5
from typing import Any, Optional, Union

6
7
import pytest

8
9
from vllm.platforms import current_platform

10
from ..conftest import HfRunner, VllmRunner
11
from ..utils import multi_gpu_test, prep_prompts
12
from .utils import check_embeddings_close, check_logprobs_close
13
14
15


def check_implementation(
    runner_ref: type[Union[HfRunner, VllmRunner]],
    runner_test: type[VllmRunner],
    example_prompts: list[str],
    model: str,
    kwargs_ref: Optional[dict[str, Any]] = None,
    kwargs_test: Optional[dict[str, Any]] = None,
    **kwargs,
) -> None:
    """Check that a test runner's greedy logprobs match a reference runner's.

    The test runner is executed first and must report that it is using the
    Transformers backend; the reference runner is executed afterwards and
    the two sets of outputs are handed to ``check_logprobs_close``.

    Args:
        runner_ref: Runner class producing the reference outputs.
        runner_test: Runner class under test.
        example_prompts: Prompts passed to both runners.
        model: Model identifier passed as the first argument to both runners.
        kwargs_ref: Extra keyword arguments for ``runner_ref`` only.
        kwargs_test: Extra keyword arguments for ``runner_test`` only.
        **kwargs: Additional keyword arguments forwarded to ``runner_test``
            only. A key present in both ``kwargs_test`` and ``kwargs``
            raises ``TypeError`` (duplicate keyword argument).
    """
    if kwargs_ref is None:
        kwargs_ref = {}
    if kwargs_test is None:
        kwargs_test = {}

    max_tokens = 32
    num_logprobs = 5

    # Same positional arguments for every generate call below.
    args = (example_prompts, max_tokens, num_logprobs)

    with runner_test(model, **kwargs_test, **kwargs) as model_test:
        model_config = model_test.llm.llm_engine.model_config
        assert model_config.using_transformers_backend()

        outputs_test = model_test.generate_greedy_logprobs(*args)

    with runner_ref(model, **kwargs_ref) as model_ref:
        # The two runner types expose the logprob generation under
        # different method names.
        if isinstance(model_ref, VllmRunner):
            outputs_ref = model_ref.generate_greedy_logprobs(*args)
        else:
            outputs_ref = model_ref.generate_greedy_logprobs_limit(*args)

    check_logprobs_close(
        outputs_0_lst=outputs_ref,
        outputs_1_lst=outputs_test,
        name_0="ref",
        name_1="test",
    )


54
55
56
@pytest.mark.skipif(
    current_platform.is_rocm(),
    reason="Llama-3.2-1B-Instruct, Ilama-3.2-1B produce memory access fault.")
@pytest.mark.parametrize(
    "model,model_impl",
    [
        ("meta-llama/Llama-3.2-1B-Instruct", "transformers"),
        ("hmellor/Ilama-3.2-1B", "auto"),  # CUSTOM CODE
    ])  # trust_remote_code=True by default
def test_models(
    hf_runner: type[HfRunner],
    vllm_runner: type[VllmRunner],
    example_prompts: list[str],
    model: str,
    model_impl: str,
) -> None:
    """Compare the vLLM runner against the HF runner for each model.

    ``hf_runner`` is the reference and ``vllm_runner`` is the runner under
    test; ``model_impl`` is forwarded to the test runner only. The actual
    comparison (including the Transformers-backend assertion) is done by
    ``check_implementation``. Skipped on ROCm.
    """
    check_implementation(hf_runner,
                         vllm_runner,
                         example_prompts,
                         model,
                         model_impl=model_impl)
75
76


77
78
79
80
81
82
83
84
85
86
87
88
def test_hybrid_attention(vllm_runner: type[VllmRunner]) -> None:
    """Compare two vLLM runs of a tiny Gemma2 model against each other.

    Both the reference and the test run use ``vllm_runner`` with the same
    engine settings; the test run additionally sets
    ``model_impl="transformers"``.
    """
    # NOTE(review): presumably 4 prompts with lengths drawn from
    # (800, 801) -- confirm against prep_prompts.
    long_prompts, _, _ = prep_prompts(4, (800, 801))

    shared_settings = dict(max_model_len=8192, enforce_eager=True)
    transformers_settings = dict(model_impl="transformers",
                                 **shared_settings)

    check_implementation(
        vllm_runner,
        vllm_runner,
        long_prompts,
        model="hmellor/tiny-random-Gemma2ForCausalLM",
        kwargs_ref=shared_settings,
        kwargs_test=transformers_settings,
    )


89
90
@multi_gpu_test(num_gpus=2)
def test_distributed(
    hf_runner: type[HfRunner],
    vllm_runner: type[VllmRunner],
    example_prompts: list[str],
) -> None:
    """Compare a tensor-parallel vLLM run against the HF runner.

    The test runner is created with ``model_impl="transformers"`` and
    ``tensor_parallel_size=2``; the HF runner receives no extra arguments.
    """
    kwargs = {"model_impl": "transformers", "tensor_parallel_size": 2}
    check_implementation(hf_runner,
                         vllm_runner,
                         example_prompts,
                         "meta-llama/Llama-3.2-1B-Instruct",
                         kwargs_test=kwargs)
101
102


103
104
105
@pytest.mark.skipif(
    current_platform.is_rocm(),
    reason="bitsandbytes quantization is currently not supported in rocm.")
@pytest.mark.parametrize("model, quantization_kwargs", [
    (
        "meta-llama/Llama-3.2-1B-Instruct",
        {
            "quantization": "bitsandbytes",
        },
    ),
])
@pytest.mark.parametrize("max_tokens", [32])
@pytest.mark.parametrize("num_logprobs", [5])
def test_quantization(
    vllm_runner: type[VllmRunner],
    example_prompts: list[str],
    model: str,
    quantization_kwargs: dict[str, str],
    max_tokens: int,
    num_logprobs: int,
) -> None:
    """Compare quantized outputs for ``model_impl`` "auto" vs "transformers".

    The same model and quantization settings are run twice with
    ``enforce_eager=True``: once with ``model_impl="auto"`` and once with
    ``model_impl="transformers"``. The second run must report that it uses
    the Transformers backend. Greedy logprobs of the two runs are then
    compared with ``check_logprobs_close``. Skipped on ROCm.
    """
    with vllm_runner(
            model, model_impl="auto", enforce_eager=True,
            **quantization_kwargs) as vllm_model:  # type: ignore[arg-type]
        vllm_outputs = vllm_model.generate_greedy_logprobs(
            example_prompts, max_tokens=max_tokens, num_logprobs=num_logprobs)

    with vllm_runner(
            model,
            model_impl="transformers",
            enforce_eager=True,
            **quantization_kwargs  # type: ignore[arg-type]
    ) as transformers_model:
        model_config = transformers_model.llm.llm_engine.model_config
        assert model_config.using_transformers_backend()

        transformers_outputs = transformers_model.generate_greedy_logprobs(
            example_prompts, max_tokens=max_tokens, num_logprobs=num_logprobs)

    check_logprobs_close(
        outputs_0_lst=transformers_outputs,
        outputs_1_lst=vllm_outputs,
        name_0="transformers",
        name_1="vllm",
    )
147
148


149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
@pytest.mark.parametrize(
    "model",
    [
        # Layers live in `layers`
        "Qwen/Qwen3-Embedding-0.6B",
        # Layers live in `model.layers`
        "meta-llama/Llama-3.2-1B-Instruct"
    ],
)
def test_embed_loading(vllm_runner, model):
    """Load each model as a pooling runner with the Transformers backend.

    Nothing is generated; the test passes once the runner has been created
    and its model config reports the Transformers backend.
    """
    runner_settings = dict(
        max_model_len=1024,
        enforce_eager=True,
        runner="pooling",
        model_impl="transformers",
    )
    with vllm_runner(model, **runner_settings) as pooling_model:
        loaded_config = pooling_model.llm.llm_engine.model_config
        assert loaded_config.using_transformers_backend()


168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
@pytest.mark.parametrize(
    "model",
    [
        # Encoder model
        "BAAI/bge-base-en-v1.5",
    ])
def test_embed_correctness(hf_runner, vllm_runner, example_prompts, model):
    """Compare Transformers-backend embeddings with the HF runner's.

    Skipped when the installed ``transformers`` is older than 4.57.0.dev0.
    The vLLM runner (``model_impl="transformers"``) embeds the prompts
    first; the HF runner, created with ``is_sentence_transformer=True``,
    then encodes the same prompts, and the results are compared with
    ``check_embeddings_close`` at ``tol=1e-2``.
    """
    import transformers
    from packaging.version import Version

    installed = Version(transformers.__version__)
    required = Version("4.57.0.dev0")
    if installed < required:
        skip_reason = (
            "Encoder models with the Transformers backend require "
            f"transformers>={required}, but got {installed}")
        pytest.skip(skip_reason)

    vllm_settings = dict(max_model_len=512, model_impl="transformers")
    with vllm_runner(model, **vllm_settings) as backend_model:
        backend_config = backend_model.llm.llm_engine.model_config
        assert backend_config.using_transformers_backend()

        vllm_outputs = backend_model.embed(example_prompts)

    with hf_runner(model, is_sentence_transformer=True) as reference_model:
        hf_outputs = reference_model.encode(example_prompts)

    check_embeddings_close(
        embeddings_0_lst=hf_outputs,
        embeddings_1_lst=vllm_outputs,
        name_0="hf",
        name_1="vllm",
        tol=1e-2,
    )


202
203
204
205
@pytest.mark.parametrize(
    "model",
    ["jason9693/Qwen2.5-1.5B-apeach"],
)
@pytest.mark.parametrize("dtype", ["float"])
def test_classify(
    hf_runner,
    vllm_runner,
    example_prompts,
    model: str,
    dtype: str,
) -> None:
    """Compare Transformers-backend classification outputs with HF's.

    The vLLM runner (``model_impl="transformers"``) classifies the prompts
    first and must report the Transformers backend. The HF runner, created
    with ``AutoModelForSequenceClassification``, then classifies the same
    prompts. Each pair of outputs must be close under ``torch.allclose``
    with a relative tolerance of 1e-3 for ``dtype == "float"`` and 1e-2
    otherwise; the absolute tolerance is left at torch's default.
    """
    import torch
    from transformers import AutoModelForSequenceClassification

    with vllm_runner(model,
                     max_model_len=512,
                     dtype=dtype,
                     model_impl="transformers") as vllm_model:
        model_config = vllm_model.llm.llm_engine.model_config
        assert model_config.using_transformers_backend()

        vllm_outputs = vllm_model.classify(example_prompts)

    with hf_runner(model,
                   dtype=dtype,
                   auto_cls=AutoModelForSequenceClassification) as hf_model:
        hf_outputs = hf_model.classify(example_prompts)

    # Loop-invariant; the third positional argument of torch.allclose is
    # rtol, so naming it keeps the comparison identical but explicit.
    rtol = 1e-3 if dtype == "float" else 1e-2

    for hf_output, vllm_output in zip(hf_outputs, vllm_outputs):
        hf_output = torch.tensor(hf_output)
        vllm_output = torch.tensor(vllm_output)

        assert torch.allclose(hf_output, vllm_output, rtol=rtol)