test_transformers.py 7.73 KB
Newer Older
1
# SPDX-License-Identifier: Apache-2.0
2
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
3
"""Test the functionality of the Transformers backend."""
4
5
from typing import Any, Optional, Union

6

zhuwenwen's avatar
zhuwenwen committed
7
import os
8
9
import pytest

10
11
from vllm.platforms import current_platform

12
from ..conftest import HfRunner, VllmRunner
13
from ..utils import multi_gpu_test, prep_prompts, models_path_prefix
14
from .utils import check_embeddings_close, check_logprobs_close
15
16
17


def check_implementation(
    runner_ref: type[Union[HfRunner, VllmRunner]],
    runner_test: type[VllmRunner],
    example_prompts: list[str],
    model: str,
    kwargs_ref: Optional[dict[str, Any]] = None,
    kwargs_test: Optional[dict[str, Any]] = None,
    **kwargs,
):
    """Compare greedy logprobs from a test runner against a reference runner.

    The test runner is executed first and must report that it is using the
    Transformers backend; the reference runner is executed afterwards. Both
    generate with ``max_tokens=32`` and ``num_logprobs=5`` and the two result
    sets are handed to ``check_logprobs_close``.

    Args:
        runner_ref: Runner class producing the reference outputs.
        runner_test: Runner class under test.
        example_prompts: Prompts fed to both runners.
        model: Model name or path passed to both runners.
        kwargs_ref: Extra keyword arguments for the reference runner only.
        kwargs_test: Extra keyword arguments for the test runner only.
        **kwargs: Additional keyword arguments forwarded to the test runner
            only (they are not passed to the reference runner).
    """
    ref_options = {} if kwargs_ref is None else kwargs_ref
    test_options = {} if kwargs_test is None else kwargs_test

    max_tokens, num_logprobs = 32, 5
    generate_args = (example_prompts, max_tokens, num_logprobs)

    with runner_test(model, **test_options, **kwargs) as model_test:
        engine_config = model_test.llm.llm_engine.model_config
        assert engine_config.using_transformers_backend()

        outputs_test = model_test.generate_greedy_logprobs(*generate_args)

    with runner_ref(model, **ref_options) as model_ref:
        # A VllmRunner reference uses `generate_greedy_logprobs`; any other
        # reference runner uses `generate_greedy_logprobs_limit`.
        generate = (model_ref.generate_greedy_logprobs
                    if isinstance(model_ref, VllmRunner) else
                    model_ref.generate_greedy_logprobs_limit)
        outputs_ref = generate(*generate_args)

    check_logprobs_close(
        outputs_0_lst=outputs_ref,
        outputs_1_lst=outputs_test,
        name_0="ref",
        name_1="test",
    )


56
57
58
@pytest.mark.skipif(
    current_platform.is_rocm(),
    reason="Llama-3.2-1B-Instruct, Ilama-3.2-1B produce memory access fault.")
@pytest.mark.parametrize(
    "model,model_impl",
    [
        (
            os.path.join(models_path_prefix,
                         "meta-llama/Llama-3.2-1B-Instruct"),
            "transformers",
        ),
        (
            os.path.join(models_path_prefix, "hmellor/Ilama-3.2-1B"),
            "auto",
        ),  # CUSTOM CODE
    ])  # trust_remote_code=True by default
def test_models(
    hf_runner: type[HfRunner],
    vllm_runner: type[VllmRunner],
    example_prompts: list[str],
    model: str,
    model_impl: str,
) -> None:
    """Check each parametrized model against the HF runner as reference.

    ``model_impl`` is forwarded to the vLLM runner only; the comparison itself
    is done by ``check_implementation``.
    """
    check_implementation(
        runner_ref=hf_runner,
        runner_test=vllm_runner,
        example_prompts=example_prompts,
        model=model,
        model_impl=model_impl,
    )
77
78


79
80
81
82
83
84
85
def test_hybrid_attention(vllm_runner: type[VllmRunner]) -> None:
    """Compare the Transformers backend with a vLLM reference run.

    Both runs use the vLLM runner with ``max_model_len=8192`` and
    ``enforce_eager=True``; only the test run additionally sets
    ``model_impl="transformers"``.
    """
    prompts, _, _ = prep_prompts(4, (800, 801))

    model_path = os.path.join(models_path_prefix,
                              "hmellor/tiny-random-Gemma2ForCausalLM")
    reference_kwargs = {"max_model_len": 8192, "enforce_eager": True}
    transformers_kwargs = {"model_impl": "transformers", **reference_kwargs}

    check_implementation(
        vllm_runner,
        vllm_runner,
        prompts,
        model=model_path,
        kwargs_ref=reference_kwargs,
        kwargs_test=transformers_kwargs,
    )


91
92
@multi_gpu_test(num_gpus=2)
def test_distributed(
    hf_runner: type[HfRunner],
    vllm_runner: type[VllmRunner],
    example_prompts,
):
    """Check the Transformers backend with ``tensor_parallel_size=2``.

    The HF runner provides the reference outputs; the vLLM runner is started
    with ``model_impl="transformers"`` and ``tensor_parallel_size=2``.
    """
    model_path = os.path.join(models_path_prefix,
                              "meta-llama/Llama-3.2-1B-Instruct")
    test_options = {"model_impl": "transformers", "tensor_parallel_size": 2}

    check_implementation(
        hf_runner,
        vllm_runner,
        example_prompts,
        model_path,
        kwargs_test=test_options,
    )
103
104


105
106
107
@pytest.mark.skipif(
    current_platform.is_rocm(),
    reason="bitsandbytes quantization is currently not supported in rocm.")
@pytest.mark.parametrize("model, quantization_kwargs", [
    (
        os.path.join(models_path_prefix, "meta-llama/Llama-3.2-1B-Instruct"),
        {
            "quantization": "bitsandbytes",
        },
    ),
])
@pytest.mark.parametrize("max_tokens", [32])
@pytest.mark.parametrize("num_logprobs", [5])
def test_quantization(
    vllm_runner: type[VllmRunner],
    example_prompts: list[str],
    model: str,
    quantization_kwargs: dict[str, str],
    max_tokens: int,
    num_logprobs: int,
) -> None:
    """Compare quantized outputs of ``model_impl="auto"`` and "transformers".

    The model is run twice with the same quantization kwargs and
    ``enforce_eager=True``: once with ``model_impl="auto"`` and once with
    ``model_impl="transformers"`` (which must report the Transformers
    backend). The greedy logprobs of both runs are then compared.
    """
    # First pass: let vLLM pick the implementation.
    with vllm_runner(
            model,
            model_impl="auto",
            enforce_eager=True,
            **quantization_kwargs) as auto_model:  # type: ignore[arg-type]
        vllm_outputs = auto_model.generate_greedy_logprobs(
            example_prompts,
            max_tokens=max_tokens,
            num_logprobs=num_logprobs,
        )

    # Second pass: force the Transformers backend.
    with vllm_runner(
            model,
            model_impl="transformers",
            enforce_eager=True,
            **quantization_kwargs) as tf_model:  # type: ignore[arg-type]
        assert tf_model.llm.llm_engine.model_config.using_transformers_backend()

        transformers_outputs = tf_model.generate_greedy_logprobs(
            example_prompts,
            max_tokens=max_tokens,
            num_logprobs=num_logprobs,
        )

    check_logprobs_close(
        outputs_0_lst=transformers_outputs,
        outputs_1_lst=vllm_outputs,
        name_0="transformers",
        name_1="vllm",
    )
149
150


151
152
153
154
@pytest.mark.parametrize(
    "model",
    [
        # Layers live in `layers`
        os.path.join(models_path_prefix, "Qwen/Qwen3-Embedding-0.6B"),
        # Layers live in `model.layers`
        os.path.join(models_path_prefix, "meta-llama/Llama-3.2-1B-Instruct"),
    ],
)
def test_embed_loading(vllm_runner, model):
    """Load each model with ``runner="pooling"`` on the Transformers backend.

    The test only checks that the runner starts and that its model config
    reports the Transformers backend; no outputs are generated.
    """
    runner_options = {
        "max_model_len": 1024,
        "enforce_eager": True,
        "runner": "pooling",
        "model_impl": "transformers",
    }
    with vllm_runner(model, **runner_options) as model_test:
        assert model_test.llm.llm_engine.model_config.using_transformers_backend()


170
171
172
173
@pytest.mark.parametrize(
    "model",
    [
        # Encoder model
        os.path.join(models_path_prefix, "BAAI/bge-base-en-v1.5"),
    ])
def test_embed_correctness(hf_runner, vllm_runner, example_prompts, model):
    """Compare Transformers-backend embeddings against the HF runner.

    Skipped when the installed ``transformers`` is older than ``4.57.0.dev0``.
    The vLLM embeddings (``model_impl="transformers"``) are compared with the
    HF runner's ``encode`` output using a tolerance of ``1e-2``.
    """
    import transformers
    from packaging.version import Version

    required = Version("4.57.0.dev0")
    installed = Version(transformers.__version__)
    if installed < required:
        pytest.skip("Encoder models with the Transformers backend require "
                    f"transformers>={required}, but got {installed}")

    with vllm_runner(
            model,
            max_model_len=512,
            model_impl="transformers",
    ) as vllm_model:
        assert vllm_model.llm.llm_engine.model_config.using_transformers_backend()

        vllm_outputs = vllm_model.embed(example_prompts)

    with hf_runner(model, is_sentence_transformer=True) as hf_model:
        hf_outputs = hf_model.encode(example_prompts)

    check_embeddings_close(
        embeddings_0_lst=hf_outputs,
        embeddings_1_lst=vllm_outputs,
        name_0="hf",
        name_1="vllm",
        tol=1e-2,
    )


204
205
@pytest.mark.parametrize(
    "model",
    [os.path.join(models_path_prefix, "jason9693/Qwen2.5-1.5B-apeach")],
)
@pytest.mark.parametrize("dtype", ["float"])
def test_classify(
    hf_runner,
    vllm_runner,
    example_prompts,
    model: str,
    dtype: str,
) -> None:
    """Compare Transformers-backend classification output with the HF runner.

    The vLLM run must report the Transformers backend. Each pair of outputs
    is compared with ``torch.allclose`` using a relative tolerance of ``1e-3``
    when ``dtype == "float"`` and ``1e-2`` otherwise.
    """
    import torch
    from transformers import AutoModelForSequenceClassification

    with vllm_runner(
            model,
            max_model_len=512,
            dtype=dtype,
            model_impl="transformers",
    ) as vllm_model:
        assert vllm_model.llm.llm_engine.model_config.using_transformers_backend()

        vllm_outputs = vllm_model.classify(example_prompts)

    with hf_runner(
            model,
            dtype=dtype,
            auto_cls=AutoModelForSequenceClassification,
    ) as hf_model:
        hf_outputs = hf_model.classify(example_prompts)

    # The tolerance does not depend on the individual outputs, so compute it
    # once up front.
    rtol = 1e-3 if dtype == "float" else 1e-2
    for hf_output, vllm_output in zip(hf_outputs, vllm_outputs):
        expected = torch.tensor(hf_output)
        actual = torch.tensor(vllm_output)

        assert torch.allclose(expected, actual, rtol=rtol)