test_transformers.py 7.5 KB
Newer Older
1
# SPDX-License-Identifier: Apache-2.0
2
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
3
"""Test the functionality of the Transformers backend."""
4
5
from typing import Any, Optional, Union

6
7
import pytest

8
9
from vllm.platforms import current_platform

10
from ..conftest import HfRunner, VllmRunner
11
from ..utils import multi_gpu_test, prep_prompts
12
from .registry import HF_EXAMPLE_MODELS
13
from .utils import check_embeddings_close, check_logprobs_close
14
15


16
17
18
19
20
21
def get_model(arch: str) -> str:
    """Return the default example checkpoint registered for ``arch``.

    The registry entry's ``check_transformers_version`` is invoked with
    ``on_fail="skip"`` before the checkpoint name is handed back.
    """
    registry_entry = HF_EXAMPLE_MODELS.get_hf_info(arch)
    registry_entry.check_transformers_version(on_fail="skip")
    default_checkpoint = registry_entry.default
    return default_checkpoint


22
def check_implementation(
    runner_ref: type[Union[HfRunner, VllmRunner]],
    runner_test: type[VllmRunner],
    example_prompts: list[str],
    model: str,
    kwargs_ref: Optional[dict[str, Any]] = None,
    kwargs_test: Optional[dict[str, Any]] = None,
    **kwargs,
) -> None:
    """Compare greedy logprobs of a Transformers-backend run to a reference.

    Args:
        runner_ref: Runner class used to produce the reference outputs.
        runner_test: vLLM runner class used for the run under test.
        example_prompts: Prompts fed to both runners.
        model: Model name passed to both runners.
        kwargs_ref: Extra keyword arguments for ``runner_ref`` only.
        kwargs_test: Extra keyword arguments for ``runner_test`` only.
        **kwargs: Additional keyword arguments forwarded to ``runner_test``
            (not to ``runner_ref``).

    The run under test must report that it is using the Transformers
    backend; the two sets of outputs are then compared with
    ``check_logprobs_close``.
    """
    if kwargs_ref is None:
        kwargs_ref = {}
    if kwargs_test is None:
        kwargs_test = {}

    max_tokens = 32
    num_logprobs = 5

    args = (example_prompts, max_tokens, num_logprobs)

    # The run under test goes first and is closed before the reference
    # runner is created.
    with runner_test(model, **kwargs_test, **kwargs) as model_test:
        model_config = model_test.llm.llm_engine.model_config
        assert model_config.using_transformers_backend()

        outputs_test = model_test.generate_greedy_logprobs(*args)

    with runner_ref(model, **kwargs_ref) as model_ref:
        # Non-vLLM reference runners use the `_limit` variant of the call.
        if isinstance(model_ref, VllmRunner):
            outputs_ref = model_ref.generate_greedy_logprobs(*args)
        else:
            outputs_ref = model_ref.generate_greedy_logprobs_limit(*args)

    check_logprobs_close(
        outputs_0_lst=outputs_ref,
        outputs_1_lst=outputs_test,
        name_0="ref",
        name_1="test",
    )


61
62
63
@pytest.mark.skipif(
    current_platform.is_rocm(),
    reason="Llama-3.2-1B-Instruct, Ilama-3.2-1B produce memory access fault.")
@pytest.mark.parametrize(
    "model,model_impl",
    [
        ("meta-llama/Llama-3.2-1B-Instruct", "transformers"),
        ("hmellor/Ilama-3.2-1B", "auto"),  # CUSTOM CODE
    ])  # trust_remote_code=True by default
def test_models(
    hf_runner: type[HfRunner],
    vllm_runner: type[VllmRunner],
    example_prompts: list[str],
    model: str,
    model_impl: str,
) -> None:
    """Check each parametrized model against the HF runner reference.

    ``model_impl`` is forwarded to the vLLM runner. The whole test is
    skipped when the current platform is ROCm.
    """
    check_implementation(hf_runner,
                         vllm_runner,
                         example_prompts,
                         model,
                         model_impl=model_impl)
82
83


84
85
86
87
88
89
90
91
92
93
94
95
def test_hybrid_attention(vllm_runner: type[VllmRunner]) -> None:
    """Compare the Transformers backend with vLLM's default on a Gemma2 model.

    Both runs use the vLLM runner with the same engine settings; only the
    run under test additionally sets ``model_impl="transformers"``.
    """
    # Only the first element returned by prep_prompts is used here.
    long_prompts, _, _ = prep_prompts(4, (800, 801))

    engine_kwargs = {"max_model_len": 8192, "enforce_eager": True}
    transformers_kwargs = {"model_impl": "transformers", **engine_kwargs}

    check_implementation(
        vllm_runner,
        vllm_runner,
        long_prompts,
        model="hmellor/tiny-random-Gemma2ForCausalLM",
        kwargs_ref=engine_kwargs,
        kwargs_test=transformers_kwargs,
    )


96
97
@multi_gpu_test(num_gpus=2)
def test_distributed(
    hf_runner: type[HfRunner],
    vllm_runner: type[VllmRunner],
    example_prompts: list[str],
) -> None:
    """Check the Transformers backend with ``tensor_parallel_size=2``.

    The HF runner provides the reference outputs; the vLLM runner is started
    with ``model_impl="transformers"`` and two-way tensor parallelism.
    """
    kwargs = {"model_impl": "transformers", "tensor_parallel_size": 2}
    check_implementation(hf_runner,
                         vllm_runner,
                         example_prompts,
                         "meta-llama/Llama-3.2-1B-Instruct",
                         kwargs_test=kwargs)
108
109
110


@pytest.mark.parametrize("model, quantization_kwargs", [
    ("TheBloke/TinyLlama-1.1B-Chat-v0.3-AWQ", {}),
    ("TheBloke/TinyLlama-1.1B-Chat-v0.3-GPTQ", {}),
    (
        "meta-llama/Llama-3.2-1B-Instruct",
        {
            "quantization": "bitsandbytes",
        },
    ),
])
@pytest.mark.parametrize("max_tokens", [32])
@pytest.mark.parametrize("num_logprobs", [5])
def test_quantization(
    vllm_runner: type[VllmRunner],
    example_prompts: list[str],
    model: str,
    quantization_kwargs: dict[str, str],
    max_tokens: int,
    num_logprobs: int,
) -> None:
    """Compare quantized models under ``model_impl`` "auto" vs "transformers".

    Each model is run twice through the vLLM runner with identical settings
    apart from ``model_impl``; the greedy logprobs of the two runs are then
    compared with ``check_logprobs_close``.
    """
    if (current_platform.is_rocm()
            and quantization_kwargs.get("quantization", "") == "bitsandbytes"):
        pytest.skip(
            "bitsandbytes quantization is currently not supported in rocm.")

    # Settings shared by both runs; only `model_impl` differs between them.
    common_kwargs: dict[str, Any] = {
        "enforce_eager": True,
        **quantization_kwargs,
    }

    with vllm_runner(model, model_impl="auto",
                     **common_kwargs) as vllm_model:
        vllm_outputs = vllm_model.generate_greedy_logprobs(
            example_prompts, max_tokens=max_tokens, num_logprobs=num_logprobs)

    with vllm_runner(model, model_impl="transformers",
                     **common_kwargs) as vllm_model:
        model_config = vllm_model.llm.llm_engine.model_config
        assert model_config.using_transformers_backend()

        transformers_outputs = vllm_model.generate_greedy_logprobs(
            example_prompts, max_tokens=max_tokens, num_logprobs=num_logprobs)

    check_logprobs_close(
        outputs_0_lst=transformers_outputs,
        outputs_1_lst=vllm_outputs,
        name_0="transformers",
        name_1="vllm",
    )
158
159


160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
@pytest.mark.parametrize(
    "model",
    [
        # Layers live in `layers`
        "Qwen/Qwen3-Embedding-0.6B",
        # Layers live in `model.layers`
        "meta-llama/Llama-3.2-1B-Instruct"
    ],
)
def test_embed_loading(vllm_runner, model):
    """Start each model as a pooling runner on the Transformers backend.

    The only assertion is that the resulting model config reports the
    Transformers backend; no inference is run.
    """
    runner_kwargs = dict(
        max_model_len=1024,
        enforce_eager=True,
        runner="pooling",
        model_impl="transformers",
    )
    with vllm_runner(model, **runner_kwargs) as model_test:
        engine = model_test.llm.llm_engine
        assert engine.model_config.using_transformers_backend()


179
@pytest.mark.parametrize(
    "arch",
    ["TransformersEmbeddingModel", "TransformersForSequenceClassification"])
def test_pooling(hf_runner, vllm_runner, example_prompts, arch):
    """Compare pooling outputs of the Transformers backend with HF.

    For ``TransformersEmbeddingModel`` the vLLM ``embed`` output is compared
    with the HF runner's ``encode`` output; for
    ``TransformersForSequenceClassification`` both sides use ``classify``.

    Raises:
        ValueError: If ``arch`` is not one of the two handled architectures.
    """
    model = get_model(arch)

    vllm_kwargs = dict(
        max_model_len=None,
        model_impl="transformers",
        compilation_config=dict(cudagraph_capture_sizes=[8]),
    )

    hf_kwargs: dict[str, Any] = {}
    if arch == "TransformersEmbeddingModel":
        hf_kwargs["is_sentence_transformer"] = True
    elif arch == "TransformersForSequenceClassification":
        from transformers import AutoModelForSequenceClassification
        hf_kwargs["auto_cls"] = AutoModelForSequenceClassification
    else:
        # Fail loudly instead of hitting an unbound local further down.
        raise ValueError(f"Unsupported pooling architecture: {arch}")

    # The example_prompts has ending "\n", for example:
    # "Write a short story about a robot that dreams for the first time.\n"
    # sentence_transformers will strip the input texts, see:
    # https://github.com/UKPLab/sentence-transformers/blob/v3.1.1/sentence_transformers/models/Transformer.py#L159
    # This makes the input_ids different between hf_model and vllm_model.
    # So we need to strip the input texts to avoid test failing.
    example_prompts = [str(s).strip() for s in example_prompts]

    with (vllm_runner(model, **vllm_kwargs) as vllm_model,
          hf_runner(model, **hf_kwargs) as hf_model):
        model_config = vllm_model.llm.llm_engine.model_config
        assert model_config.using_transformers_backend()

        if arch == "TransformersEmbeddingModel":
            vllm_outputs = vllm_model.embed(example_prompts)
            hf_outputs = hf_model.encode(example_prompts)
        else:
            # Only the sequence-classification arch reaches this branch;
            # anything else was rejected above.
            vllm_outputs = vllm_model.classify(example_prompts)
            hf_outputs = hf_model.classify(example_prompts)

    check_embeddings_close(
        embeddings_0_lst=hf_outputs,
        embeddings_1_lst=vllm_outputs,
        name_0="hf",
        name_1="vllm",
    )