test_transformers.py 7.91 KB
Newer Older
1
# SPDX-License-Identifier: Apache-2.0
2
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
3
"""Test the functionality of the Transformers modeling backend."""
4

5
from typing import Any
6

7

zhuwenwen's avatar
zhuwenwen committed
8
import os
9
10
import pytest

11
12
from vllm.platforms import current_platform

13
from ..conftest import HfRunner, VllmRunner
14
from ..utils import multi_gpu_test, prep_prompts, models_path_prefix
15
from .registry import HF_EXAMPLE_MODELS
16
from .utils import check_embeddings_close, check_logprobs_close
17
18


19
20
21
22
23
24
def get_model(arch: str) -> str:
    """Return the default example model registered for ``arch``.

    The entry is looked up in ``HF_EXAMPLE_MODELS``. Its Transformers version
    check is invoked with ``on_fail="skip"`` before the default is returned
    (presumably skipping the calling test on an unsuitable version - confirm
    against the registry implementation).
    """
    hf_info = HF_EXAMPLE_MODELS.get_hf_info(arch)
    hf_info.check_transformers_version(on_fail="skip")
    return hf_info.default


25
def check_implementation(
    runner_ref: type[HfRunner | VllmRunner],
    runner_test: type[VllmRunner],
    example_prompts: list[str],
    model: str,
    kwargs_ref: dict[str, Any] | None = None,
    kwargs_test: dict[str, Any] | None = None,
    **kwargs,
):
    """Compare greedy logprobs of a runner under test against a reference.

    The runner under test is executed first and must report that it is using
    the Transformers backend. The reference runner is executed afterwards and
    both sets of outputs are handed to ``check_logprobs_close``.

    Args:
        runner_ref: Runner class producing the reference outputs.
        runner_test: Runner class producing the outputs under test.
        example_prompts: Prompts to generate from.
        model: Model name or path passed to both runners.
        kwargs_ref: Extra keyword arguments for the reference runner only.
        kwargs_test: Extra keyword arguments for the runner under test only.
        **kwargs: Additional keyword arguments forwarded to the runner under
            test (not to the reference runner).
    """
    ref_options = {} if kwargs_ref is None else kwargs_ref
    test_options = {} if kwargs_test is None else kwargs_test

    # Generation settings shared by both runners.
    max_tokens = 32
    num_logprobs = 5
    generation_args = (example_prompts, max_tokens, num_logprobs)

    with runner_test(model, **test_options, **kwargs) as test_runner:
        backend_config = test_runner.llm.llm_engine.model_config
        assert backend_config.using_transformers_backend()

        test_outputs = test_runner.generate_greedy_logprobs(*generation_args)

    with runner_ref(model, **ref_options) as ref_runner:
        # Non-vLLM reference runners use the ``_limit`` variant instead.
        generate = (
            ref_runner.generate_greedy_logprobs
            if isinstance(ref_runner, VllmRunner)
            else ref_runner.generate_greedy_logprobs_limit
        )
        ref_outputs = generate(*generation_args)

    check_logprobs_close(
        outputs_0_lst=ref_outputs,
        outputs_1_lst=test_outputs,
        name_0="ref",
        name_1="test",
    )


@pytest.mark.parametrize(
    "model,model_impl",
    [
        (
            os.path.join(models_path_prefix, "meta-llama/Llama-3.2-1B-Instruct"),
            "transformers",
        ),
        (os.path.join(models_path_prefix, "hmellor/Ilama-3.2-1B"), "auto"),  # CUSTOM CODE
        (
            os.path.join(models_path_prefix, "allenai/OLMoE-1B-7B-0924"),
            "transformers",
        ),  # MoE
    ],
)  # trust_remote_code=True by default
def test_models(
    hf_runner: type[HfRunner],
    vllm_runner: type[VllmRunner],
    example_prompts: list[str],
    model: str,
    model_impl: str,
) -> None:
    """Check the Transformers backend against the HF reference for each model.

    Args:
        hf_runner: Reference runner class (fixture).
        vllm_runner: Runner class under test (fixture).
        example_prompts: Prompts to generate from (fixture).
        model: Model path, built as ``models_path_prefix`` joined with the
            repository id.
        model_impl: Value forwarded to the runner under test as
            ``model_impl``.
    """
    import transformers
    from packaging.version import Version

    installed = Version(transformers.__version__)
    required = Version("5.0.0")
    # `model` carries the `models_path_prefix`, so an equality check against
    # the bare repository id would never match when a prefix is configured.
    # Match on the repository id suffix so the skip actually takes effect.
    is_olmoe = model.endswith("allenai/OLMoE-1B-7B-0924")
    if is_olmoe and installed < required:
        pytest.skip(
            "MoE models with the Transformers modeling backend require "
            f"transformers>={required}, but got {installed}"
        )

    check_implementation(
        hf_runner, vllm_runner, example_prompts, model, model_impl=model_impl
    )
93
94


95
96
97
98
def test_hybrid_attention(vllm_runner: type[VllmRunner]) -> None:
    """Compare vLLM with and without ``model_impl="transformers"`` on Gemma2.

    Both the reference and the test run use the vLLM runner with the same
    engine settings; only the test run forces the Transformers implementation.
    """
    prompts, _, _ = prep_prompts(4, (800, 801))

    shared_engine_kwargs = {"max_model_len": 8192, "enforce_eager": True}
    transformers_kwargs = {"model_impl": "transformers"} | shared_engine_kwargs
    model_path = os.path.join(
        models_path_prefix, "hmellor/tiny-random-Gemma2ForCausalLM"
    )

    check_implementation(
        vllm_runner,
        vllm_runner,
        prompts,
        model=model_path,
        kwargs_ref=shared_engine_kwargs,
        kwargs_test=transformers_kwargs,
    )
107
108


109
110
@multi_gpu_test(num_gpus=2)
def test_distributed(
    hf_runner: type[HfRunner],
    vllm_runner: type[VllmRunner],
    example_prompts,
):
    """Check the Transformers backend with ``tensor_parallel_size=2``.

    The HF runner provides the reference outputs; the vLLM runner under test
    is started with the Transformers implementation and two-way tensor
    parallelism.
    """
    model_path = os.path.join(
        models_path_prefix, "meta-llama/Llama-3.2-1B-Instruct"
    )
    tensor_parallel_kwargs = {
        "model_impl": "transformers",
        "tensor_parallel_size": 2,
    }

    check_implementation(
        hf_runner,
        vllm_runner,
        example_prompts,
        model_path,
        kwargs_test=tensor_parallel_kwargs,
    )
123
124


125
126
127
@pytest.mark.parametrize(
    "model, quantization_kwargs",
    [
        (
            os.path.join(models_path_prefix, "TheBloke/TinyLlama-1.1B-Chat-v0.3-AWQ"),
            {},
        ),
        (
            os.path.join(models_path_prefix, "TheBloke/TinyLlama-1.1B-Chat-v0.3-GPTQ"),
            {},
        ),
        (
            os.path.join(models_path_prefix, "meta-llama/Llama-3.2-1B-Instruct"),
            {"quantization": "bitsandbytes"},
        ),
    ],
)
@pytest.mark.parametrize("max_tokens", [32])
@pytest.mark.parametrize("num_logprobs", [5])
def test_quantization(
    vllm_runner: type[VllmRunner],
    example_prompts: list[str],
    model: str,
    quantization_kwargs: dict[str, str],
    max_tokens: int,
    num_logprobs: int,
) -> None:
    """Compare quantized models run with ``model_impl`` "auto" vs "transformers".

    Each model is run twice with the same quantization settings, first with
    ``model_impl="auto"`` and then with ``model_impl="transformers"``. The
    greedy logprobs of both runs are passed to ``check_logprobs_close``.
    """
    on_rocm = current_platform.is_rocm()
    wants_bitsandbytes = quantization_kwargs.get("quantization", "") == "bitsandbytes"
    if on_rocm and wants_bitsandbytes:
        pytest.skip("bitsandbytes quantization is currently not supported in rocm.")

    def greedy_logprobs(model_impl: str, expect_transformers: bool):
        """Run ``model`` with ``model_impl`` and return its greedy logprobs."""
        with vllm_runner(
            model,
            model_impl=model_impl,
            enforce_eager=True,
            **quantization_kwargs,  # type: ignore[arg-type]
        ) as runner:
            if expect_transformers:
                backend_config = runner.llm.llm_engine.model_config
                assert backend_config.using_transformers_backend()

            return runner.generate_greedy_logprobs(
                example_prompts, max_tokens=max_tokens, num_logprobs=num_logprobs
            )

    # Order matters for parity with the original: "auto" runs first.
    vllm_outputs = greedy_logprobs("auto", expect_transformers=False)
    transformers_outputs = greedy_logprobs("transformers", expect_transformers=True)

    check_logprobs_close(
        outputs_0_lst=transformers_outputs,
        outputs_1_lst=vllm_outputs,
        name_0="transformers",
        name_1="vllm",
    )
183
184


185
186
187
188
@pytest.mark.parametrize(
    "model",
    [
        # Layers live in `layers`
        os.path.join(models_path_prefix, "Qwen/Qwen3-Embedding-0.6B"),
        # Layers live in `model.layers`
        os.path.join(models_path_prefix, "meta-llama/Llama-3.2-1B-Instruct"),
    ],
)
def test_embed_loading(vllm_runner, model):
    """Load ``model`` as a pooling runner and assert the Transformers backend.

    No generation is performed; the test passes once the runner has been
    constructed and its model config reports the Transformers backend.
    """
    pooling_kwargs = {
        "max_model_len": 1024,
        "enforce_eager": True,
        "runner": "pooling",
        "model_impl": "transformers",
    }
    with vllm_runner(model, **pooling_kwargs) as pooling_model:
        backend_config = pooling_model.llm.llm_engine.model_config
        assert backend_config.using_transformers_backend()


206
@pytest.mark.parametrize(
    "arch",
    ["TransformersEmbeddingModel", "TransformersForSequenceClassification"],
)
def test_pooling(hf_runner, vllm_runner, example_prompts, arch):
    """Compare pooling outputs of the Transformers backend against HF.

    For the embedding architecture the outputs of ``embed`` / ``encode`` are
    compared; for the sequence classification architecture the outputs of
    ``classify`` on both runners are compared.
    """
    model = get_model(arch)

    vllm_kwargs = {"max_model_len": None, "model_impl": "transformers"}

    hf_kwargs = {}
    if arch == "TransformersEmbeddingModel":
        hf_kwargs["is_sentence_transformer"] = True
    elif arch == "TransformersForSequenceClassification":
        from transformers import AutoModelForSequenceClassification

        hf_kwargs["auto_cls"] = AutoModelForSequenceClassification

    # Each entry of example_prompts ends with "\n", e.g.
    # "Write a short story about a robot that dreams for the first time.\n"
    # sentence_transformers strips its input texts, see:
    # https://github.com/UKPLab/sentence-transformers/blob/v3.1.1/sentence_transformers/models/Transformer.py#L159
    # Without stripping here as well, hf_model and vllm_model would see
    # different input_ids and the comparison would fail.
    example_prompts = [str(prompt).strip() for prompt in example_prompts]

    # Enter order matches the original: vLLM runner first, then HF runner.
    with vllm_runner(model, **vllm_kwargs) as vllm_model:
        with hf_runner(model, **hf_kwargs) as hf_model:
            backend_config = vllm_model.llm.llm_engine.model_config
            assert backend_config.using_transformers_backend()

            if arch == "TransformersEmbeddingModel":
                vllm_outputs = vllm_model.embed(example_prompts)
                hf_outputs = hf_model.encode(example_prompts)
            elif arch == "TransformersForSequenceClassification":
                vllm_outputs = vllm_model.classify(example_prompts)
                hf_outputs = hf_model.classify(example_prompts)

    check_embeddings_close(
        embeddings_0_lst=hf_outputs,
        embeddings_1_lst=vllm_outputs,
        name_0="hf",
        name_1="vllm",
    )