test_transformers.py 4.64 KB
Newer Older
1
# SPDX-License-Identifier: Apache-2.0
2
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
3
"""Test the functionality of the Transformers backend."""
4
5
from typing import Any, Optional, Union

6

zhuwenwen's avatar
zhuwenwen committed
7
import os
8
9
import pytest

10
11
from vllm.platforms import current_platform

12
from ..conftest import HfRunner, VllmRunner
13
from ..core.block.e2e.test_correctness_sliding_window import prep_prompts
zhuwenwen's avatar
zhuwenwen committed
14
from ..utils import multi_gpu_test, models_path_prefix
15
16
17
18
from .utils import check_logprobs_close


def check_implementation(
    runner_ref: type[Union[HfRunner, VllmRunner]],
    runner_test: type[VllmRunner],
    example_prompts: list[str],
    model: str,
    kwargs_ref: Optional[dict[str, Any]] = None,
    kwargs_test: Optional[dict[str, Any]] = None,
    **kwargs,
):
    """Generate with a test runner and a reference runner, then compare.

    The test runner is executed first, followed by the reference runner.
    Both generate greedily with 32 max tokens and 5 logprobs, and the two
    sets of outputs are handed to ``check_logprobs_close``.

    Args:
        runner_ref: Runner class that produces the reference outputs.
        runner_test: Runner class that produces the outputs under test.
        example_prompts: Prompts fed to both runners.
        model: Model identifier or path given to both runners.
        kwargs_ref: Extra keyword arguments for ``runner_ref`` only.
        kwargs_test: Extra keyword arguments for ``runner_test`` only.
        **kwargs: Additional keyword arguments forwarded to ``runner_test``
            only (they are not passed to ``runner_ref``).
    """
    ref_options = {} if kwargs_ref is None else kwargs_ref
    test_options = {} if kwargs_test is None else kwargs_test

    max_tokens = 32
    num_logprobs = 5

    with runner_test(model, **test_options, **kwargs) as model_test:
        outputs_test = model_test.generate_greedy_logprobs(
            example_prompts, max_tokens, num_logprobs)

    with runner_ref(model, **ref_options) as model_ref:
        # Non-vLLM reference runners use the ``_limit`` variant instead.
        generate = (model_ref.generate_greedy_logprobs if isinstance(
            model_ref, VllmRunner) else
                    model_ref.generate_greedy_logprobs_limit)
        outputs_ref = generate(example_prompts, max_tokens, num_logprobs)

    check_logprobs_close(
        outputs_0_lst=outputs_ref,
        outputs_1_lst=outputs_test,
        name_0="ref",
        name_1="test",
    )


54
55
56
@pytest.mark.skipif(
    current_platform.is_rocm(),
    reason="Llama-3.2-1B-Instruct, Ilama-3.2-1B produce memory access fault.")
@pytest.mark.parametrize(
    "model,model_impl",
    [
        (os.path.join(models_path_prefix, "meta-llama/Llama-3.2-1B-Instruct"),
         "transformers"),
        # NOTE: "auto" must be the second tuple element (the model_impl),
        # not an extra path component passed to os.path.join.
        (os.path.join(models_path_prefix, "ArthurZ/Ilama-3.2-1B"),
         "auto"),  # CUSTOM CODE
    ])  # trust_remote_code=True by default
def test_models(
    hf_runner: type[HfRunner],
    vllm_runner: type[VllmRunner],
    example_prompts: list[str],
    model: str,
    model_impl: str,
) -> None:
    """Compare vLLM outputs for ``model`` against the HF runner.

    ``hf_runner`` is the reference and ``vllm_runner`` is the runner under
    test; ``model_impl`` is forwarded to the vLLM runner only.

    Args:
        hf_runner: Reference runner class (fixture).
        vllm_runner: Runner class under test (fixture).
        example_prompts: Prompts to generate from (fixture).
        model: Model path built from ``models_path_prefix``.
        model_impl: Model implementation selector passed to vLLM.
    """
    check_implementation(hf_runner,
                         vllm_runner,
                         example_prompts,
                         model,
                         model_impl=model_impl)
75
76


77
78
79
80
81
82
83
def test_hybrid_attention(vllm_runner: type[VllmRunner]) -> None:
    """Compare the "transformers" model_impl against vLLM's default.

    Both sides use ``vllm_runner`` with the same engine options; only the
    runner under test additionally receives ``model_impl="transformers"``.
    """
    model_path = os.path.join(models_path_prefix,
                              "hmellor/tiny-random-Gemma2ForCausalLM")
    long_prompts, _, _ = prep_prompts(4, (800, 801))

    shared_options = {"max_model_len": 8192, "enforce_eager": True}
    transformers_options = {"model_impl": "transformers", **shared_options}

    check_implementation(
        vllm_runner,
        vllm_runner,
        long_prompts,
        model=model_path,
        kwargs_ref=shared_options,
        kwargs_test=transformers_options,
    )


89
90
@multi_gpu_test(num_gpus=2)
def test_distributed(
    hf_runner: type[HfRunner],
    vllm_runner: type[VllmRunner],
    example_prompts,
):
    """Compare the "transformers" model_impl at tensor_parallel_size=2.

    ``hf_runner`` provides the reference outputs; the vLLM runner under
    test is created with ``model_impl="transformers"`` and
    ``tensor_parallel_size=2``.
    """
    model_path = os.path.join(models_path_prefix,
                              "meta-llama/Llama-3.2-1B-Instruct")
    check_implementation(
        hf_runner,
        vllm_runner,
        example_prompts,
        model_path,
        kwargs_test={
            "model_impl": "transformers",
            "tensor_parallel_size": 2,
        },
    )
101
102


103
104
105
@pytest.mark.skipif(
    current_platform.is_rocm(),
    reason="bitsandbytes quantization is currently not supported in rocm.")
@pytest.mark.parametrize("model, quantization_kwargs", [
    (
        os.path.join(models_path_prefix, "meta-llama/Llama-3.2-1B-Instruct"),
        {
            "quantization": "bitsandbytes",
        },
    ),
])
@pytest.mark.parametrize("max_tokens", [32])
@pytest.mark.parametrize("num_logprobs", [5])
def test_quantization(
    vllm_runner: type[VllmRunner],
    example_prompts: list[str],
    model: str,
    quantization_kwargs: dict[str, str],
    max_tokens: int,
    num_logprobs: int,
) -> None:
    """Compare "auto" and "transformers" model_impl under quantization.

    The model is run twice through ``vllm_runner`` with identical
    quantization options and ``enforce_eager=True``: first with
    ``model_impl="auto"``, then with ``model_impl="transformers"``. The two
    outputs are passed to ``check_logprobs_close``.
    """
    outputs_by_impl: dict[str, Any] = {}

    # Order matters: "auto" is generated first, then "transformers".
    for impl in ("auto", "transformers"):
        with vllm_runner(
                model,
                model_impl=impl,
                enforce_eager=True,
                **quantization_kwargs) as runner:  # type: ignore[arg-type]
            outputs_by_impl[impl] = runner.generate_greedy_logprobs(
                example_prompts,
                max_tokens=max_tokens,
                num_logprobs=num_logprobs)

    check_logprobs_close(
        outputs_0_lst=outputs_by_impl["transformers"],
        outputs_1_lst=outputs_by_impl["auto"],
        name_0="transformers",
        name_1="vllm",
    )