# SPDX-License-Identifier: Apache-2.0
"""Test the functionality of the Transformers backend.

Run `pytest tests/models/test_transformers.py`.
"""
import os
import pytest

from ..conftest import HfRunner, VllmRunner
from ..utils import multi_gpu_test, models_path_prefix
from .utils import check_logprobs_close


def check_implementation(
    hf_runner: type[HfRunner],
    vllm_runner: type[VllmRunner],
    example_prompts: list[str],
    model: str,
    **kwargs,
) -> None:
    """Check that vLLM's greedy logprobs for `model` are close to HF's.

    Both runners generate greedily for the same prompts with the same
    token and logprob limits; the two result lists are then handed to
    `check_logprobs_close`, with the HF outputs labelled "hf" and the
    vLLM outputs labelled "vllm".

    Args:
        hf_runner: Runner class used to produce the reference outputs.
        vllm_runner: Runner class under test.
        example_prompts: Prompts passed to both runners.
        model: Model identifier passed to both runners.
        **kwargs: Extra keyword arguments forwarded to `vllm_runner` only
            (the HF runner is constructed from `model` alone).
    """
    max_tokens = 32
    num_logprobs = 5

    # The two runners are used in separate, sequential `with` blocks, so
    # the vLLM context has exited before the HF context is entered.
    with vllm_runner(model, **kwargs) as vllm_model:
        vllm_outputs = vllm_model.generate_greedy_logprobs(
            example_prompts, max_tokens, num_logprobs)

    with hf_runner(model) as hf_model:
        hf_outputs = hf_model.generate_greedy_logprobs_limit(
            example_prompts, max_tokens, num_logprobs)

    check_logprobs_close(
        outputs_0_lst=hf_outputs,
        outputs_1_lst=vllm_outputs,
        name_0="hf",
        name_1="vllm",
    )


@pytest.mark.parametrize(
    "model,model_impl",
    [
        ("meta-llama/Llama-3.2-1B-Instruct", "transformers"),
        ("ArthurZ/Ilama-3.2-1B", "auto"),  # CUSTOM CODE
    ])  # trust_remote_code=True by default
def test_models(
    hf_runner: type[HfRunner],
    vllm_runner: type[VllmRunner],
    example_prompts: list[str],
    model: str,
    model_impl: str,
) -> None:
    """Run the HF-vs-vLLM logprob check for each (model, model_impl) pair.

    `model_impl` is forwarded to the vLLM runner as a keyword argument via
    `check_implementation`; the HF runner only receives `model`.
    """
    check_implementation(
        hf_runner,
        vllm_runner,
        example_prompts,
        model,
        model_impl=model_impl,
    )


@multi_gpu_test(num_gpus=2)
def test_distributed(
    hf_runner: type[HfRunner],
    vllm_runner: type[VllmRunner],
    example_prompts: list[str],
) -> None:
    """Run the HF-vs-vLLM logprob check with tensor parallelism of 2.

    The vLLM runner is given `model_impl="transformers"` and
    `tensor_parallel_size=2`; the model is fixed to
    "meta-llama/Llama-3.2-1B-Instruct".
    """
    kwargs = {"model_impl": "transformers", "tensor_parallel_size": 2}
    check_implementation(
        hf_runner,
        vllm_runner,
        example_prompts,
        "meta-llama/Llama-3.2-1B-Instruct",
        **kwargs,
    )


# NOTE(review): test_quantization is kept below in commented-out form; the
# reason it was disabled is not recorded here -- confirm before re-enabling.
# @pytest.mark.parametrize("model, quantization_kwargs", [
#     (
#         os.path.join(models_path_prefix, "meta-llama/Llama-3.2-1B-Instruct"),
#         {
#             "quantization": "bitsandbytes",
#         },
#     ),
# ])
# @pytest.mark.parametrize("max_tokens", [32])
# @pytest.mark.parametrize("num_logprobs", [5])
# def test_quantization(
#     vllm_runner: type[VllmRunner],
#     example_prompts: list[str],
#     model: str,
#     quantization_kwargs: dict[str, str],
#     max_tokens: int,
#     num_logprobs: int,
# ) -> None:
#     with vllm_runner(
#             model, model_impl="auto", enforce_eager=True,
#             **quantization_kwargs) as vllm_model:  # type: ignore[arg-type]
#         vllm_outputs = vllm_model.generate_greedy_logprobs(
#             example_prompts, max_tokens=max_tokens, num_logprobs=num_logprobs)
#
#     with vllm_runner(
#             model,
#             model_impl="transformers",
#             enforce_eager=True,
#             **quantization_kwargs) as vllm_model:  # type: ignore[arg-type]
#         transformers_outputs = vllm_model.generate_greedy_logprobs(
#             example_prompts, max_tokens=max_tokens, num_logprobs=num_logprobs)
#     check_logprobs_close(
#         outputs_0_lst=transformers_outputs,
#         outputs_1_lst=vllm_outputs,
#         name_0="transformers",
#         name_1="vllm",
#     )