test_models.py 2.79 KB
Newer Older
1
# SPDX-License-Identifier: Apache-2.0
Woosuk Kwon's avatar
Woosuk Kwon committed
2
3
"""Compare the outputs of HF and vLLM when using greedy sampling.

4
Run `pytest tests/models/test_models.py`.
Woosuk Kwon's avatar
Woosuk Kwon committed
5
6
7
"""
import pytest

8
from ...utils import check_logprobs_close
9

Woosuk Kwon's avatar
Woosuk Kwon committed
10

11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
@pytest.mark.parametrize(
    "model",
    [
        pytest.param(
            "bigscience/bloom-560m",  # bloom - testing alibi slopes
            marks=[pytest.mark.core_model, pytest.mark.cpu_model],
        ),
        pytest.param(
            "openai-community/gpt2",  # gpt2
            marks=[pytest.mark.core_model, pytest.mark.cpu_model],
        ),
        pytest.param("Milos/slovak-gpt-j-405M"),  # gptj
        pytest.param("bigcode/tiny_starcoder_py"),  # gpt_bigcode
        pytest.param("EleutherAI/pythia-70m"),  # gpt_neox
        pytest.param(
            "google/gemma-1.1-2b-it",  # gemma
            marks=[pytest.mark.core_model, pytest.mark.cpu_model],
        ),
        pytest.param(
            "meta-llama/Llama-3.2-1B-Instruct",  # llama
            marks=[pytest.mark.core_model, pytest.mark.cpu_model],
        ),
        pytest.param(
            "openbmb/MiniCPM3-4B",
            # fused_moe not supported on CPU
            marks=[pytest.mark.core_model],
        ),
        pytest.param(
            "facebook/opt-125m",  # opt
            marks=[pytest.mark.core_model, pytest.mark.cpu_model],
        ),
        pytest.param(
            "microsoft/phi-2",  # phi
            marks=[pytest.mark.core_model],
        ),
        pytest.param(
            "Qwen/Qwen2.5-0.5B-Instruct",  # qwen2
            marks=[pytest.mark.core_model],
        ),
        pytest.param("stabilityai/stablelm-3b-4e1t"),  # stablelm
        pytest.param("bigcode/starcoder2-3b"),  # starcoder2
52
53
54
55
        pytest.param(
            "ehristoforu/Falcon3-MoE-2x7B-Insruct",  # mixtral
            marks=[pytest.mark.cpu_model],
        )
56
57
    ])
@pytest.mark.parametrize("dtype", ["half"])
58
59
@pytest.mark.parametrize("max_tokens", [32])
@pytest.mark.parametrize("num_logprobs", [5])
Woosuk Kwon's avatar
Woosuk Kwon committed
60
61
62
63
64
65
66
def test_models(
    hf_runner,
    vllm_runner,
    example_prompts,
    model: str,
    dtype: str,
    max_tokens: int,
67
    num_logprobs: int,
Woosuk Kwon's avatar
Woosuk Kwon committed
68
) -> None:
69

70
    with hf_runner(model, dtype=dtype) as hf_model:
71
72
        hf_outputs = hf_model.generate_greedy_logprobs_limit(
            example_prompts, max_tokens, num_logprobs)
Woosuk Kwon's avatar
Woosuk Kwon committed
73

74
    with vllm_runner(model, dtype=dtype) as vllm_model:
75
76
        vllm_outputs = vllm_model.generate_greedy_logprobs(
            example_prompts, max_tokens, num_logprobs)
77

78
79
        # This test is for verifying whether the model's extra_repr
        # can be printed correctly.
80
81
82
83
        def print_model(model):
            print(model)

        vllm_model.apply_model(print_model)
Woosuk Kwon's avatar
Woosuk Kwon committed
84

85
    check_logprobs_close(
86
87
88
89
90
        outputs_0_lst=hf_outputs,
        outputs_1_lst=vllm_outputs,
        name_0="hf",
        name_1="vllm",
    )