# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
Tests gguf models against unquantized models generations
Note: To pass the test, quantization higher than Q4 should be used
"""

import os
from collections.abc import Collection
from typing import NamedTuple

import pytest
from huggingface_hub import hf_hub_download
from pytest import MarkDecorator
from transformers import AutoTokenizer

from tests.quantization.utils import is_quant_method_supported

from ...conftest import VllmRunner
from ...utils import models_path_prefix
from ...utils import multi_gpu_test
from ..utils import check_logprobs_close
22
23
24
25
26
27

# Set at import time so the variable is already in place before any tokenizer
# is loaded by the tests below. Presumably read by the Hugging Face
# tokenizers library to allow parallel tokenization -- confirm.
os.environ["TOKENIZERS_PARALLELISM"] = "true"

# Value passed as ``max_model_len`` to every runner created in this file.
MAX_MODEL_LEN = 1024


28
29
30
31
class GGUFTestConfig(NamedTuple):
    """Pair an unquantized model with the GGUF checkpoint compared to it."""

    # Identifier/path of the unquantized reference model. The tests in this
    # file also load the tokenizer from it.
    original_model: str
    # Repository handed to ``hf_hub_download`` to locate the GGUF file.
    gguf_repo: str
    # Name of the GGUF file requested from ``gguf_repo``.
    gguf_filename: str
    # pytest marks forwarded to ``pytest.param`` for this config. The default
    # is an immutable empty tuple so that no mutable object is shared between
    # configs; lists passed explicitly are still accepted.
    marks: Collection[MarkDecorator] = ()

    @property
    def gguf_model(self) -> str:
        """Return the result of ``hf_hub_download`` for the GGUF file.

        The download helper is invoked on every access, so callers that need
        the value more than once should keep it in a local variable.

        NOTE(review): the configs in this file build ``gguf_repo`` and
        ``gguf_filename`` with ``os.path.join(models_path_prefix, ...)``.
        ``hf_hub_download`` expects a Hub repo id and a repo-relative file
        name -- confirm these still resolve when the prefix is non-empty.
        """
        return hf_hub_download(self.gguf_repo, filename=self.gguf_filename)


# Llama-3.2-1B-Instruct reference paired with a Q6_K GGUF file.
LLAMA_CONFIG = GGUFTestConfig(
    original_model=os.path.join(models_path_prefix,
                                "meta-llama/Llama-3.2-1B-Instruct"),
    gguf_repo=os.path.join(models_path_prefix,
                           "bartowski/Llama-3.2-1B-Instruct-GGUF"),
    gguf_filename=os.path.join(models_path_prefix,
                               "Llama-3.2-1B-Instruct-Q6_K.gguf"),
)

# Qwen2.5-1.5B-Instruct reference paired with a q6_k GGUF file.
QWEN2_CONFIG = GGUFTestConfig(
    original_model=os.path.join(models_path_prefix,
                                "Qwen/Qwen2.5-1.5B-Instruct"),
    gguf_repo=os.path.join(models_path_prefix,
                           "Qwen/Qwen2.5-1.5B-Instruct-GGUF"),
    gguf_filename=os.path.join(models_path_prefix,
                               "qwen2.5-1.5b-instruct-q6_k.gguf"),
)

# Phi-3.5-mini-instruct reference paired with an IQ4_XS GGUF file.
PHI3_CONFIG = GGUFTestConfig(
    original_model=os.path.join(models_path_prefix,
                                "microsoft/Phi-3.5-mini-instruct"),
    gguf_repo=os.path.join(models_path_prefix,
                           "bartowski/Phi-3.5-mini-instruct-GGUF"),
    gguf_filename=os.path.join(models_path_prefix,
                               "Phi-3.5-mini-instruct-IQ4_XS.gguf"),
)

# gpt2-large reference paired with a Q4_K_M GGUF file.
GPT2_CONFIG = GGUFTestConfig(
    original_model=os.path.join(models_path_prefix,
                                "openai-community/gpt2-large"),
    gguf_repo=os.path.join(models_path_prefix,
                           "QuantFactory/gpt2-large-GGUF"),
    gguf_filename=os.path.join(models_path_prefix,
                               "gpt2-large.Q4_K_M.gguf"),
)

# stablelm-3b-4e1t reference paired with a q4_k_m GGUF file.
STABLELM_CONFIG = GGUFTestConfig(
    original_model=os.path.join(models_path_prefix,
                                "stabilityai/stablelm-3b-4e1t"),
    gguf_repo=os.path.join(models_path_prefix,
                           "afrideva/stablelm-3b-4e1t-GGUF"),
    gguf_filename=os.path.join(models_path_prefix,
                               "stablelm-3b-4e1t.q4_k_m.gguf"),
)

# starcoder2-3b reference paired with a Q6_K GGUF file.
STARCODER_CONFIG = GGUFTestConfig(
    original_model=os.path.join(models_path_prefix,
                                "bigcode/starcoder2-3b"),
    gguf_repo=os.path.join(models_path_prefix,
                           "QuantFactory/starcoder2-3b-GGUF"),
    gguf_filename=os.path.join(models_path_prefix,
                               "starcoder2-3b.Q6_K.gguf"),
)

75
76
# TinyDolphin-2.8-1.1b reference paired with a Q6_K GGUF file.
DOLPHIN_CONFIG = GGUFTestConfig(
    # Test VocabParallelEmbedding sharding issue.
    original_model=os.path.join(
        models_path_prefix, "cognitivecomputations/TinyDolphin-2.8-1.1b"),
    gguf_repo=os.path.join(models_path_prefix,
                           "tsunemoto/TinyDolphin-2.8-1.1b-GGUF"),
    gguf_filename=os.path.join(models_path_prefix,
                               "tinydolphin-2.8-1.1b.Q6_K.gguf"),
)

82
# Configs used to parametrize ``test_models``. Commented-out entries are
# disabled for the reason noted next to each of them.
MODELS = [
    LLAMA_CONFIG,
    QWEN2_CONFIG,
    PHI3_CONFIG,
    GPT2_CONFIG,
    # STABLELM_CONFIG,  # enable this when v1 supports head_size=80
    DOLPHIN_CONFIG,
    # STARCODER_CONFIG,  # broken
]


93
def check_model_outputs(
    vllm_runner: type[VllmRunner],
    prompts: list[str],
    model: GGUFTestConfig,
    dtype: str,
    max_tokens: int,
    num_logprobs: int,
    tp_size: int,
) -> None:
    """Check that a GGUF model's greedy logprobs track the original model's.

    Args:
        vllm_runner: Runner factory used as a context manager for each model.
        prompts: Raw prompts. When the original model's tokenizer defines a
            chat template, each prompt is wrapped as a single user message
            and rendered through that template first.
        model: Config naming the original model and its GGUF counterpart.
        dtype: dtype forwarded to both runners.
        max_tokens: Number of tokens requested per prompt.
        num_logprobs: Number of logprobs requested per generated token.
        tp_size: Tensor-parallel size for the GGUF run only.

    The comparison itself is delegated to ``check_logprobs_close``.
    """
    tokenizer = AutoTokenizer.from_pretrained(model.original_model)
    if tokenizer.chat_template is not None:
        messages = [[{
            'role': 'user',
            'content': prompt
        }] for prompt in prompts]
        prompts = tokenizer.apply_chat_template(messages,
                                                tokenize=False,
                                                add_generation_prompt=True)

    # The last prompt is left out of the comparison. Slice once so both runs
    # are guaranteed to receive exactly the same inputs.
    eval_prompts = prompts[:-1]

    # Run gguf model.
    with vllm_runner(model_name=model.gguf_model,
                     enforce_eager=True,
                     tokenizer_name=model.original_model,
                     dtype=dtype,
                     max_model_len=MAX_MODEL_LEN,
                     tensor_parallel_size=tp_size) as gguf_model:
        gguf_outputs = gguf_model.generate_greedy_logprobs(
            eval_prompts, max_tokens, num_logprobs)

    # Run unquantized model.
    # Should run with tp=1, otherwise the test will get stuck at
    # nccl initialization.
    with vllm_runner(
            model_name=model.original_model,
            enforce_eager=True,  # faster tests
            dtype=dtype,
            max_model_len=MAX_MODEL_LEN,
            tensor_parallel_size=1) as original_model:
        original_outputs = original_model.generate_greedy_logprobs(
            eval_prompts, max_tokens, num_logprobs)

    check_logprobs_close(
        outputs_0_lst=original_outputs,
        outputs_1_lst=gguf_outputs,
        name_0="original",
        name_1="gguf",
    )
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183


@pytest.mark.skipif(not is_quant_method_supported("gguf"),
                    reason="gguf is not supported on this GPU type.")
@pytest.mark.parametrize(
    "model",
    [pytest.param(cfg, marks=cfg.marks) for cfg in MODELS],
)
@pytest.mark.parametrize("dtype", ["half"])
@pytest.mark.parametrize("max_tokens", [32])
@pytest.mark.parametrize("num_logprobs", [5])
@pytest.mark.parametrize("tp_size", [1])
def test_models(
    vllm_runner: type[VllmRunner],
    example_prompts: list[str],
    model: GGUFTestConfig,
    dtype: str,
    max_tokens: int,
    num_logprobs: int,
    tp_size: int,
) -> None:
    """Compare every config in ``MODELS`` with its original model at tp=1."""
    check_model_outputs(
        vllm_runner=vllm_runner,
        prompts=example_prompts,
        model=model,
        dtype=dtype,
        max_tokens=max_tokens,
        num_logprobs=num_logprobs,
        tp_size=tp_size,
    )


@pytest.mark.skipif(not is_quant_method_supported("gguf"),
                    reason="gguf is not supported on this GPU type.")
@pytest.mark.parametrize("model", [LLAMA_CONFIG])
@pytest.mark.parametrize("dtype", ["half"])
@pytest.mark.parametrize("max_tokens", [8])
@pytest.mark.parametrize("num_logprobs", [5])
@pytest.mark.parametrize("tp_size", [2])
@multi_gpu_test(num_gpus=2)
def test_distributed(
    vllm_runner: type[VllmRunner],
    example_prompts: list[str],
    model: GGUFTestConfig,
    dtype: str,
    max_tokens: int,
    num_logprobs: int,
    tp_size: int,
) -> None:
    """Run the Llama GGUF comparison with the GGUF model sharded at tp=2."""
    check_model_outputs(
        vllm_runner=vllm_runner,
        prompts=example_prompts,
        model=model,
        dtype=dtype,
        max_tokens=max_tokens,
        num_logprobs=num_logprobs,
        tp_size=tp_size,
    )