test_gguf.py 6.16 KB
Newer Older
1
# SPDX-License-Identifier: Apache-2.0
2
3
4
5
6
7
"""
Tests GGUF models against the generations of their unquantized counterparts.
Note: To pass the test, a quantization level higher than Q4 should be used.
"""

import os
8
from typing import NamedTuple
9
10
11

import pytest
from huggingface_hub import hf_hub_download
12
from pytest import MarkDecorator
13
from transformers import AutoTokenizer
14
15
16

from tests.quantization.utils import is_quant_method_supported

17
from ....conftest import VllmRunner
18
from ....utils import multi_gpu_test
19
from ...utils import check_logprobs_close
20
from ....utils import models_path_prefix
21
22
23
24
25
26

# Set at import time, before any tokenizer is constructed by the tests below.
# NOTE(review): presumably read by the Hugging Face tokenizers library to
# control its internal parallelism — confirm.
os.environ["TOKENIZERS_PARALLELISM"] = "true"

# Passed as max_model_len to the vllm_runner calls in check_model_outputs.
MAX_MODEL_LEN = 1024


27
28
29
30
class GGUFTestConfig(NamedTuple):
    """Describe one GGUF checkpoint and the unquantized model it is compared to.

    The GGUF file itself is resolved lazily through the ``gguf_model``
    property, so building a config never triggers a download.
    """

    # Unquantized reference model; also used as the tokenizer source for the
    # GGUF run in check_model_outputs.
    original_model: str
    # Repository argument handed to hf_hub_download.
    gguf_repo: str
    # File name (within gguf_repo) handed to hf_hub_download.
    gguf_filename: str
    # Pytest marks attached to the parametrized case for this config.
    # NOTE: this default list is a single object shared by every instance that
    # does not pass ``marks``; treat it as read-only and never mutate it.
    marks: list[MarkDecorator] = []

    @property
    def gguf_model(self) -> str:
        """Return the result of ``hf_hub_download`` for this config.

        NOTE(review): presumably a local file path to the GGUF checkpoint;
        confirm against the huggingface_hub documentation.
        """
        return hf_hub_download(self.gguf_repo, filename=self.gguf_filename)


# Llama-3.2-1B-Instruct compared with its IQ4_XS GGUF file. All three fields
# are joined onto models_path_prefix. The quant_model mark is applied to the
# parametrized case built from this config in test_models.
LLAMA_CONFIG = GGUFTestConfig(
    original_model=os.path.join(
        models_path_prefix, "meta-llama/Llama-3.2-1B-Instruct"),
    gguf_repo=os.path.join(
        models_path_prefix, "bartowski/Llama-3.2-1B-Instruct-GGUF"),
    gguf_filename=os.path.join(
        models_path_prefix, "Llama-3.2-1B-Instruct-IQ4_XS.gguf"),
    marks=[pytest.mark.quant_model],
)

# Qwen2.5-1.5B-Instruct compared with its q6_k GGUF file. All three fields
# are joined onto models_path_prefix; no extra pytest marks.
QWEN2_CONFIG = GGUFTestConfig(
    original_model=os.path.join(
        models_path_prefix, "Qwen/Qwen2.5-1.5B-Instruct"),
    gguf_repo=os.path.join(
        models_path_prefix, "Qwen/Qwen2.5-1.5B-Instruct-GGUF"),
    gguf_filename=os.path.join(
        models_path_prefix, "qwen2.5-1.5b-instruct-q6_k.gguf"),
)

# Phi-3.5-mini-instruct compared with its IQ4_XS GGUF file. All three fields
# are joined onto models_path_prefix; no extra pytest marks.
PHI3_CONFIG = GGUFTestConfig(
    original_model=os.path.join(
        models_path_prefix, "microsoft/Phi-3.5-mini-instruct"),
    gguf_repo=os.path.join(
        models_path_prefix, "bartowski/Phi-3.5-mini-instruct-GGUF"),
    gguf_filename=os.path.join(
        models_path_prefix, "Phi-3.5-mini-instruct-IQ4_XS.gguf"),
)

# gpt2-large compared with its Q4_K_M GGUF file. All three fields are joined
# onto models_path_prefix; no extra pytest marks.
GPT2_CONFIG = GGUFTestConfig(
    original_model=os.path.join(
        models_path_prefix, "openai-community/gpt2-large"),
    gguf_repo=os.path.join(
        models_path_prefix, "QuantFactory/gpt2-large-GGUF"),
    gguf_filename=os.path.join(
        models_path_prefix, "gpt2-large.Q4_K_M.gguf"),
)

# stablelm-3b-4e1t compared with its q4_k_m GGUF file. All three fields are
# joined onto models_path_prefix; no extra pytest marks.
STABLELM_CONFIG = GGUFTestConfig(
    original_model=os.path.join(
        models_path_prefix, "stabilityai/stablelm-3b-4e1t"),
    gguf_repo=os.path.join(
        models_path_prefix, "afrideva/stablelm-3b-4e1t-GGUF"),
    gguf_filename=os.path.join(
        models_path_prefix, "stablelm-3b-4e1t.q4_k_m.gguf"),
)

# starcoder2-3b compared with its Q6_K GGUF file. All three fields are joined
# onto models_path_prefix. This config is defined but currently left out of
# MODELS, where it is commented out as broken.
STARCODER_CONFIG = GGUFTestConfig(
    original_model=os.path.join(
        models_path_prefix, "bigcode/starcoder2-3b"),
    gguf_repo=os.path.join(
        models_path_prefix, "QuantFactory/starcoder2-3b-GGUF"),
    gguf_filename=os.path.join(
        models_path_prefix, "starcoder2-3b.Q6_K.gguf"),
)

75
76
# TinyDolphin-2.8-1.1b compared with its Q6_K GGUF file. All three fields are
# joined onto models_path_prefix; no extra pytest marks.
DOLPHIN_CONFIG = GGUFTestConfig(
    # Test VocabParallelEmbedding sharding issue.
    original_model=os.path.join(
        models_path_prefix, "cognitivecomputations/TinyDolphin-2.8-1.1b"),
    gguf_repo=os.path.join(
        models_path_prefix, "tsunemoto/TinyDolphin-2.8-1.1b-GGUF"),
    gguf_filename=os.path.join(
        models_path_prefix, "tinydolphin-2.8-1.1b.Q6_K.gguf"),
)

82
# Configs exercised by test_models, in parametrization order.
MODELS = [
    LLAMA_CONFIG,
    QWEN2_CONFIG,
    PHI3_CONFIG,
    GPT2_CONFIG,
    STABLELM_CONFIG,
    DOLPHIN_CONFIG,
    # STARCODER_CONFIG, # broken
]


89
def check_model_outputs(
    vllm_runner: type[VllmRunner],
    prompts: list[str],
    model: GGUFTestConfig,
    dtype: str,
    max_tokens: int,
    num_logprobs: int,
    tp_size: int,
) -> None:
    """Check that a GGUF model's greedy logprobs stay close to the original's.

    Both models generate greedily for the same prompts, and the two result
    lists are handed to ``check_logprobs_close``.

    Args:
        vllm_runner: Runner factory; used as a context manager for each model.
        prompts: Raw prompts. If the original model's tokenizer has a chat
            template, each prompt is wrapped as a single user message and
            rendered through that template first.
        model: Config naming the original model and the GGUF checkpoint.
        dtype: dtype forwarded to both runners.
        max_tokens: Maximum number of tokens to generate per prompt.
        num_logprobs: Number of logprobs requested per generated token.
        tp_size: Tensor-parallel size for the GGUF run only; the original
            model always runs with a tensor-parallel size of 1.
    """
    tokenizer = AutoTokenizer.from_pretrained(model.original_model)
    if tokenizer.chat_template is not None:
        # One single-turn conversation per prompt.
        messages = [[{
            'role': 'user',
            'content': prompt
        }] for prompt in prompts]
        prompts = tokenizer.apply_chat_template(messages,
                                                tokenize=False,
                                                add_generation_prompt=True)

    # Both runs use every prompt except the last one.
    # NOTE(review): the reason the last prompt is dropped is not visible
    # here — confirm before changing.
    eval_prompts = prompts[:-1]

    # Run gguf model.
    with vllm_runner(model_name=model.gguf_model,
                     enforce_eager=True,
                     tokenizer_name=model.original_model,
                     dtype=dtype,
                     max_model_len=MAX_MODEL_LEN,
                     tensor_parallel_size=tp_size) as gguf_model:
        gguf_outputs = gguf_model.generate_greedy_logprobs(
            eval_prompts, max_tokens, num_logprobs)

    # Run unquantized model.
    # Should run with tp=1, otherwise the test will get stuck at
    # nccl initialization.
    with vllm_runner(
            model_name=model.original_model,
            enforce_eager=True,  # faster tests
            dtype=dtype,
            max_model_len=MAX_MODEL_LEN,
            tensor_parallel_size=1) as original_model:
        original_outputs = original_model.generate_greedy_logprobs(
            eval_prompts, max_tokens, num_logprobs)

    check_logprobs_close(
        outputs_0_lst=original_outputs,
        outputs_1_lst=gguf_outputs,
        name_0="original",
        name_1="gguf",
    )
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179


@pytest.mark.skipif(not is_quant_method_supported("gguf"),
                    reason="gguf is not supported on this GPU type.")
@pytest.mark.parametrize(
    "model",
    [pytest.param(cfg, marks=cfg.marks) for cfg in MODELS],
)
@pytest.mark.parametrize("dtype", ["half"])
@pytest.mark.parametrize("max_tokens", [32])
@pytest.mark.parametrize("num_logprobs", [5])
@pytest.mark.parametrize("tp_size", [1])
def test_models(
    vllm_runner: type[VllmRunner],
    example_prompts: list[str],
    model: GGUFTestConfig,
    dtype: str,
    max_tokens: int,
    num_logprobs: int,
    tp_size: int,
) -> None:
    """Compare each config in MODELS against its original model with tp_size 1.

    Every case carries the pytest marks declared on its config. The whole
    test is skipped when is_quant_method_supported("gguf") is false.
    """
    check_model_outputs(
        vllm_runner=vllm_runner,
        prompts=example_prompts,
        model=model,
        dtype=dtype,
        max_tokens=max_tokens,
        num_logprobs=num_logprobs,
        tp_size=tp_size,
    )


@pytest.mark.skipif(not is_quant_method_supported("gguf"),
                    reason="gguf is not supported on this GPU type.")
@pytest.mark.parametrize("model", [LLAMA_CONFIG])
@pytest.mark.parametrize("dtype", ["half"])
@pytest.mark.parametrize("max_tokens", [8])
@pytest.mark.parametrize("num_logprobs", [5])
@pytest.mark.parametrize("tp_size", [2])
@multi_gpu_test(num_gpus=2)
def test_distributed(
    vllm_runner: type[VllmRunner],
    example_prompts: list[str],
    model: GGUFTestConfig,
    dtype: str,
    max_tokens: int,
    num_logprobs: int,
    tp_size: int,
) -> None:
    """Compare LLAMA_CONFIG's GGUF model, run with tp_size 2, to its original.

    Decorated with multi_gpu_test(num_gpus=2) and skipped when
    is_quant_method_supported("gguf") is false.
    """
    check_model_outputs(
        vllm_runner=vllm_runner,
        prompts=example_prompts,
        model=model,
        dtype=dtype,
        max_tokens=max_tokens,
        num_logprobs=num_logprobs,
        tp_size=tp_size,
    )