"""
Tests GGUF models' generations against those of the unquantized models.

Note: To pass the test, a quantization higher than Q4 should be used.
"""

import os

import pytest
from huggingface_hub import hf_hub_download
from transformers import AutoTokenizer

from tests.quantization.utils import is_quant_method_supported

from ...utils import check_logprobs_close
from ....utils import models_path_prefix

# Set at import time, so the variable is already in the process environment
# when the tokenizer is created inside the test below.
# NOTE(review): presumably enables parallelism in the Hugging Face
# `tokenizers` library — confirm this is the intended value.
os.environ["TOKENIZERS_PARALLELISM"] = "true"

# Passed as `max_model_len` to both the unquantized and the GGUF runner.
MAX_MODEL_LEN = 1024


@pytest.mark.skipif(not is_quant_method_supported("gguf"),
                    reason="gguf is not supported on this GPU type.")
@pytest.mark.parametrize(("original_model", "gguf_id", "gguf_path"), [
    (os.path.join(models_path_prefix, "meta-llama/Llama-3.2-1B-Instruct"),
     os.path.join(models_path_prefix, "bartowski/Llama-3.2-1B-Instruct-GGUF"),
     os.path.join(models_path_prefix, "Llama-3.2-1B-Instruct-Q4_K_M.gguf")),
    (os.path.join(models_path_prefix, "meta-llama/Llama-3.2-1B-Instruct"),
     os.path.join(models_path_prefix, "bartowski/Llama-3.2-1B-Instruct-GGUF"),
     os.path.join(models_path_prefix, "Llama-3.2-1B-Instruct-IQ4_XS.gguf")),
    (os.path.join(models_path_prefix, "Qwen/Qwen2-1.5B-Instruct"),
     os.path.join(models_path_prefix, "Qwen/Qwen2-1.5B-Instruct-GGUF"),
     os.path.join(models_path_prefix, "qwen2-1_5b-instruct-q4_k_m.gguf")),
    (os.path.join(models_path_prefix, "Qwen/Qwen2-1.5B-Instruct"),
     os.path.join(models_path_prefix,
                  "legraphista/Qwen2-1.5B-Instruct-IMat-GGUF"),
     os.path.join(models_path_prefix, "Qwen2-1.5B-Instruct.IQ4_XS.gguf")),
])
@pytest.mark.parametrize("dtype", ["half"])
@pytest.mark.parametrize("max_tokens", [32])
@pytest.mark.parametrize("num_logprobs", [5])
@pytest.mark.parametrize("tp_size", [1, 2])
def test_models(
    num_gpus_available,
    vllm_runner,
    example_prompts,
    original_model,
    gguf_id,
    gguf_path,
    dtype: str,
    max_tokens: int,
    num_logprobs: int,
    tp_size: int,
) -> None:
    """Compare greedy logprobs of a GGUF model with its unquantized original.

    The example prompts are wrapped as single-turn user messages and rendered
    with the original model's chat template. Both models then generate from
    the same prompts, and the two result lists are handed to
    ``check_logprobs_close``.

    Args:
        num_gpus_available: Number of usable GPUs; the test is skipped when
            it is smaller than ``tp_size``. (Presumably a conftest fixture,
            like ``vllm_runner`` and ``example_prompts`` — not visible here.)
        vllm_runner: Callable returning a context manager that yields an
            object exposing ``generate_greedy_logprobs``.
        example_prompts: Raw prompt strings to wrap in the chat template.
        original_model: Path of the unquantized model; also used to load the
            tokenizer.
        gguf_id: First argument passed to ``hf_hub_download``.
        gguf_path: ``filename`` argument passed to ``hf_hub_download``.
        dtype: dtype string forwarded to both runners.
        max_tokens: Number of tokens to generate per prompt.
        num_logprobs: Number of logprobs requested per generated token.
        tp_size: Tensor-parallel size forwarded to both runners.
    """
    if num_gpus_available < tp_size:
        pytest.skip(f"Not enough GPUs for tensor parallelism {tp_size}")

    # NOTE(review): both `gguf_id` and `gguf_path` are built from
    # `models_path_prefix`, while `hf_hub_download` documents its arguments as
    # a Hub repo id and a filename inside that repo — confirm this resolves
    # as intended in the target environment.
    gguf_file = hf_hub_download(gguf_id, filename=gguf_path)

    # Render every example prompt as a one-message chat conversation.
    tokenizer = AutoTokenizer.from_pretrained(original_model)
    conversations = [[{
        'role': 'user',
        'content': prompt
    }] for prompt in example_prompts]
    chat_prompts = tokenizer.apply_chat_template(conversations,
                                                 tokenize=False,
                                                 add_generation_prompt=True)

    # Both runs use the same prompts; the last example prompt is left out.
    prompts = chat_prompts[:-1]

    # Run unquantized model.
    with vllm_runner(model_name=original_model,
                     dtype=dtype,
                     max_model_len=MAX_MODEL_LEN,
                     tensor_parallel_size=tp_size) as original_runner:
        original_outputs = original_runner.generate_greedy_logprobs(
            prompts, max_tokens, num_logprobs)

    # Run gguf model.
    with vllm_runner(model_name=gguf_file,
                     dtype=dtype,
                     max_model_len=MAX_MODEL_LEN,
                     tensor_parallel_size=tp_size) as gguf_runner:
        gguf_outputs = gguf_runner.generate_greedy_logprobs(
            prompts, max_tokens, num_logprobs)

    check_logprobs_close(
        outputs_0_lst=original_outputs,
        outputs_1_lst=gguf_outputs,
        name_0="original",
        name_1="gguf",
    )