# SPDX-License-Identifier: Apache-2.0
"""Compare the outputs of a GPTQ model to a bitblas model.

Note: GPTQ and bitblas do not have bitwise correctness.
As a result, this test only confirms that the top selected tokens of the
bitblas/GPTQ models are among each other's top `num_logprobs` candidates
(5 logprobs are requested per token below).

Note: bitblas internally uses locks to synchronize threads, which can make its
results very slightly nondeterministic. To tolerate this, the test is run up to
3 times (the first attempt plus `reruns=2`) before it is reported as failing.

Run `pytest tests/models/test_bitblas.py`.
"""
from dataclasses import dataclass

import pytest

from ..utils import check_logprobs_close


@dataclass(frozen=True)
class ModelPair:
    """Names of two checkpoints whose generations are compared by the test.

    The class is frozen because its instances live in a module-level list
    that is shared by every parametrized test invocation; an accidental
    attribute assignment in one test must not leak into another.
    """

    # Model name handed to the runner together with quantization="bitblas".
    model_bitblas: str
    # Model name handed to the runner together with quantization="gptq".
    model_gptq: str


# Checkpoint pairs fed to the `model_pair` parametrization. Each entry names
# a bitblas checkpoint together with the GPTQ checkpoint it is compared to.
model_pairs = [
    ModelPair(
        model_gptq="hxbgsyxh/opt-125m-4bit-128g",
        model_bitblas="hxbgsyxh/opt-125m-4bit-128g-bitblas",
    ),
]


@pytest.mark.flaky(reruns=2)
@pytest.mark.skip(reason="BitBLAS takes too much time for tuning.")
@pytest.mark.parametrize("model_pair", model_pairs)
@pytest.mark.parametrize("dtype", ["half"])
@pytest.mark.parametrize("max_tokens", [32])
@pytest.mark.parametrize("num_logprobs", [5])
def test_models(
    vllm_runner,
    example_prompts,
    model_pair: ModelPair,
    dtype: str,
    max_tokens: int,
    num_logprobs: int,
) -> None:
    """Check that a bitblas checkpoint and its GPTQ counterpart agree.

    Both models greedily generate ``max_tokens`` tokens for every prompt in
    ``example_prompts`` while returning ``num_logprobs`` logprobs per token;
    the two result lists are then handed to ``check_logprobs_close``.

    The test is currently skipped unconditionally (see the ``skip`` marker).

    Args:
        vllm_runner: Fixture used as a context-manager factory for a model.
        example_prompts: Fixture providing the prompts to generate from.
        model_pair: The bitblas/GPTQ checkpoint names to compare.
        dtype: dtype string forwarded to the runner.
        max_tokens: Number of tokens to generate per prompt.
        num_logprobs: Number of logprobs requested per generated token.
    """
    # The bitblas runner's `with` block is exited before the GPTQ runner is
    # created, so the two runners are never open at the same time.
    with vllm_runner(model_pair.model_bitblas,
                     dtype=dtype,
                     quantization="bitblas") as bitblas_model:
        bitblas_outputs = bitblas_model.generate_greedy_logprobs(
            example_prompts, max_tokens, num_logprobs)

    with vllm_runner(model_pair.model_gptq, dtype=dtype,
                     quantization="gptq") as gptq_model:
        gptq_outputs = gptq_model.generate_greedy_logprobs(
            example_prompts, max_tokens, num_logprobs)

    # GPTQ is passed as side 0 and bitblas as side 1; the names label each
    # side for the comparison helper.
    check_logprobs_close(
        outputs_0_lst=gptq_outputs,
        outputs_1_lst=bitblas_outputs,
        name_0="gptq",
        name_1="bitblas",
    )